diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13768,19 +13768,29 @@ static void createTblShuffleForZExt(ZExtInst *ZExt, bool IsLittleEndian) { Value *Op = ZExt->getOperand(0); - auto *SrcTy = dyn_cast(Op->getType()); - auto *DstTy = dyn_cast(ZExt->getType()); + auto *SrcTy = cast(Op->getType()); + auto *DstTy = cast(ZExt->getType()); + unsigned ZExtFactor = + (cast(DstTy->getElementType())->getBitWidth()) / + (cast(SrcTy->getElementType())->getBitWidth()); unsigned NumElts = SrcTy->getNumElements(); IRBuilder<> Builder(ZExt); - SmallVector Mask(4 * NumElts, NumElts); - // Create a mask that selects <0,0,0,Op[i]> for each lane of vector of i32 to - // replace the original ZExt. This can later be lowered to a set of tbl - // instructions. - for (unsigned i = 0; i < NumElts; i++) { - if (IsLittleEndian) - Mask[i * 4] = i; - else - Mask[i * 4 + 3] = i; + SmallVector Mask; + // Create a mask that selects <0,...,Op[i]> for each lane of the destination + // vector to replace the original ZExt. This can later be lowered to a set of + // tbl instructions. + for (unsigned i = 0; i < NumElts * ZExtFactor; i++) { + if (IsLittleEndian) { + if (i % ZExtFactor == 0) + Mask.push_back(i / ZExtFactor); + else + Mask.push_back(NumElts); + } else { + if ((i + 1) % ZExtFactor == 0) + Mask.push_back((i - ZExtFactor + 1) / ZExtFactor); + else + Mask.push_back(NumElts); + } } auto *FirstEltZero = Builder.CreateInsertElement( @@ -13845,21 +13855,20 @@ if (!SrcTy || !DstTy) return false; - // Convert 'zext <(8|16) x i8> %x to <(8|16) x i32>' to a shuffle that can be - // lowered to either 2 or 4 tbl instructions to insert the original i8 - // elements into i32 lanes. + // Convert 'zext <N x i8> %x to <N x i8x>' to a shuffle that can be + // lowered to tbl instructions to insert the original i8 elements + // into i8x lanes. 
Conversion to <N x i16> is ignored as using tbl is not + // optimal for this case. auto *ZExt = dyn_cast(I); - if (ZExt && (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) && - SrcTy->getElementType()->isIntegerTy(8) && - DstTy->getElementType()->isIntegerTy(32)) { + if (ZExt && SrcTy->getElementType()->isIntegerTy(8) && + (cast(DstTy->getElementType())->getBitWidth() % 8 == 0) && + !DstTy->getElementType()->isIntegerTy(16)) { createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian()); return true; } auto *UIToFP = dyn_cast(I); - if (UIToFP && - (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) && - SrcTy->getElementType()->isIntegerTy(8) && + if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) && DstTy->getElementType()->isFloatTy()) { IRBuilder<> Builder(I); auto *ZExt = cast( diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -503,42 +503,42 @@ ret void } - - define void @zext_v16i8_to_v16i16_in_loop(i8* %src, i16* %dst) { ; CHECK-LABEL: zext_v16i8_to_v16i16_in_loop: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: LBB5_1: ; %loop -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q0, [x0, x8] -; CHECK-NEXT: add x8, x8, #16 -; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: ushll2.8h v1, v0, #0 -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: stp q0, q1, [x1], #32 -; CHECK-NEXT: b.ne LBB5_1 -; CHECK-NEXT: ; %bb.2: ; %exit -; CHECK-NEXT: ret +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: LBB5_1: ; %loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr q0, [x0, x8] +; CHECK-NEXT: add x8, x8, #16 +; CHECK-NEXT: cmp x8, #128 +; CHECK-NEXT: ushll2.8h v1, v0, #0 +; CHECK-NEXT: ushll.8h v0, v0, #0 +; CHECK-NEXT: stp q0, q1, [x1], #32 +; CHECK-NEXT: b.ne LBB5_1 +; CHECK-NEXT: ; %bb.2: ; %exit +; CHECK-NEXT: ret + ; ; CHECK-BE-LABEL: zext_v16i8_to_v16i16_in_loop: 
; CHECK-BE: // %bb.0: // %entry -; CHECK-BE-NEXT: mov x8, xzr -; CHECK-BE-NEXT: .LBB5_1: // %loop -; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: add x9, x0, x8 -; CHECK-BE-NEXT: add x8, x8, #16 -; CHECK-BE-NEXT: cmp x8, #128 -; CHECK-BE-NEXT: ld1 { v0.16b }, [x9] -; CHECK-BE-NEXT: add x9, x1, #16 +; CHECK-BE-NEXT: mov x8, xzr +; CHECK-BE-NEXT: .LBB5_1: // %loop +; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-BE-NEXT: add x9, x0, x8 +; CHECK-BE-NEXT: add x8, x8, #16 +; CHECK-BE-NEXT: cmp x8, #128 +; CHECK-BE-NEXT: ld1 { v0.16b }, [x9] +; CHECK-BE-NEXT: add x9, x1, #16 ; CHECK-BE-NEXT: ushll v1.8h, v0.8b, #0 ; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-BE-NEXT: st1 { v1.8h }, [x1] -; CHECK-BE-NEXT: add x1, x1, #32 -; CHECK-BE-NEXT: st1 { v0.8h }, [x9] -; CHECK-BE-NEXT: b.ne .LBB5_1 -; CHECK-BE-NEXT: // %bb.2: // %exit +; CHECK-BE-NEXT: st1 { v1.8h }, [x1] +; CHECK-BE-NEXT: add x1, x1, #32 +; CHECK-BE-NEXT: st1 { v0.8h }, [x9] +; CHECK-BE-NEXT: b.ne .LBB5_1 +; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: ret + entry: br label %loop @@ -699,79 +699,412 @@ ret void } +; CHECK-LABEL: lCPI7_0: +; CHECK-NEXT: .byte 0 ; 0x0 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 1 ; 0x1 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI7_1: +; CHECK-NEXT: .byte 2 ; 0x2 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 3 ; 0x3 +; CHECK-NEXT: .byte 255 ; 
0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI7_2: +; CHECK-NEXT: .byte 4 ; 0x4 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 5 ; 0x5 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI7_3: +; CHECK-NEXT: .byte 6 ; 0x6 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 7 ; 0x7 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI7_4: +; CHECK-NEXT: .byte 8 ; 0x8 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 9 ; 0x9 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI7_5: +; CHECK-NEXT: .byte 10 ; 0xa +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 
0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 11 ; 0xb +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI7_6: +; CHECK-NEXT: .byte 12 ; 0xc +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 13 ; 0xd +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI7_7: +; CHECK-NEXT: .byte 14 ; 0xe +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 15 ; 0xf +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff + +; CHECK-BE-LABEL: .LCPI7_0: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 0 // 0x0 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 1 // 0x1 +; 
CHECK-BE-NEXT: .LCPI7_1: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 2 // 0x2 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 3 // 0x3 +; CHECK-BE-NEXT: .LCPI7_2: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 4 // 0x4 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 5 // 0x5 +; CHECK-BE-NEXT: .LCPI7_3: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 6 // 0x6 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 7 // 0x7 +; CHECK-BE-NEXT: .LCPI7_4: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff 
+; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 8 // 0x8 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 9 // 0x9 +; CHECK-BE-NEXT: .LCPI7_5: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 10 // 0xa +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 11 // 0xb +; CHECK-BE-NEXT: .LCPI7_6: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 12 // 0xc +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 13 // 0xd +; CHECK-BE-NEXT: .LCPI7_7: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 14 // 0xe +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 
255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 15 // 0xf + define void @zext_v16i8_to_v16i64_in_loop(i8* %src, i64* %dst) { ; CHECK-LABEL: zext_v16i8_to_v16i64_in_loop: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: LBB7_1: ; %loop -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q0, [x0, x8] -; CHECK-NEXT: add x8, x8, #16 -; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: ushll.8h v1, v0, #0 -; CHECK-NEXT: ushll2.8h v0, v0, #0 -; CHECK-NEXT: ushll2.4s v2, v1, #0 -; CHECK-NEXT: ushll2.4s v3, v0, #0 -; CHECK-NEXT: ushll.4s v0, v0, #0 -; CHECK-NEXT: ushll2.2d v4, v3, #0 -; CHECK-NEXT: ushll2.2d v5, v0, #0 -; CHECK-NEXT: ushll.2d v0, v0, #0 -; CHECK-NEXT: ushll.2d v3, v3, #0 -; CHECK-NEXT: stp q0, q5, [x1, #64] -; CHECK-NEXT: ushll.4s v0, v1, #0 -; CHECK-NEXT: stp q3, q4, [x1, #96] -; CHECK-NEXT: ushll2.2d v3, v2, #0 -; CHECK-NEXT: ushll.2d v2, v2, #0 -; CHECK-NEXT: ushll2.2d v1, v0, #0 -; CHECK-NEXT: ushll.2d v0, v0, #0 -; CHECK-NEXT: stp q2, q3, [x1, #32] -; CHECK-NEXT: stp q0, q1, [x1], #128 -; CHECK-NEXT: b.ne LBB7_1 -; CHECK-NEXT: ; %bb.2: ; %exit -; CHECK-NEXT: ret -; +; CHECK-NEXT: Lloh12: +; CHECK-NEXT: adrp x9, lCPI7_0@PAGE +; CHECK-NEXT: Lloh13: +; CHECK-NEXT: adrp x10, lCPI7_1@PAGE +; CHECK-NEXT: Lloh14: +; CHECK-NEXT: adrp x11, lCPI7_2@PAGE +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: Lloh15: +; CHECK-NEXT: ldr q0, [x9, lCPI7_0@PAGEOFF] +; CHECK-NEXT: Lloh16: +; CHECK-NEXT: adrp x9, lCPI7_3@PAGE +; CHECK-NEXT: Lloh17: +; CHECK-NEXT: ldr q1, [x10, lCPI7_1@PAGEOFF] +; CHECK-NEXT: Lloh18: +; CHECK-NEXT: adrp x10, lCPI7_4@PAGE +; CHECK-NEXT: Lloh19: +; CHECK-NEXT: ldr q2, [x11, lCPI7_2@PAGEOFF] +; CHECK-NEXT: Lloh20: +; CHECK-NEXT: adrp x11, lCPI7_5@PAGE +; CHECK-NEXT: Lloh21: +; CHECK-NEXT: ldr q3, [x9, lCPI7_3@PAGEOFF] +; CHECK-NEXT: Lloh22: +; CHECK-NEXT: adrp x9, lCPI7_6@PAGE +; CHECK-NEXT: 
Lloh23: +; CHECK-NEXT: ldr q4, [x10, lCPI7_4@PAGEOFF] +; CHECK-NEXT: Lloh24: +; CHECK-NEXT: adrp x10, lCPI7_7@PAGE +; CHECK-NEXT: Lloh25: +; CHECK-NEXT: ldr q5, [x11, lCPI7_5@PAGEOFF] +; CHECK-NEXT: Lloh26: +; CHECK-NEXT: ldr q6, [x9, lCPI7_6@PAGEOFF] +; CHECK-NEXT: Lloh27: +; CHECK-NEXT: ldr q7, [x10, lCPI7_7@PAGEOFF] +; CHECK-NEXT: LBB7_1: ; %loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr q16, [x0, x8] +; CHECK-NEXT: add x8, x8, #16 +; CHECK-NEXT: cmp x8, #128 +; CHECK-NEXT: tbl.16b v17, { v16 }, v7 +; CHECK-NEXT: tbl.16b v18, { v16 }, v6 +; CHECK-NEXT: tbl.16b v19, { v16 }, v5 +; CHECK-NEXT: tbl.16b v20, { v16 }, v4 +; CHECK-NEXT: tbl.16b v21, { v16 }, v3 +; CHECK-NEXT: stp q18, q17, [x1, #96] +; CHECK-NEXT: tbl.16b v17, { v16 }, v2 +; CHECK-NEXT: tbl.16b v18, { v16 }, v1 +; CHECK-NEXT: stp q20, q19, [x1, #64] +; CHECK-NEXT: tbl.16b v16, { v16 }, v0 +; CHECK-NEXT: stp q17, q21, [x1, #32] +; CHECK-NEXT: stp q16, q18, [x1], #128 +; CHECK-NEXT: b.ne LBB7_1 +; CHECK-NEXT: ; %bb.2: ; %exit +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh24, Lloh27 +; CHECK-NEXT: .loh AdrpLdr Lloh22, Lloh26 +; CHECK-NEXT: .loh AdrpLdr Lloh20, Lloh25 +; CHECK-NEXT: .loh AdrpAdrp Lloh18, Lloh24 +; CHECK-NEXT: .loh AdrpLdr Lloh18, Lloh23 +; CHECK-NEXT: .loh AdrpAdrp Lloh16, Lloh22 +; CHECK-NEXT: .loh AdrpLdr Lloh16, Lloh21 +; CHECK-NEXT: .loh AdrpAdrp Lloh14, Lloh20 +; CHECK-NEXT: .loh AdrpLdr Lloh14, Lloh19 +; CHECK-NEXT: .loh AdrpAdrp Lloh13, Lloh18 +; CHECK-NEXT: .loh AdrpLdr Lloh13, Lloh17 +; CHECK-NEXT: .loh AdrpAdrp Lloh12, Lloh16 +; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh15 + + ; CHECK-BE-LABEL: zext_v16i8_to_v16i64_in_loop: ; CHECK-BE: // %bb.0: // %entry -; CHECK-BE-NEXT: mov x8, xzr -; CHECK-BE-NEXT: .LBB7_1: // %loop -; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: add x9, x0, x8 -; CHECK-BE-NEXT: add x10, x1, #96 -; CHECK-BE-NEXT: add x8, x8, #16 -; CHECK-BE-NEXT: cmp x8, #128 -; CHECK-BE-NEXT: ld1 { v0.16b }, [x9] -; 
CHECK-BE-NEXT: add x9, x1, #112 -; CHECK-BE-NEXT: ushll2 v1.8h, v0.16b, #0 -; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-BE-NEXT: ushll2 v2.4s, v1.8h, #0 -; CHECK-BE-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-BE-NEXT: ushll2 v3.2d, v2.4s, #0 -; CHECK-BE-NEXT: ushll v2.2d, v2.2s, #0 -; CHECK-BE-NEXT: st1 { v3.2d }, [x9] -; CHECK-BE-NEXT: add x9, x1, #80 -; CHECK-BE-NEXT: ushll2 v3.2d, v1.4s, #0 -; CHECK-BE-NEXT: st1 { v2.2d }, [x10] -; CHECK-BE-NEXT: ushll2 v2.4s, v0.8h, #0 -; CHECK-BE-NEXT: add x10, x1, #48 -; CHECK-BE-NEXT: st1 { v3.2d }, [x9] -; CHECK-BE-NEXT: add x9, x1, #64 -; CHECK-BE-NEXT: ushll v1.2d, v1.2s, #0 -; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-BE-NEXT: ushll2 v4.2d, v2.4s, #0 -; CHECK-BE-NEXT: st1 { v1.2d }, [x9] -; CHECK-BE-NEXT: ushll v1.2d, v0.2s, #0 -; CHECK-BE-NEXT: add x9, x1, #16 -; CHECK-BE-NEXT: st1 { v4.2d }, [x10] -; CHECK-BE-NEXT: add x10, x1, #32 -; CHECK-BE-NEXT: st1 { v1.2d }, [x1] -; CHECK-BE-NEXT: add x1, x1, #128 -; CHECK-BE-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-BE-NEXT: ushll v2.2d, v2.2s, #0 -; CHECK-BE-NEXT: st1 { v0.2d }, [x9] -; CHECK-BE-NEXT: st1 { v2.2d }, [x10] -; CHECK-BE-NEXT: b.ne .LBB7_1 -; CHECK-BE-NEXT: // %bb.2: // %exit -; CHECK-BE-NEXT: ret +; CHECK-BE-NEXT: adrp x8, .LCPI7_0 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI7_0 +; CHECK-BE-NEXT: ld1 { v0.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI7_1 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI7_1 +; CHECK-BE-NEXT: ld1 { v1.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI7_2 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI7_2 +; CHECK-BE-NEXT: ld1 { v2.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI7_3 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI7_3 +; CHECK-BE-NEXT: ld1 { v3.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI7_4 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI7_4 +; CHECK-BE-NEXT: ld1 { v4.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI7_5 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI7_5 +; CHECK-BE-NEXT: ld1 { v5.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI7_6 +; CHECK-BE-NEXT: add x8, 
x8, :lo12:.LCPI7_6 +; CHECK-BE-NEXT: ld1 { v6.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI7_7 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI7_7 +; CHECK-BE-NEXT: ld1 { v7.16b }, [x8] +; CHECK-BE-NEXT: mov x8, xzr +; CHECK-BE-NEXT: .LBB7_1: // %loop +; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-BE-NEXT: add x9, x0, x8 +; CHECK-BE-NEXT: add x10, x1, #96 +; CHECK-BE-NEXT: add x8, x8, #16 +; CHECK-BE-NEXT: cmp x8, #128 +; CHECK-BE-NEXT: ld1 { v16.16b }, [x9] +; CHECK-BE-NEXT: add x9, x1, #112 +; CHECK-BE-NEXT: tbl v17.16b, { v16.16b }, v7.16b +; CHECK-BE-NEXT: tbl v18.16b, { v16.16b }, v6.16b +; CHECK-BE-NEXT: tbl v19.16b, { v16.16b }, v5.16b +; CHECK-BE-NEXT: st1 { v17.16b }, [x9] +; CHECK-BE-NEXT: add x9, x1, #80 +; CHECK-BE-NEXT: tbl v17.16b, { v16.16b }, v4.16b +; CHECK-BE-NEXT: st1 { v18.16b }, [x10] +; CHECK-BE-NEXT: add x10, x1, #64 +; CHECK-BE-NEXT: st1 { v19.16b }, [x9] +; CHECK-BE-NEXT: add x9, x1, #48 +; CHECK-BE-NEXT: tbl v18.16b, { v16.16b }, v3.16b +; CHECK-BE-NEXT: tbl v19.16b, { v16.16b }, v0.16b +; CHECK-BE-NEXT: st1 { v17.16b }, [x10] +; CHECK-BE-NEXT: tbl v17.16b, { v16.16b }, v2.16b +; CHECK-BE-NEXT: add x10, x1, #32 +; CHECK-BE-NEXT: tbl v16.16b, { v16.16b }, v1.16b +; CHECK-BE-NEXT: st1 { v18.16b }, [x9] +; CHECK-BE-NEXT: add x9, x1, #16 +; CHECK-BE-NEXT: st1 { v19.16b }, [x1] +; CHECK-BE-NEXT: add x1, x1, #128 +; CHECK-BE-NEXT: st1 { v17.16b }, [x10] +; CHECK-BE-NEXT: st1 { v16.16b }, [x9] +; CHECK-BE-NEXT: b.ne .LBB7_1 +; CHECK-BE-NEXT: // %bb.2: // %exit +; CHECK-BE-NEXT: ret + entry: br label %loop @@ -792,12 +1125,1435 @@ ret void } +; CHECK-LABEL: lCPI8_0: +; CHECK-NEXT: .byte 0 ; 0x0 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 1 ; 0x1 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff 
+; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI8_1: +; CHECK-NEXT: .byte 2 ; 0x2 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 3 ; 0x3 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI8_2: +; CHECK-NEXT: .byte 4 ; 0x4 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 5 ; 0x5 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI8_3: +; CHECK-NEXT: .byte 6 ; 0x6 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 7 ; 0x7 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff + +; CHECK-BE-LABEL: .LCPI8_0: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 
0xff +; CHECK-BE-NEXT: .byte 0 // 0x0 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 1 // 0x1 +; CHECK-BE-NEXT: .LCPI8_1: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 2 // 0x2 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 3 // 0x3 +; CHECK-BE-NEXT: .LCPI8_2: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 4 // 0x4 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 5 // 0x5 +; CHECK-BE-NEXT: .LCPI8_3: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 6 // 0x6 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 
255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 7 // 0x7 + +define void @zext_v8i8_to_v8i64_in_loop(i8* %src, i64* %dst) { +; CHECK-LABEL: _zext_v8i8_to_v8i64_in_loop: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: Lloh28: +; CHECK-NEXT: adrp x9, lCPI8_0@PAGE +; CHECK-NEXT: Lloh29: +; CHECK-NEXT: adrp x10, lCPI8_1@PAGE +; CHECK-NEXT: Lloh30: +; CHECK-NEXT: adrp x11, lCPI8_2@PAGE +; CHECK-NEXT: Lloh31: +; CHECK-NEXT: adrp x12, lCPI8_3@PAGE +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: Lloh32: +; CHECK-NEXT: ldr q0, [x9, lCPI8_0@PAGEOFF] +; CHECK-NEXT: Lloh33: +; CHECK-NEXT: ldr q1, [x10, lCPI8_1@PAGEOFF] +; CHECK-NEXT: Lloh34: +; CHECK-NEXT: ldr q2, [x11, lCPI8_2@PAGEOFF] +; CHECK-NEXT: Lloh35: +; CHECK-NEXT: ldr q3, [x12, lCPI8_3@PAGEOFF] +; CHECK-NEXT: LBB8_1: ; %loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr d4, [x0, x8] +; CHECK-NEXT: add x8, x8, #16 +; CHECK-NEXT: cmp x8, #128 +; CHECK-NEXT: tbl.16b v5, { v4 }, v3 +; CHECK-NEXT: tbl.16b v6, { v4 }, v2 +; CHECK-NEXT: tbl.16b v7, { v4 }, v1 +; CHECK-NEXT: tbl.16b v4, { v4 }, v0 +; CHECK-NEXT: stp q6, q5, [x1, #32] +; CHECK-NEXT: stp q4, q7, [x1], #128 +; CHECK-NEXT: b.ne LBB8_1 +; CHECK-NEXT: ; %bb.2: ; %exit +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh31, Lloh35 +; CHECK-NEXT: .loh AdrpLdr Lloh30, Lloh34 +; CHECK-NEXT: .loh AdrpLdr Lloh29, Lloh33 +; CHECK-NEXT: .loh AdrpLdr Lloh28, Lloh32 + +; CHECK-BE-LABEL: zext_v8i8_to_v8i64_in_loop: +; CHECK-BE: // %bb.0: // %entry +; CHECK-BE-NEXT: adrp x8, .LCPI8_0 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI8_0 +; CHECK-BE-NEXT: ld1 { v0.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI8_1 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI8_1 +; CHECK-BE-NEXT: ld1 { v1.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI8_2 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI8_2 +; CHECK-BE-NEXT: ld1 { v2.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI8_3 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI8_3 +; CHECK-BE-NEXT: ld1 { 
v3.16b }, [x8] +; CHECK-BE-NEXT: mov x8, xzr +; CHECK-BE-NEXT: .LBB8_1: // %loop +; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-BE-NEXT: add x9, x0, x8 +; CHECK-BE-NEXT: add x10, x1, #32 +; CHECK-BE-NEXT: add x8, x8, #16 +; CHECK-BE-NEXT: cmp x8, #128 +; CHECK-BE-NEXT: ld1 { v4.8b }, [x9] +; CHECK-BE-NEXT: add x9, x1, #48 +; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v3.16b +; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v0.16b +; CHECK-BE-NEXT: tbl v7.16b, { v4.16b }, v2.16b +; CHECK-BE-NEXT: tbl v4.16b, { v4.16b }, v1.16b +; CHECK-BE-NEXT: st1 { v5.16b }, [x9] +; CHECK-BE-NEXT: add x9, x1, #16 +; CHECK-BE-NEXT: st1 { v6.16b }, [x1] +; CHECK-BE-NEXT: add x1, x1, #128 +; CHECK-BE-NEXT: st1 { v7.16b }, [x10] +; CHECK-BE-NEXT: st1 { v4.16b }, [x9] +; CHECK-BE-NEXT: b.ne .LBB8_1 +; CHECK-BE-NEXT: // %bb.2: // %exit +; CHECK-BE-NEXT: ret + +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %src.gep = getelementptr i8, i8* %src, i64 %iv + %src.gep.cast = bitcast i8* %src.gep to <8 x i8>* + %load = load <8 x i8>, <8 x i8>* %src.gep.cast + %ext = zext <8 x i8> %load to <8 x i64> + %dst.gep = getelementptr i64, i64* %dst, i64 %iv + %dst.gep.cast = bitcast i64* %dst.gep to <8 x i64>* + store <8 x i64> %ext, <8 x i64>* %dst.gep.cast + %iv.next = add nuw i64 %iv, 16 + %ec = icmp eq i64 %iv.next, 128 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @zext_v8i8_to_v8i16_in_loop(i8* %src, i16* %dst) { +; CHECK-LABEL: _zext_v8i8_to_v8i16_in_loop: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT:LBB9_1: ; %loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr d0, [x0, x8] +; CHECK-NEXT: add x8, x8, #16 +; CHECK-NEXT: cmp x8, #128 +; CHECK-NEXT: ushll.8h v0, v0, #0 +; CHECK-NEXT: str q0, [x1], #32 +; CHECK-NEXT: b.ne LBB9_1 +; CHECK-NEXT:; %bb.2: ; %exit +; CHECK-NEXT: ret + + +; CHECK-BE-LABEL: zext_v8i8_to_v8i16_in_loop: +; CHECK-BE: // %bb.0: // %entry +; CHECK-BE-NEXT: 
mov x8, xzr +; CHECK-BE-NEXT: .LBB9_1: // %loop +; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-BE-NEXT: add x9, x0, x8 +; CHECK-BE-NEXT: add x8, x8, #16 +; CHECK-BE-NEXT: cmp x8, #128 +; CHECK-BE-NEXT: ld1 { v0.8b }, [x9] +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: st1 { v0.8h }, [x1] +; CHECK-BE-NEXT: add x1, x1, #32 +; CHECK-BE-NEXT: b.ne .LBB9_1 +; CHECK-BE-NEXT: // %bb.2: // %exit +; CHECK-BE-NEXT: ret + +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %src.gep = getelementptr i8, i8* %src, i64 %iv + %src.gep.cast = bitcast i8* %src.gep to <8 x i8>* + %load = load <8 x i8>, <8 x i8>* %src.gep.cast + %ext = zext <8 x i8> %load to <8 x i16> + %dst.gep = getelementptr i16, i16* %dst, i64 %iv + %dst.gep.cast = bitcast i16* %dst.gep to <8 x i16>* + store <8 x i16> %ext, <8 x i16>* %dst.gep.cast + %iv.next = add nuw i64 %iv, 16 + %ec = icmp eq i64 %iv.next, 128 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @zext_v8i8_to_v8i20_in_loop(i8* %src, i20* %dst) { +; CHECK-LABEL: _zext_v8i8_to_v8i20_in_loop: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: LBB10_1: ; %loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr d0, [x0, x8] +; CHECK-NEXT: add x8, x8, #16 +; CHECK-NEXT: cmp x8, #128 +; CHECK-NEXT: ushll.8h v0, v0, #0 +; CHECK-NEXT: ushll2.4s v1, v0, #0 +; CHECK-NEXT: ushll.4s v0, v0, #0 +; CHECK-NEXT: mov.s w10, v1[1] +; CHECK-NEXT: mov.s w13, v0[1] +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: mov.s w12, v1[2] +; CHECK-NEXT: fmov w15, s0 +; CHECK-NEXT: mov.s w16, v0[2] +; CHECK-NEXT: mov.s w9, v1[3] +; CHECK-NEXT: mov.s w14, v0[3] +; CHECK-NEXT: orr x10, x11, x10, lsl #20 +; CHECK-NEXT: orr x11, x15, x13, lsl #20 +; CHECK-NEXT: orr x10, x10, x12, lsl #40 +; CHECK-NEXT: orr x11, x11, x16, lsl #40 +; CHECK-NEXT: lsr x13, x9, #4 +; CHECK-NEXT: lsr x12, x14, #4 +; CHECK-NEXT: orr x9, x10, x9, lsl #60 +; CHECK-NEXT: orr x10, x11, 
x14, lsl #60 +; CHECK-NEXT: strh w13, [x1, #18] +; CHECK-NEXT: strh w12, [x1, #8] +; CHECK-NEXT: stur x9, [x1, #10] +; CHECK-NEXT: str x10, [x1], #64 +; CHECK-NEXT: b.ne LBB10_1 +; CHECK-NEXT: ; %bb.2: ; %exit +; CHECK-NEXT: ret + +; CHECK-BE-LABEL: zext_v8i8_to_v8i20_in_loop: +; CHECK-BE: // %bb.0: // %entry +; CHECK-BE-NEXT: mov x8, xzr +; CHECK-BE-NEXT: .LBB10_1: // %loop +; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-BE-NEXT: add x9, x0, x8 +; CHECK-BE-NEXT: add x8, x8, #16 +; CHECK-BE-NEXT: cmp x8, #128 +; CHECK-BE-NEXT: ld1 { v0.8b }, [x9] +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: mov w9, v1.s[1] +; CHECK-BE-NEXT: mov w11, v0.s[1] +; CHECK-BE-NEXT: mov w13, v1.s[2] +; CHECK-BE-NEXT: fmov w14, s1 +; CHECK-BE-NEXT: mov w15, v0.s[2] +; CHECK-BE-NEXT: fmov w16, s0 +; CHECK-BE-NEXT: mov w10, v1.s[3] +; CHECK-BE-NEXT: lsl x9, x9, #40 +; CHECK-BE-NEXT: mov w12, v0.s[3] +; CHECK-BE-NEXT: lsl x11, x11, #40 +; CHECK-BE-NEXT: orr x9, x9, x14, lsl #60 +; CHECK-BE-NEXT: orr x11, x11, x16, lsl #60 +; CHECK-BE-NEXT: orr x9, x9, x13, lsl #20 +; CHECK-BE-NEXT: orr x11, x11, x15, lsl #20 +; CHECK-BE-NEXT: lsr w13, w14, #4 +; CHECK-BE-NEXT: lsr w14, w16, #4 +; CHECK-BE-NEXT: strh w10, [x1, #18] +; CHECK-BE-NEXT: extr x9, x13, x9, #16 +; CHECK-BE-NEXT: strh w12, [x1, #8] +; CHECK-BE-NEXT: extr x10, x14, x11, #16 +; CHECK-BE-NEXT: stur x9, [x1, #10] +; CHECK-BE-NEXT: str x10, [x1], #64 +; CHECK-BE-NEXT: b.ne .LBB10_1 +; CHECK-BE-NEXT: // %bb.2: // %exit +; CHECK-BE-NEXT: ret + +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %src.gep = getelementptr i8, i8* %src, i64 %iv + %src.gep.cast = bitcast i8* %src.gep to <8 x i8>* + %load = load <8 x i8>, <8 x i8>* %src.gep.cast + %ext = zext <8 x i8> %load to <8 x i20> + %dst.gep = getelementptr i20, i20* %dst, i64 %iv + %dst.gep.cast = bitcast i20* %dst.gep to <8 x i20>* + store <8 x 
i20> %ext, <8 x i20>* %dst.gep.cast + %iv.next = add nuw i64 %iv, 16 + %ec = icmp eq i64 %iv.next, 128 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; CHECK-LABEL: lCPI11_0: +; CHECK-NEXT: .byte 0 ; 0x0 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 1 ; 0x1 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 2 ; 0x2 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 3 ; 0x3 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff + +; CHECK-BE-LABEL: .LCPI11_0: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 0 // 0x0 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 1 // 0x1 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 2 // 0x2 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 3 // 0x3 + +define void @zext_v4i8_to_v4i32_in_loop(i8* %src, i32* %dst) { +; CHECK-LABEL: _zext_v4i8_to_v4i32_in_loop: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: Lloh36: +; CHECK-NEXT: adrp x9, lCPI11_0@PAGE +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: Lloh37: +; CHECK-NEXT: ldr q0, [x9, lCPI11_0@PAGEOFF] +; CHECK-NEXT: LBB11_1: ; %loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr s1, [x0, x8] +; CHECK-NEXT: add x8, x8, #16 +; CHECK-NEXT: cmp x8, #128 +; CHECK-NEXT: tbl.16b v1, { v1 }, v0 +; CHECK-NEXT: str q1, [x1], #64 +; CHECK-NEXT: b.ne LBB11_1 +; CHECK-NEXT: ; %bb.2: ; %exit +; CHECK-NEXT: ret + +; CHECK-BE-LABEL: zext_v4i8_to_v4i32_in_loop: +; CHECK-BE: // %bb.0: // %entry +; 
CHECK-BE-NEXT: adrp x8, .LCPI11_0 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI11_0 +; CHECK-BE-NEXT: ld1 { v0.16b }, [x8] +; CHECK-BE-NEXT: mov x8, xzr +; CHECK-BE-NEXT: .LBB11_1: // %loop +; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-BE-NEXT: ldr s1, [x0, x8] +; CHECK-BE-NEXT: add x8, x8, #16 +; CHECK-BE-NEXT: cmp x8, #128 +; CHECK-BE-NEXT: rev32 v1.16b, v1.16b +; CHECK-BE-NEXT: tbl v1.16b, { v1.16b }, v0.16b +; CHECK-BE-NEXT: st1 { v1.16b }, [x1] +; CHECK-BE-NEXT: add x1, x1, #64 +; CHECK-BE-NEXT: b.ne .LBB11_1 +; CHECK-BE-NEXT: // %bb.2: // %exit +; CHECK-BE-NEXT: ret + +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %src.gep = getelementptr i8, i8* %src, i64 %iv + %src.gep.cast = bitcast i8* %src.gep to <4 x i8>* + %load = load <4 x i8>, <4 x i8>* %src.gep.cast + %ext = zext <4 x i8> %load to <4 x i32> + %dst.gep = getelementptr i32, i32* %dst, i64 %iv + %dst.gep.cast = bitcast i32* %dst.gep to <4 x i32>* + store <4 x i32> %ext, <4 x i32>* %dst.gep.cast + %iv.next = add nuw i64 %iv, 16 + %ec = icmp eq i64 %iv.next, 128 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; CHECK-LABEL: lCPI12_0: +; CHECK-NEXT: .byte 0 ; 0x0 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 1 ; 0x1 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 2 ; 0x2 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 3 ; 0x3 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI12_1: +; CHECK-NEXT: .byte 4 ; 0x4 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 5 ; 0x5 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 6 ; 0x6 +; 
CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 7 ; 0x7 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI12_2: +; CHECK-NEXT: .byte 8 ; 0x8 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 9 ; 0x9 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 10 ; 0xa +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 11 ; 0xb +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff + +; CHECK-BE-LABEL: .LCPI12_0: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 0 // 0x0 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 1 // 0x1 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 2 // 0x2 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 3 // 0x3 +; CHECK-BE-NEXT: .LCPI12_1: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 4 // 0x4 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 5 // 0x5 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 6 // 0x6 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 7 // 0x7 +; CHECK-BE-NEXT: .LCPI12_2: +; CHECK-BE-NEXT: .byte 255 // 0xff 
+; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 8 // 0x8 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 9 // 0x9 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 10 // 0xa +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 11 // 0xb + +define void @zext_v12i8_to_v12i32_in_loop(i8* %src, i32* %dst) { +; CHECK-LABEL: _zext_v12i8_to_v12i32_in_loop: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: Lloh38: +; CHECK-NEXT: adrp x9, lCPI12_0@PAGE +; CHECK-NEXT: Lloh39: +; CHECK-NEXT: adrp x10, lCPI12_1@PAGE +; CHECK-NEXT: Lloh40: +; CHECK-NEXT: adrp x11, lCPI12_2@PAGE +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: Lloh41: +; CHECK-NEXT: ldr q0, [x9, lCPI12_0@PAGEOFF] +; CHECK-NEXT: Lloh42: +; CHECK-NEXT: ldr q1, [x10, lCPI12_1@PAGEOFF] +; CHECK-NEXT: Lloh43: +; CHECK-NEXT: ldr q2, [x11, lCPI12_2@PAGEOFF] +; CHECK-NEXT: LBB12_1: ; %loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr q3, [x0, x8] +; CHECK-NEXT: add x8, x8, #16 +; CHECK-NEXT: cmp x8, #128 +; CHECK-NEXT: tbl.16b v4, { v3 }, v2 +; CHECK-NEXT: tbl.16b v5, { v3 }, v1 +; CHECK-NEXT: tbl.16b v3, { v3 }, v0 +; CHECK-NEXT: stp q5, q4, [x1, #16] +; CHECK-NEXT: str q3, [x1], #64 +; CHECK-NEXT: b.ne LBB12_1 +; CHECK-NEXT: ; %bb.2: ; %exit +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh40, Lloh43 +; CHECK-NEXT: .loh AdrpLdr Lloh39, Lloh42 +; CHECK-NEXT: .loh AdrpLdr Lloh38, Lloh41 + +; CHECK-BE-LABEL: zext_v12i8_to_v12i32_in_loop: +; CHECK-BE: // %bb.0: // %entry +; CHECK-BE-NEXT: adrp x8, .LCPI12_0 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI12_0 +; CHECK-BE-NEXT: ld1 { v0.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI12_1 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI12_1 +; CHECK-BE-NEXT: ld1 { v1.16b }, [x8] +; 
CHECK-BE-NEXT: adrp x8, .LCPI12_2 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI12_2 +; CHECK-BE-NEXT: ld1 { v2.16b }, [x8] +; CHECK-BE-NEXT: mov x8, xzr +; CHECK-BE-NEXT: .LBB12_1: // %loop +; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-BE-NEXT: add x9, x0, x8 +; CHECK-BE-NEXT: add x10, x1, #16 +; CHECK-BE-NEXT: add x8, x8, #16 +; CHECK-BE-NEXT: cmp x8, #128 +; CHECK-BE-NEXT: ld1 { v3.16b }, [x9] +; CHECK-BE-NEXT: add x9, x1, #32 +; CHECK-BE-NEXT: tbl v4.16b, { v3.16b }, v0.16b +; CHECK-BE-NEXT: tbl v5.16b, { v3.16b }, v2.16b +; CHECK-BE-NEXT: tbl v3.16b, { v3.16b }, v1.16b +; CHECK-BE-NEXT: st1 { v4.16b }, [x1] +; CHECK-BE-NEXT: add x1, x1, #64 +; CHECK-BE-NEXT: st1 { v5.16b }, [x9] +; CHECK-BE-NEXT: st1 { v3.16b }, [x10] +; CHECK-BE-NEXT: b.ne .LBB12_1 +; CHECK-BE-NEXT: // %bb.2: // %exit +; CHECK-BE-NEXT: ret + +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %src.gep = getelementptr i8, i8* %src, i64 %iv + %src.gep.cast = bitcast i8* %src.gep to <12 x i8>* + %load = load <12 x i8>, <12 x i8>* %src.gep.cast + %ext = zext <12 x i8> %load to <12 x i32> + %dst.gep = getelementptr i32, i32* %dst, i64 %iv + %dst.gep.cast = bitcast i32* %dst.gep to <12 x i32>* + store <12 x i32> %ext, <12 x i32>* %dst.gep.cast + %iv.next = add nuw i64 %iv, 16 + %ec = icmp eq i64 %iv.next, 128 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @zext_v16i4_to_v16i32_in_loop(i4* %src, i32* %dst) { +; CHECK-LABEL: _zext_v16i4_to_v16i32_in_loop: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: movi.4s v0, #15 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: LBB13_1: ; %loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr x9, [x0, x8] +; CHECK-NEXT: add x8, x8, #16 +; CHECK-NEXT: cmp x8, #128 +; CHECK-NEXT: and w10, w9, #0xf +; CHECK-NEXT: ubfx w11, w9, #4, #4 +; CHECK-NEXT: fmov s1, w10 +; CHECK-NEXT: ubfx w10, w9, #8, #4 +; CHECK-NEXT: mov.b v1[1], w11 +; CHECK-NEXT: mov.b v1[2], w10 +; CHECK-NEXT: ubfx 
w10, w9, #12, #4 +; CHECK-NEXT: mov.b v1[3], w10 +; CHECK-NEXT: ubfx w10, w9, #16, #4 +; CHECK-NEXT: mov.b v1[4], w10 +; CHECK-NEXT: ubfx w10, w9, #20, #4 +; CHECK-NEXT: mov.b v1[5], w10 +; CHECK-NEXT: ubfx w10, w9, #24, #4 +; CHECK-NEXT: mov.b v1[6], w10 +; CHECK-NEXT: ubfx x10, x9, #28, #4 +; CHECK-NEXT: mov.b v1[7], w10 +; CHECK-NEXT: ubfx x10, x9, #32, #4 +; CHECK-NEXT: mov.b v1[8], w10 +; CHECK-NEXT: ubfx x10, x9, #36, #4 +; CHECK-NEXT: mov.b v1[9], w10 +; CHECK-NEXT: ubfx x10, x9, #40, #4 +; CHECK-NEXT: mov.b v1[10], w10 +; CHECK-NEXT: ubfx x10, x9, #44, #4 +; CHECK-NEXT: mov.b v1[11], w10 +; CHECK-NEXT: ubfx x10, x9, #48, #4 +; CHECK-NEXT: mov.b v1[12], w10 +; CHECK-NEXT: ubfx x10, x9, #52, #4 +; CHECK-NEXT: mov.b v1[13], w10 +; CHECK-NEXT: ubfx x10, x9, #56, #4 +; CHECK-NEXT: lsr x9, x9, #60 +; CHECK-NEXT: mov.b v1[14], w10 +; CHECK-NEXT: mov.b v1[15], w9 +; CHECK-NEXT: ext.16b v2, v1, v1, #8 +; CHECK-NEXT: zip2.8b v3, v1, v0 +; CHECK-NEXT: zip1.8b v1, v1, v0 +; CHECK-NEXT: zip1.8b v4, v2, v0 +; CHECK-NEXT: zip2.8b v2, v2, v0 +; CHECK-NEXT: ushll.4s v3, v3, #0 +; CHECK-NEXT: ushll.4s v1, v1, #0 +; CHECK-NEXT: and.16b v3, v3, v0 +; CHECK-NEXT: and.16b v1, v1, v0 +; CHECK-NEXT: stp q1, q3, [x1] +; CHECK-NEXT: ushll.4s v1, v2, #0 +; CHECK-NEXT: ushll.4s v2, v4, #0 +; CHECK-NEXT: and.16b v1, v1, v0 +; CHECK-NEXT: and.16b v2, v2, v0 +; CHECK-NEXT: stp q2, q1, [x1, #32] +; CHECK-NEXT: add x1, x1, #64 +; CHECK-NEXT: b.ne LBB13_1 +; CHECK-NEXT: ; %bb.2: ; %exit +; CHECK-NEXT: ret + +; CHECK-BE-LABEL: zext_v16i4_to_v16i32_in_loop: +; CHECK-BE: // %bb.0: // %entry +; CHECK-BE-NEXT: movi v0.4s, #15 +; CHECK-BE-NEXT: mov x8, xzr +; CHECK-BE-NEXT: .LBB13_1: // %loop +; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-BE-NEXT: ldr x9, [x0, x8] +; CHECK-BE-NEXT: add x8, x8, #16 +; CHECK-BE-NEXT: cmp x8, #128 +; CHECK-BE-NEXT: lsr x10, x9, #60 +; CHECK-BE-NEXT: ubfx x11, x9, #56, #4 +; CHECK-BE-NEXT: fmov s1, w10 +; CHECK-BE-NEXT: ubfx x10, x9, #52, #4 +; 
CHECK-BE-NEXT: mov v1.b[1], w11 +; CHECK-BE-NEXT: add x11, x1, #32 +; CHECK-BE-NEXT: mov v1.b[2], w10 +; CHECK-BE-NEXT: ubfx x10, x9, #48, #4 +; CHECK-BE-NEXT: mov v1.b[3], w10 +; CHECK-BE-NEXT: ubfx x10, x9, #44, #4 +; CHECK-BE-NEXT: mov v1.b[4], w10 +; CHECK-BE-NEXT: ubfx x10, x9, #40, #4 +; CHECK-BE-NEXT: mov v1.b[5], w10 +; CHECK-BE-NEXT: ubfx x10, x9, #36, #4 +; CHECK-BE-NEXT: mov v1.b[6], w10 +; CHECK-BE-NEXT: ubfx x10, x9, #32, #4 +; CHECK-BE-NEXT: mov v1.b[7], w10 +; CHECK-BE-NEXT: ubfx x10, x9, #28, #4 +; CHECK-BE-NEXT: mov v1.b[8], w10 +; CHECK-BE-NEXT: ubfx w10, w9, #24, #4 +; CHECK-BE-NEXT: mov v1.b[9], w10 +; CHECK-BE-NEXT: ubfx w10, w9, #20, #4 +; CHECK-BE-NEXT: mov v1.b[10], w10 +; CHECK-BE-NEXT: ubfx w10, w9, #16, #4 +; CHECK-BE-NEXT: mov v1.b[11], w10 +; CHECK-BE-NEXT: ubfx w10, w9, #12, #4 +; CHECK-BE-NEXT: mov v1.b[12], w10 +; CHECK-BE-NEXT: ubfx w10, w9, #8, #4 +; CHECK-BE-NEXT: mov v1.b[13], w10 +; CHECK-BE-NEXT: ubfx w10, w9, #4, #4 +; CHECK-BE-NEXT: and w9, w9, #0xf +; CHECK-BE-NEXT: mov v1.b[14], w10 +; CHECK-BE-NEXT: add x10, x1, #48 +; CHECK-BE-NEXT: mov v1.b[15], w9 +; CHECK-BE-NEXT: add x9, x1, #16 +; CHECK-BE-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; CHECK-BE-NEXT: zip2 v3.8b, v1.8b, v0.8b +; CHECK-BE-NEXT: zip1 v1.8b, v1.8b, v0.8b +; CHECK-BE-NEXT: zip1 v4.8b, v2.8b, v0.8b +; CHECK-BE-NEXT: zip2 v2.8b, v2.8b, v0.8b +; CHECK-BE-NEXT: rev16 v1.8b, v1.8b +; CHECK-BE-NEXT: rev16 v3.8b, v3.8b +; CHECK-BE-NEXT: rev16 v4.8b, v4.8b +; CHECK-BE-NEXT: rev16 v2.8b, v2.8b +; CHECK-BE-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-BE-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-BE-NEXT: and v1.16b, v1.16b, v0.16b +; CHECK-BE-NEXT: st1 { v1.4s }, [x1] +; CHECK-BE-NEXT: add x1, x1, #64 +; CHECK-BE-NEXT: ushll v1.4s, v2.4h, #0 +; CHECK-BE-NEXT: ushll v2.4s, v4.4h, #0 +; CHECK-BE-NEXT: and v3.16b, v3.16b, v0.16b +; CHECK-BE-NEXT: and v1.16b, v1.16b, v0.16b +; CHECK-BE-NEXT: st1 { v3.4s }, [x9] +; CHECK-BE-NEXT: and v2.16b, v2.16b, v0.16b +; CHECK-BE-NEXT: st1 { v1.4s }, 
[x10] +; CHECK-BE-NEXT: st1 { v2.4s }, [x11] +; CHECK-BE-NEXT: b.ne .LBB13_1 +; CHECK-BE-NEXT: // %bb.2: // %exit +; CHECK-BE-NEXT: ret + +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %src.gep = getelementptr i4, i4* %src, i64 %iv + %src.gep.cast = bitcast i4* %src.gep to <16 x i4>* + %load = load <16 x i4>, <16 x i4>* %src.gep.cast + %ext = zext <16 x i4> %load to <16 x i32> + %dst.gep = getelementptr i32, i32* %dst, i64 %iv + %dst.gep.cast = bitcast i32* %dst.gep to <16 x i32>* + store <16 x i32> %ext, <16 x i32>* %dst.gep.cast + %iv.next = add nuw i64 %iv, 16 + %ec = icmp eq i64 %iv.next, 128 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @zext_v16i16_to_v16i64_in_loop(i16* %src, i64* %dst) { +; CHECK-LABEL: _zext_v16i16_to_v16i64_in_loop: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: LBB14_1: ; %loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add x9, x0, x8 +; CHECK-NEXT: add x8, x8, #32 +; CHECK-NEXT: cmp x8, #256 +; CHECK-NEXT: ldp q0, q1, [x9] +; CHECK-NEXT: ushll.4s v2, v0, #0 +; CHECK-NEXT: ushll2.4s v0, v0, #0 +; CHECK-NEXT: ushll.4s v3, v1, #0 +; CHECK-NEXT: ushll2.4s v1, v1, #0 +; CHECK-NEXT: ushll2.2d v5, v0, #0 +; CHECK-NEXT: ushll2.2d v4, v1, #0 +; CHECK-NEXT: ushll.2d v1, v1, #0 +; CHECK-NEXT: ushll.2d v0, v0, #0 +; CHECK-NEXT: stp q1, q4, [x1, #96] +; CHECK-NEXT: ushll2.2d v1, v3, #0 +; CHECK-NEXT: stp q0, q5, [x1, #32] +; CHECK-NEXT: ushll.2d v3, v3, #0 +; CHECK-NEXT: ushll2.2d v0, v2, #0 +; CHECK-NEXT: stp q3, q1, [x1, #64] +; CHECK-NEXT: ushll.2d v1, v2, #0 +; CHECK-NEXT: stp q1, q0, [x1], #128 +; CHECK-NEXT: b.ne LBB14_1 +; CHECK-NEXT: ; %bb.2: ; %exit +; CHECK-NEXT: ret + +; CHECK-BE-LABEL: zext_v16i16_to_v16i64_in_loop: +; CHECK-BE: // %bb.0: // %entry +; CHECK-BE-NEXT: mov x8, xzr +; CHECK-BE-NEXT: .LBB14_1: // %loop +; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-BE-NEXT: add x9, x0, x8 +; CHECK-BE-NEXT: add 
x10, x1, #48 +; CHECK-BE-NEXT: add x8, x8, #32 +; CHECK-BE-NEXT: cmp x8, #256 +; CHECK-BE-NEXT: ld1 { v0.8h }, [x9] +; CHECK-BE-NEXT: add x9, x9, #16 +; CHECK-BE-NEXT: ld1 { v2.8h }, [x9] +; CHECK-BE-NEXT: add x9, x1, #32 +; CHECK-BE-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: ushll2 v3.2d, v1.4s, #0 +; CHECK-BE-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-BE-NEXT: st1 { v3.2d }, [x10] +; CHECK-BE-NEXT: add x10, x1, #112 +; CHECK-BE-NEXT: st1 { v1.2d }, [x9] +; CHECK-BE-NEXT: add x9, x1, #16 +; CHECK-BE-NEXT: ushll2 v3.2d, v0.4s, #0 +; CHECK-BE-NEXT: ushll2 v1.4s, v2.8h, #0 +; CHECK-BE-NEXT: st1 { v3.2d }, [x9] +; CHECK-BE-NEXT: add x9, x1, #96 +; CHECK-BE-NEXT: ushll2 v4.2d, v1.4s, #0 +; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-BE-NEXT: st1 { v4.2d }, [x10] +; CHECK-BE-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-BE-NEXT: add x10, x1, #80 +; CHECK-BE-NEXT: st1 { v0.2d }, [x1] +; CHECK-BE-NEXT: st1 { v1.2d }, [x9] +; CHECK-BE-NEXT: add x9, x1, #64 +; CHECK-BE-NEXT: add x1, x1, #128 +; CHECK-BE-NEXT: ushll v3.2d, v2.2s, #0 +; CHECK-BE-NEXT: ushll2 v2.2d, v2.4s, #0 +; CHECK-BE-NEXT: st1 { v3.2d }, [x9] +; CHECK-BE-NEXT: st1 { v2.2d }, [x10] +; CHECK-BE-NEXT: b.ne .LBB14_1 +; CHECK-BE-NEXT: // %bb.2: // %exit +; CHECK-BE-NEXT: ret + +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %src.gep = getelementptr i16, i16* %src, i64 %iv + %src.gep.cast = bitcast i16* %src.gep to <16 x i16>* + %load = load <16 x i16>, <16 x i16>* %src.gep.cast + %ext = zext <16 x i16> %load to <16 x i64> + %dst.gep = getelementptr i64, i64* %dst, i64 %iv + %dst.gep.cast = bitcast i64* %dst.gep to <16 x i64>* + store <16 x i64> %ext, <16 x i64>* %dst.gep.cast + %iv.next = add nuw i64 %iv, 16 + %ec = icmp eq i64 %iv.next, 128 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @zext_v16i32_to_v16i64_in_loop(i32* %src, i64* %dst) { +; CHECK-LABEL: 
_zext_v16i32_to_v16i64_in_loop: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: LBB15_1: ; %loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add x9, x0, x8 +; CHECK-NEXT: add x8, x8, #64 +; CHECK-NEXT: cmp x8, #512 +; CHECK-NEXT: ldp q1, q0, [x9, #32] +; CHECK-NEXT: ushll2.2d v5, v1, #0 +; CHECK-NEXT: ushll.2d v1, v1, #0 +; CHECK-NEXT: ldp q3, q2, [x9] +; CHECK-NEXT: ushll2.2d v4, v0, #0 +; CHECK-NEXT: stp q1, q5, [x1, #64] +; CHECK-NEXT: ushll.2d v0, v0, #0 +; CHECK-NEXT: stp q0, q4, [x1, #96] +; CHECK-NEXT: ushll2.2d v1, v3, #0 +; CHECK-NEXT: ushll2.2d v0, v2, #0 +; CHECK-NEXT: ushll.2d v2, v2, #0 +; CHECK-NEXT: stp q2, q0, [x1, #32] +; CHECK-NEXT: ushll.2d v0, v3, #0 +; CHECK-NEXT: stp q0, q1, [x1], #128 +; CHECK-NEXT: b.ne LBB15_1 +; CHECK-NEXT: ; %bb.2: ; %exit +; CHECK-NEXT: ret + +; CHECK-BE-LABEL: zext_v16i32_to_v16i64_in_loop: +; CHECK-BE: // %bb.0: // %entry +; CHECK-BE-NEXT: mov x8, xzr +; CHECK-BE-NEXT: .LBB15_1: // %loop +; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-BE-NEXT: add x9, x0, x8 +; CHECK-BE-NEXT: add x8, x8, #64 +; CHECK-BE-NEXT: add x10, x9, #48 +; CHECK-BE-NEXT: add x11, x9, #32 +; CHECK-BE-NEXT: cmp x8, #512 +; CHECK-BE-NEXT: ld1 { v0.4s }, [x9] +; CHECK-BE-NEXT: add x9, x9, #16 +; CHECK-BE-NEXT: ld1 { v1.4s }, [x10] +; CHECK-BE-NEXT: add x10, x1, #16 +; CHECK-BE-NEXT: ld1 { v2.4s }, [x11] +; CHECK-BE-NEXT: ushll2 v3.2d, v0.4s, #0 +; CHECK-BE-NEXT: ld1 { v4.4s }, [x9] +; CHECK-BE-NEXT: add x9, x1, #112 +; CHECK-BE-NEXT: st1 { v3.2d }, [x10] +; CHECK-BE-NEXT: add x10, x1, #80 +; CHECK-BE-NEXT: ushll2 v3.2d, v1.4s, #0 +; CHECK-BE-NEXT: ushll2 v5.2d, v2.4s, #0 +; CHECK-BE-NEXT: st1 { v3.2d }, [x9] +; CHECK-BE-NEXT: add x9, x1, #48 +; CHECK-BE-NEXT: st1 { v5.2d }, [x10] +; CHECK-BE-NEXT: add x10, x1, #96 +; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ushll v3.2d, v4.2s, #0 +; CHECK-BE-NEXT: ushll2 v4.2d, v4.4s, #0 +; CHECK-BE-NEXT: st1 { v0.2d }, [x1] +; CHECK-BE-NEXT: 
ushll v1.2d, v1.2s, #0 +; CHECK-BE-NEXT: st1 { v4.2d }, [x9] +; CHECK-BE-NEXT: add x9, x1, #64 +; CHECK-BE-NEXT: st1 { v1.2d }, [x10] +; CHECK-BE-NEXT: add x10, x1, #32 +; CHECK-BE-NEXT: add x1, x1, #128 +; CHECK-BE-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-BE-NEXT: st1 { v3.2d }, [x10] +; CHECK-BE-NEXT: st1 { v2.2d }, [x9] +; CHECK-BE-NEXT: b.ne .LBB15_1 +; CHECK-BE-NEXT: // %bb.2: // %exit +; CHECK-BE-NEXT: ret + +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %src.gep = getelementptr i32, i32* %src, i64 %iv + %src.gep.cast = bitcast i32* %src.gep to <16 x i32>* + %load = load <16 x i32>, <16 x i32>* %src.gep.cast + %ext = zext <16 x i32> %load to <16 x i64> + %dst.gep = getelementptr i64, i64* %dst, i64 %iv + %dst.gep.cast = bitcast i64* %dst.gep to <16 x i64>* + store <16 x i64> %ext, <16 x i64>* %dst.gep.cast + %iv.next = add nuw i64 %iv, 16 + %ec = icmp eq i64 %iv.next, 128 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +; CHECK-LABEL: lCPI16_0: +; CHECK-NEXT: .byte 0 ; 0x0 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI16_1: +; CHECK-NEXT: .byte 1 ; 0x1 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; 
CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI16_2: +; CHECK-NEXT: .byte 2 ; 0x2 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI16_3: +; CHECK-NEXT: .byte 3 ; 0x3 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI16_4: +; CHECK-NEXT: .byte 4 ; 0x4 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI16_5: +; CHECK-NEXT: .byte 5 ; 0x5 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 
0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI16_6: +; CHECK-NEXT: .byte 6 ; 0x6 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI16_7: +; CHECK-NEXT: .byte 7 ; 0x7 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff + +; CHECK-BE-LABEL: .LCPI16_0: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 0 // 0x0 +; CHECK-BE-NEXT: .LCPI16_1: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 
// 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 1 // 0x1 +; CHECK-BE-NEXT: .LCPI16_2: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 2 // 0x2 +; CHECK-BE-NEXT: .LCPI16_3: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 3 // 0x3 +; CHECK-BE-NEXT: .LCPI16_4: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; 
CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 4 // 0x4 +; CHECK-BE-NEXT: .LCPI16_5: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 5 // 0x5 +; CHECK-BE-NEXT: .LCPI16_6: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 6 // 0x6 +; CHECK-BE-NEXT: .LCPI16_7: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; 
CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 7 // 0x7
+
+define void @zext_v8i8_to_v8i128_in_loop(i8* %src, i128* %dst) { ; zext <8 x i8> -> <8 x i128> inside a loop; the expected asm uses 8 tbl lookups per iteration
+; CHECK-LABEL: zext_v8i8_to_v8i128_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: Lloh44:
+; CHECK-NEXT: adrp x9, lCPI16_0@PAGE
+; CHECK-NEXT: Lloh45:
+; CHECK-NEXT: adrp x10, lCPI16_1@PAGE
+; CHECK-NEXT: Lloh46:
+; CHECK-NEXT: adrp x11, lCPI16_2@PAGE
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: Lloh47:
+; CHECK-NEXT: ldr q0, [x9, lCPI16_0@PAGEOFF]
+; CHECK-NEXT: Lloh48:
+; CHECK-NEXT: adrp x9, lCPI16_3@PAGE
+; CHECK-NEXT: Lloh49:
+; CHECK-NEXT: ldr q1, [x10, lCPI16_1@PAGEOFF]
+; CHECK-NEXT: Lloh50:
+; CHECK-NEXT: adrp x10, lCPI16_4@PAGE
+; CHECK-NEXT: Lloh51:
+; CHECK-NEXT: ldr q2, [x11, lCPI16_2@PAGEOFF]
+; CHECK-NEXT: Lloh52:
+; CHECK-NEXT: adrp x11, lCPI16_5@PAGE
+; CHECK-NEXT: Lloh53:
+; CHECK-NEXT: ldr q3, [x9, lCPI16_3@PAGEOFF]
+; CHECK-NEXT: Lloh54:
+; CHECK-NEXT: adrp x9, lCPI16_6@PAGE
+; CHECK-NEXT: Lloh55:
+; CHECK-NEXT: ldr q4, [x10, lCPI16_4@PAGEOFF]
+; CHECK-NEXT: Lloh56:
+; CHECK-NEXT: adrp x10, lCPI16_7@PAGE
+; CHECK-NEXT: Lloh57:
+; CHECK-NEXT: ldr q5, [x11, lCPI16_5@PAGEOFF]
+; CHECK-NEXT: Lloh58:
+; CHECK-NEXT: ldr q6, [x9, lCPI16_6@PAGEOFF]
+; CHECK-NEXT: Lloh59:
+; CHECK-NEXT: ldr q7, [x10, lCPI16_7@PAGEOFF]
+; CHECK-NEXT: LBB16_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr d16, [x0, x8]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: tbl.16b v17, { v16 }, v7
+; CHECK-NEXT: tbl.16b v18, { v16 }, v6
+; CHECK-NEXT: tbl.16b v19, { v16 }, v5
+; CHECK-NEXT: tbl.16b v20, { v16 }, v4
+; CHECK-NEXT: tbl.16b v21, { v16 }, v3
+; CHECK-NEXT: stp q18, q17, [x1, #96]
+; CHECK-NEXT: tbl.16b v17, { v16 }, v2
+; CHECK-NEXT: tbl.16b v18, { v16 }, v1
+; CHECK-NEXT: stp q20, q19, [x1, #64]
+; CHECK-NEXT: tbl.16b v16, { v16 }, v0
+; CHECK-NEXT: stp q17, q21, [x1, #32]
+; CHECK-NEXT: stp q16, q18, [x1], #256
+; CHECK-NEXT: b.ne LBB16_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+; CHECK-NEXT: .loh AdrpLdr Lloh56, Lloh59
+; CHECK-NEXT: .loh AdrpLdr Lloh54, Lloh58
+; CHECK-NEXT: .loh AdrpLdr Lloh52, Lloh57
+; CHECK-NEXT: .loh AdrpAdrp Lloh50, Lloh56
+; CHECK-NEXT: .loh AdrpLdr Lloh50, Lloh55
+; CHECK-NEXT: .loh AdrpAdrp Lloh48, Lloh54
+; CHECK-NEXT: .loh AdrpLdr Lloh48, Lloh53
+; CHECK-NEXT: .loh AdrpAdrp Lloh46, Lloh52
+; CHECK-NEXT: .loh AdrpLdr Lloh46, Lloh51
+; CHECK-NEXT: .loh AdrpAdrp Lloh45, Lloh50
+; CHECK-NEXT: .loh AdrpLdr Lloh45, Lloh49
+; CHECK-NEXT: .loh AdrpAdrp Lloh44, Lloh48
+; CHECK-NEXT: .loh AdrpLdr Lloh44, Lloh47
+
+; CHECK-BE-LABEL: zext_v8i8_to_v8i128_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: adrp x8, .LCPI16_0
+; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI16_0
+; CHECK-BE-NEXT: ld1 { v0.16b }, [x8]
+; CHECK-BE-NEXT: adrp x8, .LCPI16_1
+; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI16_1
+; CHECK-BE-NEXT: ld1 { v1.16b }, [x8]
+; CHECK-BE-NEXT: adrp x8, .LCPI16_2
+; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI16_2
+; CHECK-BE-NEXT: ld1 { v2.16b }, [x8]
+; CHECK-BE-NEXT: adrp x8, .LCPI16_3
+; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI16_3
+; CHECK-BE-NEXT: ld1 { v3.16b }, [x8]
+; CHECK-BE-NEXT: adrp x8, .LCPI16_4
+; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI16_4
+; CHECK-BE-NEXT: ld1 { v4.16b }, [x8]
+; CHECK-BE-NEXT: adrp x8, .LCPI16_5
+; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI16_5
+; CHECK-BE-NEXT: ld1 { v5.16b }, [x8]
+; CHECK-BE-NEXT: adrp x8, .LCPI16_6
+; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI16_6
+; CHECK-BE-NEXT: ld1 { v6.16b }, [x8]
+; CHECK-BE-NEXT: adrp x8, .LCPI16_7
+; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI16_7
+; CHECK-BE-NEXT: ld1 { v7.16b }, [x8]
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB16_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x10, x1, #96
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: ld1 { v16.8b }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #112
+; CHECK-BE-NEXT: tbl v17.16b, { v16.16b }, v7.16b
+; CHECK-BE-NEXT: tbl v18.16b, { v16.16b }, v6.16b
+; CHECK-BE-NEXT: tbl v19.16b, { v16.16b }, v5.16b
+; CHECK-BE-NEXT: st1 { v17.16b }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #80
+; CHECK-BE-NEXT: tbl v17.16b, { v16.16b }, v4.16b
+; CHECK-BE-NEXT: st1 { v18.16b }, [x10]
+; CHECK-BE-NEXT: add x10, x1, #64
+; CHECK-BE-NEXT: st1 { v19.16b }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #48
+; CHECK-BE-NEXT: tbl v18.16b, { v16.16b }, v3.16b
+; CHECK-BE-NEXT: tbl v19.16b, { v16.16b }, v0.16b
+; CHECK-BE-NEXT: st1 { v17.16b }, [x10]
+; CHECK-BE-NEXT: tbl v17.16b, { v16.16b }, v2.16b
+; CHECK-BE-NEXT: add x10, x1, #32
+; CHECK-BE-NEXT: tbl v16.16b, { v16.16b }, v1.16b
+; CHECK-BE-NEXT: st1 { v18.16b }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #16
+; CHECK-BE-NEXT: st1 { v19.16b }, [x1]
+; CHECK-BE-NEXT: add x1, x1, #256
+; CHECK-BE-NEXT: st1 { v17.16b }, [x10]
+; CHECK-BE-NEXT: st1 { v16.16b }, [x9]
+; CHECK-BE-NEXT: b.ne .LBB16_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %src.gep = getelementptr i8, i8* %src, i64 %iv
+  %src.gep.cast = bitcast i8* %src.gep to <8 x i8>*
+  %load = load <8 x i8>, <8 x i8>* %src.gep.cast ; 8 source bytes per iteration
+  %ext = zext <8 x i8> %load to <8 x i128> ; instruction under test: each i8 widened to 128 bits
+  %dst.gep = getelementptr i128, i128* %dst, i64 %iv ; indexed in i128 units, so %dst moves 256 bytes per iteration (the #256 step in the expected asm)
+  %dst.gep.cast = bitcast i128* %dst.gep to <8 x i128>*
+  store <8 x i128> %ext, <8 x i128>* %dst.gep.cast
+  %iv.next = add nuw i64 %iv, 16
+  %ec = icmp eq i64 %iv.next, 128 ; leaves the loop after 8 iterations
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+
 define void @zext_v16i8_to_v16i32_in_loop_scalable_vectors(i8* %src, i32* %dst) {
 ; CHECK-LABEL: zext_v16i8_to_v16i32_in_loop_scalable_vectors:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: mov x8, xzr
 ; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: LBB8_1: ; %loop
+; CHECK-NEXT: LBB17_1: ; %loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: add x9, x0, x8
 ; CHECK-NEXT: ld1b {
z0.s }, p0/z, [x0, x8]
@@ -815,7 +2571,7 @@
 ; CHECK-NEXT: st1w { z1.s }, p0, [x9, #2, mul vl]
 ; CHECK-NEXT: st1w { z2.s }, p0, [x9, #3, mul vl]
 ; CHECK-NEXT: st1w { z0.s }, p0, [x9, #1, mul vl]
-; CHECK-NEXT: b.ne LBB8_1
+; CHECK-NEXT: b.ne LBB17_1
 ; CHECK-NEXT: ; %bb.2: ; %exit
 ; CHECK-NEXT: ret
 ;
@@ -823,7 +2579,7 @@
 ; CHECK-BE: // %bb.0: // %entry
 ; CHECK-BE-NEXT: mov x8, xzr
 ; CHECK-BE-NEXT: ptrue p0.s
-; CHECK-BE-NEXT: .LBB8_1: // %loop
+; CHECK-BE-NEXT: .LBB17_1: // %loop
 ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT: add x9, x0, x8
 ; CHECK-BE-NEXT: ld1b { z0.s }, p0/z, [x0, x8]
@@ -841,7 +2597,7 @@
 ; CHECK-BE-NEXT: st1w { z1.s }, p0, [x9, #2, mul vl]
 ; CHECK-BE-NEXT: st1w { z2.s }, p0, [x9, #3, mul vl]
 ; CHECK-BE-NEXT: st1w { z0.s }, p0, [x9, #1, mul vl]
-; CHECK-BE-NEXT: b.ne .LBB8_1
+; CHECK-BE-NEXT: b.ne .LBB17_1
 ; CHECK-BE-NEXT: // %bb.2: // %exit
 ; CHECK-BE-NEXT: ret
 entry:
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll
--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll
@@ -138,9 +138,10 @@
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[IV]]
 ; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[SRC_GEP]], align 16
-; CHECK-NEXT: [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i64>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> {{<.*>}}, <128 x i32> {{<.*>}}
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <128 x i8> [[TMP0]] to <16 x i64>
 ; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr i64, ptr [[DST:%.*]], i64 [[IV]]
-; CHECK-NEXT: store <16 x i64> [[EXT]], ptr [[DST_GEP]], align 128
+; CHECK-NEXT: store <16 x i64> [[TMP1]], ptr [[DST_GEP]], align 128
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16
 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
 ; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]