diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13604,17 +13604,27 @@ Value *Op = ZExt->getOperand(0); auto *SrcTy = dyn_cast(Op->getType()); auto *DstTy = dyn_cast(ZExt->getType()); + unsigned ZExtFactor = + (dyn_cast(DstTy->getElementType())->getBitWidth()) / + (dyn_cast(SrcTy->getElementType())->getBitWidth()); unsigned NumElts = SrcTy->getNumElements(); IRBuilder<> Builder(ZExt); - SmallVector Mask(4 * NumElts, NumElts); + SmallVector Mask; // Create a mask that selects <0,0,0,Op[i]> for each lane of vector of i32 to // replace the original ZExt. This can later be lowered to a set of tbl // instructions. - for (unsigned i = 0; i < NumElts; i++) { - if (IsLittleEndian) - Mask[i * 4] = i; - else - Mask[i * 4 + 3] = i; + for (unsigned i = 0; i < NumElts * ZExtFactor; i++) { + if (IsLittleEndian) { + if (i % ZExtFactor == 0) + Mask.push_back(i / ZExtFactor); + else + Mask.push_back(NumElts); + } else { + if ((i + 1) % ZExtFactor == 0) + Mask.push_back((i - ZExtFactor + 1) / ZExtFactor); + else + Mask.push_back(NumElts); + } } auto *FirstEltZero = Builder.CreateInsertElement( @@ -13685,7 +13695,9 @@ auto *ZExt = dyn_cast(I); if (ZExt && (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) && SrcTy->getElementType()->isIntegerTy(8) && - DstTy->getElementType()->isIntegerTy(32)) { + (DstTy->getElementType()->isIntegerTy(16) || + DstTy->getElementType()->isIntegerTy(32) || + DstTy->getElementType()->isIntegerTy(64))) { createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian()); return true; } diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -427,23 +427,25 @@ ; CHECK-NEXT: mov w0, wzr 
; CHECK-NEXT: ret ; CHECK-NEXT: .LBB5_4: // %vector.ph -; CHECK-NEXT: dup v2.8b, w9 -; CHECK-NEXT: and x11, x10, #0xfffffff0 -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: add x8, x0, #8 -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: mov x12, x11 -; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: .LBB5_5: // %vector.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldp d3, d4, [x8, #-8] -; CHECK-NEXT: subs x12, x12, #16 -; CHECK-NEXT: add x8, x8, #16 -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: ushll v4.8h, v4.8b, #0 -; CHECK-NEXT: mla v0.8h, v2.8h, v3.8h -; CHECK-NEXT: mla v1.8h, v2.8h, v4.8h -; CHECK-NEXT: b.ne .LBB5_5 +; CHECK-NEXT: adrp x12, .LCPI5_0 +; CHECK-NEXT: dup v3.8b, w9 +; CHECK-NEXT: and x11, x10, #0xfffffff0 +; CHECK-NEXT: add x8, x0, #8 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: ldr q2, [x12, :lo12:.LCPI5_0] +; CHECK-NEXT: mov x12, x11 +; CHECK-NEXT: sshll v3.8h, v3.8b, #0 +; CHECK-NEXT: .LBB5_5: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldp d4, d5, [x8, #-8] +; CHECK-NEXT: subs x12, x12, #16 +; CHECK-NEXT: add x8, x8, #16 +; CHECK-NEXT: tbl v4.16b, { v4.16b }, v2.16b +; CHECK-NEXT: tbl v5.16b, { v5.16b }, v2.16b +; CHECK-NEXT: mla v0.8h, v3.8h, v4.8h +; CHECK-NEXT: mla v1.8h, v3.8h, v5.8h +; CHECK-NEXT: b.ne .LBB5_5 ; CHECK-NEXT: // %bb.6: // %middle.block ; CHECK-NEXT: add v0.8h, v1.8h, v0.8h ; CHECK-NEXT: cmp x11, x10 @@ -615,20 +617,24 @@ define void @sink_v8z16_0(i32 *%p, i32 *%d, i64 %n, <16 x i8> %a) { ; CHECK-LABEL: sink_v8z16_0: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: dup v0.8b, v0.b[0] -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: .LBB8_1: // %loop -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: add x8, x8, #8 -; CHECK-NEXT: subs x2, x2, #8 -; CHECK-NEXT: umull v1.8h, v1.8b, v0.8b -; CHECK-NEXT: cmlt v1.8h, v1.8h, #0 -; CHECK-NEXT: xtn 
v1.8b, v1.8h -; CHECK-NEXT: str d1, [x0], #32 -; CHECK-NEXT: b.ne .LBB8_1 -; CHECK-NEXT: // %bb.2: // %exit -; CHECK-NEXT: ret +;CHECK-NEXT: adrp x9, .LCPI8_0 +;CHECK-NEXT: dup v1.8b, v0.b[0] +;CHECK-NEXT: mov x8, xzr +;CHECK-NEXT: ldr q0, [x9, :lo12:.LCPI8_0] +;CHECK-NEXT: ushll v1.8h, v1.8b, #0 +;CHECK-NEXT: .LBB8_1: // %loop +;CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +;CHECK-NEXT: ldr d2, [x0] +;CHECK-NEXT: add x8, x8, #8 +;CHECK-NEXT: subs x2, x2, #8 +;CHECK-NEXT: tbl v2.16b, { v2.16b }, v0.16b +;CHECK-NEXT: mul v2.8h, v2.8h, v1.8h +;CHECK-NEXT: cmlt v2.8h, v2.8h, #0 +;CHECK-NEXT: xtn v2.8b, v2.8h +;CHECK-NEXT: str d2, [x0], #32 +;CHECK-NEXT: b.ne .LBB8_1 +;CHECK-NEXT: // %bb.2: // %exit +;CHECK-NEXT: ret entry: %ext = zext <16 x i8> %a to <16 x i16> %broadcast.splat = shufflevector <16 x i16> %ext, <16 x i16> poison, <8 x i32> diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -503,42 +503,126 @@ ret void } +; CHECK-LABEL: lCPI5_0: +; CHECK-NEXT: .byte 0 ; 0x0 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 1 ; 0x1 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 2 ; 0x2 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 3 ; 0x3 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 4 ; 0x4 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 5 ; 0x5 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 6 ; 0x6 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 7 ; 0x7 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI5_1: +; CHECK-NEXT: .byte 8 ; 0x8 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 9 ; 0x9 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 10 ; 0xa +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 11 ; 0xb +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 12 ; 0xc +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 13 ; 0xd +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: 
.byte 14 ; 0xe +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 15 ; 0xf +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-BE-LABEL: .LCPI5_0: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 0 // 0x0 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 1 // 0x1 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 2 // 0x2 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 3 // 0x3 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 4 // 0x4 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 5 // 0x5 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 6 // 0x6 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 7 // 0x7 +; CHECK-BE-NEXT: .LCPI5_1: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 8 // 0x8 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 9 // 0x9 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 10 // 0xa +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 11 // 0xb +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 12 // 0xc +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 13 // 0xd +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 14 // 0xe +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 15 // 0xf define void @zext_v16i8_to_v16i16_in_loop(i8* %src, i16* %dst) { ; CHECK-LABEL: zext_v16i8_to_v16i16_in_loop: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: LBB5_1: ; %loop -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q0, [x0, x8] -; CHECK-NEXT: add x8, x8, #16 -; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: ushll2.8h v1, v0, #0 -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: stp q0, q1, [x1], #32 -; CHECK-NEXT: b.ne LBB5_1 -; CHECK-NEXT: ; %bb.2: ; %exit -; CHECK-NEXT: ret +; CHECK-NEXT: Lloh8: +; CHECK-NEXT: adrp x9, lCPI5_0@PAGE +; CHECK-NEXT: Lloh9: +; CHECK-NEXT: adrp x10, lCPI5_1@PAGE +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: Lloh10: +; CHECK-NEXT: ldr q0, [x9, 
lCPI5_0@PAGEOFF] +; CHECK-NEXT: Lloh11: +; CHECK-NEXT: ldr q1, [x10, lCPI5_1@PAGEOFF] +; CHECK-NEXT: LBB5_1: ; %loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr q2, [x0, x8] +; CHECK-NEXT: add x8, x8, #16 +; CHECK-NEXT: cmp x8, #128 +; CHECK-NEXT: tbl.16b v3, { v2 }, v1 +; CHECK-NEXT: tbl.16b v2, { v2 }, v0 +; CHECK-NEXT: stp q2, q3, [x1], #32 +; CHECK-NEXT: b.ne LBB5_1 +; CHECK-NEXT: ; %bb.2: ; %exit +; CHECK-NEXT: ret + ; ; CHECK-BE-LABEL: zext_v16i8_to_v16i16_in_loop: ; CHECK-BE: // %bb.0: // %entry -; CHECK-BE-NEXT: mov x8, xzr -; CHECK-BE-NEXT: .LBB5_1: // %loop -; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: add x9, x0, x8 -; CHECK-BE-NEXT: add x8, x8, #16 -; CHECK-BE-NEXT: cmp x8, #128 -; CHECK-BE-NEXT: ld1 { v0.16b }, [x9] -; CHECK-BE-NEXT: add x9, x1, #16 -; CHECK-BE-NEXT: ushll v1.8h, v0.8b, #0 -; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-BE-NEXT: st1 { v1.8h }, [x1] -; CHECK-BE-NEXT: add x1, x1, #32 -; CHECK-BE-NEXT: st1 { v0.8h }, [x9] -; CHECK-BE-NEXT: b.ne .LBB5_1 -; CHECK-BE-NEXT: // %bb.2: // %exit -; CHECK-BE-NEXT: ret +; CHECK-BE-NEXT: adrp x8, .LCPI5_0 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI5_0 +; CHECK-BE-NEXT: ld1 { v0.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI5_1 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI5_1 +; CHECK-BE-NEXT: ld1 { v1.16b }, [x8] +; CHECK-BE-NEXT: mov x8, xzr +; CHECK-BE-NEXT: .LBB5_1: // %loop +; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-BE-NEXT: add x9, x0, x8 +; CHECK-BE-NEXT: add x8, x8, #16 +; CHECK-BE-NEXT: cmp x8, #128 +; CHECK-BE-NEXT: ld1 { v2.16b }, [x9] +; CHECK-BE-NEXT: add x9, x1, #16 +; CHECK-BE-NEXT: tbl v3.16b, { v2.16b }, v0.16b +; CHECK-BE-NEXT: tbl v2.16b, { v2.16b }, v1.16b +; CHECK-BE-NEXT: st1 { v3.16b }, [x1] +; CHECK-BE-NEXT: add x1, x1, #32 +; CHECK-BE-NEXT: st1 { v2.16b }, [x9] +; CHECK-BE-NEXT: b.ne .LBB5_1 +; CHECK-BE-NEXT: // %bb.2: // %exit +; CHECK-BE-NEXT: ret + entry: br label %loop @@ -632,14 +716,14 @@ define void 
@zext_v8i8_to_v8i32_in_loop(i8* %src, i32* %dst) { ; CHECK-LABEL: zext_v8i8_to_v8i32_in_loop: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh8: +; CHECK-NEXT: Lloh12: ; CHECK-NEXT: adrp x9, lCPI6_0@PAGE -; CHECK-NEXT: Lloh9: +; CHECK-NEXT: Lloh13: ; CHECK-NEXT: adrp x10, lCPI6_1@PAGE ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: Lloh10: +; CHECK-NEXT: Lloh14: ; CHECK-NEXT: ldr q0, [x9, lCPI6_0@PAGEOFF] -; CHECK-NEXT: Lloh11: +; CHECK-NEXT: Lloh15: ; CHECK-NEXT: ldr q1, [x10, lCPI6_1@PAGEOFF] ; CHECK-NEXT: LBB6_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -652,8 +736,8 @@ ; CHECK-NEXT: b.ne LBB6_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret -; CHECK-NEXT: .loh AdrpLdr Lloh9, Lloh11 -; CHECK-NEXT: .loh AdrpLdr Lloh8, Lloh10 +; CHECK-NEXT: .loh AdrpLdr Lloh13, Lloh15 +; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh14 ; ; CHECK-BE-LABEL: zext_v8i8_to_v8i32_in_loop: ; CHECK-BE: // %bb.0: // %entry @@ -699,79 +783,412 @@ ret void } +; CHECK-LABEL: lCPI7_0: +; CHECK-NEXT: .byte 0 ; 0x0 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 1 ; 0x1 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI7_1: +; CHECK-NEXT: .byte 2 ; 0x2 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 3 ; 0x3 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 
255 ; 0xff +; CHECK-NEXT: lCPI7_2: +; CHECK-NEXT: .byte 4 ; 0x4 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 5 ; 0x5 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI7_3: +; CHECK-NEXT: .byte 6 ; 0x6 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 7 ; 0x7 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI7_4: +; CHECK-NEXT: .byte 8 ; 0x8 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 9 ; 0x9 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI7_5: +; CHECK-NEXT: .byte 10 ; 0xa +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 11 ; 0xb +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 
0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI7_6: +; CHECK-NEXT: .byte 12 ; 0xc +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 13 ; 0xd +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI7_7: +; CHECK-NEXT: .byte 14 ; 0xe +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 15 ; 0xf +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff + +; CHECK-BE-LABEL: .LCPI7_0: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 0 // 0x0 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 1 // 0x1 +; CHECK-BE-NEXT: .LCPI7_1: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 
// 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 2 // 0x2 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 3 // 0x3 +; CHECK-BE-NEXT: .LCPI7_2: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 4 // 0x4 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 5 // 0x5 +; CHECK-BE-NEXT: .LCPI7_3: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 6 // 0x6 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 7 // 0x7 +; CHECK-BE-NEXT: .LCPI7_4: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 8 // 0x8 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: 
.byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 9 // 0x9 +; CHECK-BE-NEXT: .LCPI7_5: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 10 // 0xa +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 11 // 0xb +; CHECK-BE-NEXT: .LCPI7_6: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 12 // 0xc +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 13 // 0xd +; CHECK-BE-NEXT: .LCPI7_7: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 14 // 0xe +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 15 // 0xf + 
define void @zext_v16i8_to_v16i64_in_loop(i8* %src, i64* %dst) { ; CHECK-LABEL: zext_v16i8_to_v16i64_in_loop: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: LBB7_1: ; %loop -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q0, [x0, x8] -; CHECK-NEXT: add x8, x8, #16 -; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: ushll.8h v1, v0, #0 -; CHECK-NEXT: ushll2.8h v0, v0, #0 -; CHECK-NEXT: ushll2.4s v2, v1, #0 -; CHECK-NEXT: ushll2.4s v3, v0, #0 -; CHECK-NEXT: ushll.4s v0, v0, #0 -; CHECK-NEXT: ushll2.2d v4, v3, #0 -; CHECK-NEXT: ushll2.2d v5, v0, #0 -; CHECK-NEXT: ushll.2d v0, v0, #0 -; CHECK-NEXT: ushll.2d v3, v3, #0 -; CHECK-NEXT: stp q0, q5, [x1, #64] -; CHECK-NEXT: ushll.4s v0, v1, #0 -; CHECK-NEXT: stp q3, q4, [x1, #96] -; CHECK-NEXT: ushll2.2d v3, v2, #0 -; CHECK-NEXT: ushll.2d v2, v2, #0 -; CHECK-NEXT: ushll2.2d v1, v0, #0 -; CHECK-NEXT: ushll.2d v0, v0, #0 -; CHECK-NEXT: stp q2, q3, [x1, #32] -; CHECK-NEXT: stp q0, q1, [x1], #128 -; CHECK-NEXT: b.ne LBB7_1 -; CHECK-NEXT: ; %bb.2: ; %exit -; CHECK-NEXT: ret -; +; CHECK-NEXT: Lloh16: +; CHECK-NEXT: adrp x9, lCPI7_0@PAGE +; CHECK-NEXT: Lloh17: +; CHECK-NEXT: adrp x10, lCPI7_1@PAGE +; CHECK-NEXT: Lloh18: +; CHECK-NEXT: adrp x11, lCPI7_2@PAGE +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: Lloh19: +; CHECK-NEXT: ldr q0, [x9, lCPI7_0@PAGEOFF] +; CHECK-NEXT: Lloh20: +; CHECK-NEXT: adrp x9, lCPI7_3@PAGE +; CHECK-NEXT: Lloh21: +; CHECK-NEXT: ldr q1, [x10, lCPI7_1@PAGEOFF] +; CHECK-NEXT: Lloh22: +; CHECK-NEXT: adrp x10, lCPI7_4@PAGE +; CHECK-NEXT: Lloh23: +; CHECK-NEXT: ldr q2, [x11, lCPI7_2@PAGEOFF] +; CHECK-NEXT: Lloh24: +; CHECK-NEXT: adrp x11, lCPI7_5@PAGE +; CHECK-NEXT: Lloh25: +; CHECK-NEXT: ldr q3, [x9, lCPI7_3@PAGEOFF] +; CHECK-NEXT: Lloh26: +; CHECK-NEXT: adrp x9, lCPI7_6@PAGE +; CHECK-NEXT: Lloh27: +; CHECK-NEXT: ldr q4, [x10, lCPI7_4@PAGEOFF] +; CHECK-NEXT: Lloh28: +; CHECK-NEXT: adrp x10, lCPI7_7@PAGE +; CHECK-NEXT: Lloh29: +; CHECK-NEXT: ldr q5, [x11, lCPI7_5@PAGEOFF] +; 
CHECK-NEXT: Lloh30: +; CHECK-NEXT: ldr q6, [x9, lCPI7_6@PAGEOFF] +; CHECK-NEXT: Lloh31: +; CHECK-NEXT: ldr q7, [x10, lCPI7_7@PAGEOFF] +; CHECK-NEXT: LBB7_1: ; %loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr q16, [x0, x8] +; CHECK-NEXT: add x8, x8, #16 +; CHECK-NEXT: cmp x8, #128 +; CHECK-NEXT: tbl.16b v17, { v16 }, v7 +; CHECK-NEXT: tbl.16b v18, { v16 }, v6 +; CHECK-NEXT: tbl.16b v19, { v16 }, v5 +; CHECK-NEXT: tbl.16b v20, { v16 }, v4 +; CHECK-NEXT: tbl.16b v21, { v16 }, v3 +; CHECK-NEXT: stp q18, q17, [x1, #96] +; CHECK-NEXT: tbl.16b v17, { v16 }, v2 +; CHECK-NEXT: tbl.16b v18, { v16 }, v1 +; CHECK-NEXT: stp q20, q19, [x1, #64] +; CHECK-NEXT: tbl.16b v16, { v16 }, v0 +; CHECK-NEXT: stp q17, q21, [x1, #32] +; CHECK-NEXT: stp q16, q18, [x1], #128 +; CHECK-NEXT: b.ne LBB7_1 +; CHECK-NEXT: ; %bb.2: ; %exit +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh28, Lloh31 +; CHECK-NEXT: .loh AdrpLdr Lloh26, Lloh30 +; CHECK-NEXT: .loh AdrpLdr Lloh24, Lloh29 +; CHECK-NEXT: .loh AdrpAdrp Lloh22, Lloh28 +; CHECK-NEXT: .loh AdrpLdr Lloh22, Lloh27 +; CHECK-NEXT: .loh AdrpAdrp Lloh20, Lloh26 +; CHECK-NEXT: .loh AdrpLdr Lloh20, Lloh25 +; CHECK-NEXT: .loh AdrpAdrp Lloh18, Lloh24 +; CHECK-NEXT: .loh AdrpLdr Lloh18, Lloh23 +; CHECK-NEXT: .loh AdrpAdrp Lloh17, Lloh22 +; CHECK-NEXT: .loh AdrpLdr Lloh17, Lloh21 +; CHECK-NEXT: .loh AdrpAdrp Lloh16, Lloh20 +; CHECK-NEXT: .loh AdrpLdr Lloh16, Lloh19 + + ; CHECK-BE-LABEL: zext_v16i8_to_v16i64_in_loop: ; CHECK-BE: // %bb.0: // %entry -; CHECK-BE-NEXT: mov x8, xzr -; CHECK-BE-NEXT: .LBB7_1: // %loop -; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: add x9, x0, x8 -; CHECK-BE-NEXT: add x10, x1, #96 -; CHECK-BE-NEXT: add x8, x8, #16 -; CHECK-BE-NEXT: cmp x8, #128 -; CHECK-BE-NEXT: ld1 { v0.16b }, [x9] -; CHECK-BE-NEXT: add x9, x1, #112 -; CHECK-BE-NEXT: ushll2 v1.8h, v0.16b, #0 -; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-BE-NEXT: ushll2 v2.4s, v1.8h, #0 -; CHECK-BE-NEXT: ushll v1.4s, 
v1.4h, #0 -; CHECK-BE-NEXT: ushll2 v3.2d, v2.4s, #0 -; CHECK-BE-NEXT: ushll v2.2d, v2.2s, #0 -; CHECK-BE-NEXT: st1 { v3.2d }, [x9] -; CHECK-BE-NEXT: add x9, x1, #80 -; CHECK-BE-NEXT: ushll2 v3.2d, v1.4s, #0 -; CHECK-BE-NEXT: st1 { v2.2d }, [x10] -; CHECK-BE-NEXT: ushll2 v2.4s, v0.8h, #0 -; CHECK-BE-NEXT: add x10, x1, #48 -; CHECK-BE-NEXT: st1 { v3.2d }, [x9] -; CHECK-BE-NEXT: add x9, x1, #64 -; CHECK-BE-NEXT: ushll v1.2d, v1.2s, #0 -; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-BE-NEXT: ushll2 v4.2d, v2.4s, #0 -; CHECK-BE-NEXT: st1 { v1.2d }, [x9] -; CHECK-BE-NEXT: ushll v1.2d, v0.2s, #0 -; CHECK-BE-NEXT: add x9, x1, #16 -; CHECK-BE-NEXT: st1 { v4.2d }, [x10] -; CHECK-BE-NEXT: add x10, x1, #32 -; CHECK-BE-NEXT: st1 { v1.2d }, [x1] -; CHECK-BE-NEXT: add x1, x1, #128 -; CHECK-BE-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-BE-NEXT: ushll v2.2d, v2.2s, #0 -; CHECK-BE-NEXT: st1 { v0.2d }, [x9] -; CHECK-BE-NEXT: st1 { v2.2d }, [x10] -; CHECK-BE-NEXT: b.ne .LBB7_1 -; CHECK-BE-NEXT: // %bb.2: // %exit -; CHECK-BE-NEXT: ret +; CHECK-BE-NEXT: adrp x8, .LCPI7_0 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI7_0 +; CHECK-BE-NEXT: ld1 { v0.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI7_1 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI7_1 +; CHECK-BE-NEXT: ld1 { v1.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI7_2 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI7_2 +; CHECK-BE-NEXT: ld1 { v2.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI7_3 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI7_3 +; CHECK-BE-NEXT: ld1 { v3.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI7_4 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI7_4 +; CHECK-BE-NEXT: ld1 { v4.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI7_5 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI7_5 +; CHECK-BE-NEXT: ld1 { v5.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI7_6 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI7_6 +; CHECK-BE-NEXT: ld1 { v6.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI7_7 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI7_7 +; CHECK-BE-NEXT: ld1 { v7.16b }, [x8] +; 
CHECK-BE-NEXT: mov x8, xzr +; CHECK-BE-NEXT: .LBB7_1: // %loop +; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-BE-NEXT: add x9, x0, x8 +; CHECK-BE-NEXT: add x10, x1, #96 +; CHECK-BE-NEXT: add x8, x8, #16 +; CHECK-BE-NEXT: cmp x8, #128 +; CHECK-BE-NEXT: ld1 { v16.16b }, [x9] +; CHECK-BE-NEXT: add x9, x1, #112 +; CHECK-BE-NEXT: tbl v17.16b, { v16.16b }, v7.16b +; CHECK-BE-NEXT: tbl v18.16b, { v16.16b }, v6.16b +; CHECK-BE-NEXT: tbl v19.16b, { v16.16b }, v5.16b +; CHECK-BE-NEXT: st1 { v17.16b }, [x9] +; CHECK-BE-NEXT: add x9, x1, #80 +; CHECK-BE-NEXT: tbl v17.16b, { v16.16b }, v4.16b +; CHECK-BE-NEXT: st1 { v18.16b }, [x10] +; CHECK-BE-NEXT: add x10, x1, #64 +; CHECK-BE-NEXT: st1 { v19.16b }, [x9] +; CHECK-BE-NEXT: add x9, x1, #48 +; CHECK-BE-NEXT: tbl v18.16b, { v16.16b }, v3.16b +; CHECK-BE-NEXT: tbl v19.16b, { v16.16b }, v0.16b +; CHECK-BE-NEXT: st1 { v17.16b }, [x10] +; CHECK-BE-NEXT: tbl v17.16b, { v16.16b }, v2.16b +; CHECK-BE-NEXT: add x10, x1, #32 +; CHECK-BE-NEXT: tbl v16.16b, { v16.16b }, v1.16b +; CHECK-BE-NEXT: st1 { v18.16b }, [x9] +; CHECK-BE-NEXT: add x9, x1, #16 +; CHECK-BE-NEXT: st1 { v19.16b }, [x1] +; CHECK-BE-NEXT: add x1, x1, #128 +; CHECK-BE-NEXT: st1 { v17.16b }, [x10] +; CHECK-BE-NEXT: st1 { v16.16b }, [x9] +; CHECK-BE-NEXT: b.ne .LBB7_1 +; CHECK-BE-NEXT: // %bb.2: // %exit +; CHECK-BE-NEXT: ret + entry: br label %loop diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll @@ -76,10 +76,11 @@ ; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 [[IV]] ; CHECK-NEXT: [[SRC_GEP_CAST:%.*]] = bitcast i8* [[SRC_GEP]] to <16 x i8>* ; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[SRC_GEP_CAST]], align 16 -; CHECK-NEXT: [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i16> +; CHECK-NEXT: 
[[TMP0:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> , <32 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i8> [[TMP0]] to <16 x i16> ; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr i16, i16* [[DST:%.*]], i64 [[IV]] ; CHECK-NEXT: [[DST_GEP_CAST:%.*]] = bitcast i16* [[DST_GEP]] to <16 x i16>* -; CHECK-NEXT: store <16 x i16> [[EXT]], <16 x i16>* [[DST_GEP_CAST]], align 32 +; CHECK-NEXT: store <16 x i16> [[TMP1]], <16 x i16>* [[DST_GEP_CAST]], align 32 ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128 ; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] @@ -150,15 +151,17 @@ ; CHECK-LABEL: @zext_v16i8_to_v16i64_in_loop( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: + +; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 [[IV]] ; CHECK-NEXT: [[SRC_GEP_CAST:%.*]] = bitcast i8* [[SRC_GEP]] to <16 x i8>* ; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[SRC_GEP_CAST]], align 16 -; CHECK-NEXT: [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i64> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> , <128 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <128 x i8> [[TMP0]] to <16 x i64> ; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr i64, i64* [[DST:%.*]], i64 [[IV]] ; CHECK-NEXT: [[DST_GEP_CAST:%.*]] = bitcast i64* [[DST_GEP]] to <16 x i64>* -; CHECK-NEXT: store <16 x i64> [[EXT]], <16 x i64>* [[DST_GEP_CAST]], align 128 +; CHECK-NEXT: store <16 x i64> [[TMP1]], <16 x i64>* [[DST_GEP_CAST]], align 128 ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128 ; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]