Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13401,29 +13401,66 @@
 static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
   IRBuilder<> Builder(TI);
   SmallVector<Value *> Parts;
+  unsigned NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
+  auto *SrcTy = dyn_cast<FixedVectorType>(TI->getOperand(0)->getType());
+  auto *DstTy = dyn_cast<FixedVectorType>(TI->getType());
+  assert(SrcTy->getElementType()->isIntegerTy() &&
+         "Non-integer type source vector element is not supported");
+  assert(DstTy->getElementType()->isIntegerTy(8) &&
+         "Unsupported destination vector element type");
+  unsigned SrcElemTySz =
+      dyn_cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
+  unsigned TruncFactor =
+      SrcElemTySz /
+      dyn_cast<IntegerType>(DstTy->getElementType())->getBitWidth();
+  assert((SrcElemTySz == 32 || SrcElemTySz == 64) &&
+         "Unsupported source vector element type size");
   Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
-  Parts.push_back(Builder.CreateBitCast(
-      Builder.CreateShuffleVector(TI->getOperand(0), {0, 1, 2, 3}), VecTy));
-  Parts.push_back(Builder.CreateBitCast(
-      Builder.CreateShuffleVector(TI->getOperand(0), {4, 5, 6, 7}), VecTy));
   Intrinsic::ID TblID = Intrinsic::aarch64_neon_tbl2;
-  unsigned NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
-  if (NumElements == 16) {
-    Parts.push_back(Builder.CreateBitCast(
-        Builder.CreateShuffleVector(TI->getOperand(0), {8, 9, 10, 11}), VecTy));
-    Parts.push_back(Builder.CreateBitCast(
-        Builder.CreateShuffleVector(TI->getOperand(0), {12, 13, 14, 15}),
-        VecTy));
-    TblID = Intrinsic::aarch64_neon_tbl4;
+
+  if (SrcElemTySz == 64 || (SrcElemTySz == 32 && NumElements == 16))
+    TblID = Intrinsic::aarch64_neon_tbl4;
+
+  switch (SrcElemTySz) {
+  case 32:
+    Parts.push_back(Builder.CreateBitCast(
+        Builder.CreateShuffleVector(TI->getOperand(0), {0, 1, 2, 3}), VecTy));
+    Parts.push_back(Builder.CreateBitCast(
+        Builder.CreateShuffleVector(TI->getOperand(0), {4, 5, 6, 7}), VecTy));
+    if (NumElements == 16) {
+      Parts.push_back(Builder.CreateBitCast(
+          Builder.CreateShuffleVector(TI->getOperand(0), {8, 9, 10, 11}),
+          VecTy));
+      Parts.push_back(Builder.CreateBitCast(
+          Builder.CreateShuffleVector(TI->getOperand(0), {12, 13, 14, 15}),
+          VecTy));
+    }
+    break;
+  case 64:
+    Parts.push_back(Builder.CreateBitCast(
+        Builder.CreateShuffleVector(TI->getOperand(0), {0, 1}), VecTy));
+    Parts.push_back(Builder.CreateBitCast(
+        Builder.CreateShuffleVector(TI->getOperand(0), {2, 3}), VecTy));
+    Parts.push_back(Builder.CreateBitCast(
+        Builder.CreateShuffleVector(TI->getOperand(0), {4, 5}), VecTy));
+    Parts.push_back(Builder.CreateBitCast(
+        Builder.CreateShuffleVector(TI->getOperand(0), {6, 7}), VecTy));
+    break;
   }
-  SmallVector<Constant *, 16> MaskConst;
-  for (unsigned Idx = 0; Idx < NumElements * 4; Idx += 4)
-    MaskConst.push_back(
-        ConstantInt::get(Builder.getInt8Ty(), IsLittleEndian ? Idx : Idx + 3));
-  for (unsigned Idx = NumElements * 4; Idx < 64; Idx += 4)
-    MaskConst.push_back(ConstantInt::get(Builder.getInt8Ty(), 255));
+
+  SmallVector<Constant *, 16> MaskConst;
+  for (unsigned Itr = 0; Itr < 16; Itr++) {
+    if (Itr < NumElements)
+      MaskConst.push_back(ConstantInt::get(
+          Builder.getInt8Ty(), IsLittleEndian
+                                   ? Itr * TruncFactor
+                                   : Itr * TruncFactor + (TruncFactor - 1)));
+    else
+      MaskConst.push_back(ConstantInt::get(Builder.getInt8Ty(), 255));
+  }
+
   Parts.push_back(ConstantVector::get(MaskConst));
   auto *F =
@@ -13495,13 +13532,17 @@
     return true;
   }
 
-  // Convert 'trunc <(8|16) x i32> %x to <(8|16) x i8>' to a single tbl.4
+  // Convert 'trunc <(8|16) x i32> %x to <(8|16) x i8>'
+  // or 'trunc <8 x i64> %x to <8 x i8>' to a single tbl.4
   // instruction selecting the lowest 8 bits per lane of the input interpreted
   // as 2 or 4 <4 x i32> vectors.
   auto *TI = dyn_cast<TruncInst>(I);
-  if (TI && (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
-      SrcTy->getElementType()->isIntegerTy(32) &&
-      DstTy->getElementType()->isIntegerTy(8)) {
+
+  if (TI && DstTy->getElementType()->isIntegerTy(8) &&
+      ((SrcTy->getElementType()->isIntegerTy(32) &&
+        (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16)) ||
+       (SrcTy->getElementType()->isIntegerTy(64) &&
+        SrcTy->getNumElements() == 8))) {
     createTblForTrunc(TI, Subtarget->isLittleEndian());
     return true;
   }
Index: llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
===================================================================
--- llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
+++ llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
@@ -314,50 +314,87 @@
   ret void
 }
 
+; CHECK-LABEL: lCPI4_0:
+; CHECK-NEXT:    .byte 0 ; 0x0
+; CHECK-NEXT:    .byte 8 ; 0x8
+; CHECK-NEXT:    .byte 16 ; 0x10
+; CHECK-NEXT:    .byte 24 ; 0x18
+; CHECK-NEXT:    .byte 32 ; 0x20
+; CHECK-NEXT:    .byte 40 ; 0x28
+; CHECK-NEXT:    .byte 48 ; 0x30
+; CHECK-NEXT:    .byte 56 ; 0x38
+; CHECK-NEXT:    .byte 255 ; 0xff
+; CHECK-NEXT:    .byte 255 ; 0xff
+; CHECK-NEXT:    .byte 255 ; 0xff
+; CHECK-NEXT:    .byte 255 ; 0xff
+; CHECK-NEXT:    .byte 255 ; 0xff
+; CHECK-NEXT:    .byte 255 ; 0xff
+; CHECK-NEXT:    .byte 255 ; 0xff
+; CHECK-NEXT:    .byte 255 ; 0xff
+
+; CHECK-BE-LABEL: .LCPI4_0:
+; CHECK-BE-NEXT:    .byte 7 // 0x7
+; CHECK-BE-NEXT:    .byte 15 // 0xf
+; CHECK-BE-NEXT:    .byte 23 // 0x17
+; CHECK-BE-NEXT:    .byte 31 // 0x1f
+; CHECK-BE-NEXT:    .byte 39 // 0x27
+; CHECK-BE-NEXT:    .byte 47 // 0x2f
+; CHECK-BE-NEXT:    .byte 55 // 0x37
+; CHECK-BE-NEXT:    .byte 63 // 0x3f
+; CHECK-BE-NEXT:    .byte 255 // 0xff
+; CHECK-BE-NEXT:    .byte 255 // 0xff
+; CHECK-BE-NEXT:    .byte 255 // 0xff
+; CHECK-BE-NEXT:    .byte 255 // 0xff
+; CHECK-BE-NEXT:    .byte 255 // 0xff
+; CHECK-BE-NEXT:    .byte 255 // 0xff
+; CHECK-BE-NEXT:    .byte 255 // 0xff
+; CHECK-BE-NEXT:    .byte 255 // 0xff
 define void @trunc_v8i64_to_v8i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-LABEL: trunc_v8i64_to_v8i8_in_loop:
-; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:  LBB4_1: ; %loop
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add x9, x0, x8, lsl #6
-; CHECK-NEXT:    ldp q1, q0, [x9, #32]
-; CHECK-NEXT:    ldp q3, q2, [x9]
-; CHECK-NEXT:    uzp1.4s v0, v1, v0
-; CHECK-NEXT:    uzp1.4s v1, v3, v2
-; CHECK-NEXT:    uzp1.8h v0, v1, v0
-; CHECK-NEXT:    xtn.8b v0, v0
-; CHECK-NEXT:    str d0, [x1, x8, lsl #3]
-; CHECK-NEXT:    add x8, x8, #1
-; CHECK-NEXT:    cmp x8, #1000
-; CHECK-NEXT:    b.eq LBB4_1
-; CHECK-NEXT:  ; %bb.2: ; %exit
-; CHECK-NEXT:    ret
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:  Lloh4:
+; CHECK-NEXT:    adrp x9, lCPI4_0@PAGE
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:  Lloh5:
+; CHECK-NEXT:    ldr q0, [x9, lCPI4_0@PAGEOFF]
+; CHECK-NEXT:  LBB4_1: ; %loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    add x9, x0, x8, lsl #6
+; CHECK-NEXT:    ldp q1, q2, [x9]
+; CHECK-NEXT:    ldp q3, q4, [x9, #32]
+; CHECK-NEXT:    tbl.16b v1, { v1, v2, v3, v4 }, v0
+; CHECK-NEXT:    str d1, [x1, x8, lsl #3]
+; CHECK-NEXT:    add x8, x8, #1
+; CHECK-NEXT:    cmp x8, #1000
+; CHECK-NEXT:    b.eq LBB4_1
+; CHECK-NEXT:  ; %bb.2: ; %exit
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh4, Lloh5
 ; CHECK-BE-LABEL: trunc_v8i64_to_v8i8_in_loop:
 ; CHECK-BE:       // %bb.0: // %entry
-; CHECK-BE-NEXT:    mov x8, xzr
-; CHECK-BE-NEXT:  .LBB4_1: // %loop
-; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT:    add x9, x0, x8, lsl #6
-; CHECK-BE-NEXT:    add x10, x9, #48
-; CHECK-BE-NEXT:    ld1 { v1.2d }, [x9]
-; CHECK-BE-NEXT:    ld1 { v0.2d }, [x10]
-; CHECK-BE-NEXT:    add x10, x9, #32
-; CHECK-BE-NEXT:    add x9, x9, #16
-; CHECK-BE-NEXT:    ld1 { v2.2d }, [x10]
-; CHECK-BE-NEXT:    ld1 { v3.2d }, [x9]
-; CHECK-BE-NEXT:    add x9, x1, x8, lsl #3
-; CHECK-BE-NEXT:    add x8, x8, #1
-; CHECK-BE-NEXT:    cmp x8, #1000
-; CHECK-BE-NEXT:    uzp1 v0.4s, v2.4s, v0.4s
-; CHECK-BE-NEXT:    uzp1 v1.4s, v1.4s, v3.4s
-; CHECK-BE-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
-; CHECK-BE-NEXT:    xtn v0.8b, v0.8h
-; CHECK-BE-NEXT:    st1 { v0.8b }, [x9]
-; CHECK-BE-NEXT:    b.eq .LBB4_1
+; CHECK-BE-NEXT:    adrp x8, .LCPI4_0
+; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI4_0
+; CHECK-BE-NEXT:    ld1 { v0.16b }, [x8]
+; CHECK-BE-NEXT:    mov x8, xzr
+; CHECK-BE-NEXT:  .LBB4_1: // %loop
+; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT:    add x9, x0, x8, lsl #6
+; CHECK-BE-NEXT:    add x10, x9, #16
+; CHECK-BE-NEXT:    add x11, x9, #32
+; CHECK-BE-NEXT:    ld1 { v1.16b }, [x9]
+; CHECK-BE-NEXT:    add x9, x9, #48
+; CHECK-BE-NEXT:    ld1 { v2.16b }, [x10]
+; CHECK-BE-NEXT:    ld1 { v3.16b }, [x11]
+; CHECK-BE-NEXT:    ld1 { v4.16b }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, x8, lsl #3
+; CHECK-BE-NEXT:    add x8, x8, #1
+; CHECK-BE-NEXT:    cmp x8, #1000
+; CHECK-BE-NEXT:    tbl v1.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v0.16b
+; CHECK-BE-NEXT:    st1 { v1.8b }, [x9]
+; CHECK-BE-NEXT:    b.eq .LBB4_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
 entry:
   br label %loop
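
Note for reviewers: below is a rough, hand-written sketch of the IR shape that createTblForTrunc is expected to produce for the new little-endian `trunc <8 x i64> %x to <8 x i8>` case, to make the mask/TruncFactor logic easier to follow. The function and value names are invented for this note and are not taken from the patch; the actual output comes from the IRBuilder calls above. The source is split into four <2 x i64> parts, each part is reinterpreted as a 16-byte tbl table register, and the mask 0, 8, 16, ..., 56 (TruncFactor = 8) picks the low byte of each lane, matching the lCPI4_0 constant checked in the test.

; Illustrative IR only; roughly what the transform emits on little-endian.
define <8 x i8> @trunc_v8i64_to_v8i8_sketch(<8 x i64> %x) {
entry:
  ; Split the <8 x i64> source into four <2 x i64> parts.
  %p0 = shufflevector <8 x i64> %x, <8 x i64> poison, <2 x i32> <i32 0, i32 1>
  %p1 = shufflevector <8 x i64> %x, <8 x i64> poison, <2 x i32> <i32 2, i32 3>
  %p2 = shufflevector <8 x i64> %x, <8 x i64> poison, <2 x i32> <i32 4, i32 5>
  %p3 = shufflevector <8 x i64> %x, <8 x i64> poison, <2 x i32> <i32 6, i32 7>
  ; Reinterpret each part as a 16-byte tbl table register.
  %t0 = bitcast <2 x i64> %p0 to <16 x i8>
  %t1 = bitcast <2 x i64> %p1 to <16 x i8>
  %t2 = bitcast <2 x i64> %p2 to <16 x i8>
  %t3 = bitcast <2 x i64> %p3 to <16 x i8>
  ; Byte mask 0, 8, 16, ..., 56 selects the low byte of every i64 lane;
  ; the unused upper lanes index out of range (-1 / 255) and become zero.
  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %t0, <16 x i8> %t1, <16 x i8> %t2, <16 x i8> %t3, <16 x i8> <i8 0, i8 8, i8 16, i8 24, i8 32, i8 40, i8 48, i8 56, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  ; Keep only the low 8 lanes to match the <8 x i8> destination type.
  %r = shufflevector <16 x i8> %tbl, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i8> %r
}

declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)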