diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2672,6 +2672,8 @@ return false; } + virtual bool shouldReplaceZExtWithShuffle(ZExtInst *I) const { return false; } + /// Return true if the target supplies and combines to a paired load /// two loaded values of type LoadedType next to each other in memory. /// RequiredAlignment gives the minimal alignment constraints that must be met diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -6234,6 +6234,7 @@ return true; TPT.rollback(LastKnownGood); + return false; } @@ -6310,6 +6311,34 @@ bool CodeGenPrepare::optimizeExtUses(Instruction *I) { BasicBlock *DefBB = I->getParent(); + ZExtInst *ZExt = dyn_cast<ZExtInst>(I); + // Try to lower zext v16i8 as a shuffle, if it is profitable for the target, + // like on AArch64 where such shuffles can be lowered directly using tbl + // instructions. 
+ if (ZExt && TLI->shouldReplaceZExtWithShuffle(ZExt) && + LI->getLoopFor(ZExt->getParent())) { + Value *Op = ZExt->getOperand(0); + auto *SrcTy = dyn_cast<FixedVectorType>(Op->getType()); + auto *DstTy = dyn_cast<FixedVectorType>(ZExt->getType()); + if (SrcTy && SrcTy->getNumElements() == 16 && + SrcTy->getElementType()->isIntegerTy(8) && + DstTy->getElementType()->isIntegerTy(32)) { + + IRBuilder<> Builder(ZExt); + SmallVector<int> Mask(64, 16); + for (unsigned i = 0; i < 16; i++) + Mask[i * 4] = i; + + auto *FirstEltZero = Builder.CreateInsertElement( + PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0)); + Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask); + Result = Builder.CreateBitCast(Result, DstTy); + ZExt->replaceAllUsesWith(Result); + ZExt->eraseFromParent(); + return true; + } + } + // If the result of a {s|z}ext and its source are both live out, rewrite all // other uses of the source with result of extension. Value *Src = I->getOperand(0); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -587,6 +587,8 @@ bool shouldSinkOperands(Instruction *I, SmallVectorImpl<Use *> &Ops) const override; + bool shouldReplaceZExtWithShuffle(ZExtInst *I) const override; + bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override; unsigned getMaxSupportedInterleaveFactor() const override { return 4; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -12347,6 +12347,14 @@ return false; } +bool AArch64TargetLowering::shouldReplaceZExtWithShuffle(ZExtInst *I) const { + auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType()); + auto *DstTy = dyn_cast<FixedVectorType>(I->getType()); + return SrcTy && SrcTy->getNumElements() == 16 && + 
SrcTy->getElementType()->isIntegerTy(8) && + DstTy->getElementType()->isIntegerTy(32); +} + bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const { if (!LoadedType.isSimple() || diff --git a/llvm/test/CodeGen/AArch64/vselect-ext.ll b/llvm/test/CodeGen/AArch64/vselect-ext.ll --- a/llvm/test/CodeGen/AArch64/vselect-ext.ll +++ b/llvm/test/CodeGen/AArch64/vselect-ext.ll @@ -192,32 +192,54 @@ define void @extension_in_loop_v16i8_to_v16i32(i8* %src, i32* %dst) { ; CHECK-LABEL: extension_in_loop_v16i8_to_v16i32: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: movi.2d v0, #0x0000ff000000ff +; CHECK-NEXT: Lloh0: +; CHECK-NEXT: adrp x9, lCPI7_0@PAGE +; CHECK-NEXT: Lloh1: +; CHECK-NEXT: adrp x10, lCPI7_1@PAGE +; CHECK-NEXT: Lloh2: +; CHECK-NEXT: adrp x11, lCPI7_2@PAGE +; CHECK-NEXT: Lloh3: +; CHECK-NEXT: adrp x12, lCPI7_3@PAGE +; CHECK-NEXT: movi.2d v2, #0000000000000000 ; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: movi.2d v4, #0xffffffffffffffff +; CHECK-NEXT: Lloh4: +; CHECK-NEXT: ldr q0, [x9, lCPI7_0@PAGEOFF] +; CHECK-NEXT: Lloh5: +; CHECK-NEXT: ldr q3, [x10, lCPI7_1@PAGEOFF] +; CHECK-NEXT: Lloh6: +; CHECK-NEXT: ldr q5, [x11, lCPI7_2@PAGEOFF] +; CHECK-NEXT: Lloh7: +; CHECK-NEXT: ldr q6, [x12, lCPI7_3@PAGEOFF] ; CHECK-NEXT: LBB7_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr q1, [x0, x8] ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: ushll2.8h v2, v1, #0 -; CHECK-NEXT: ushll.8h v1, v1, #0 -; CHECK-NEXT: ushll2.4s v3, v2, #0 -; CHECK-NEXT: ushll.4s v2, v2, #0 -; CHECK-NEXT: cmhi.4s v5, v0, v3 -; CHECK-NEXT: cmhi.4s v6, v0, v2 -; CHECK-NEXT: ushll2.4s v4, v1, #0 -; CHECK-NEXT: ushll.4s v1, v1, #0 -; CHECK-NEXT: and.16b v3, v3, v5 -; CHECK-NEXT: and.16b v2, v2, v6 -; CHECK-NEXT: cmhi.4s v7, v0, v4 -; CHECK-NEXT: stp q2, q3, [x1, #32] -; CHECK-NEXT: cmhi.4s v3, v0, v1 -; CHECK-NEXT: and.16b v2, v4, v7 -; CHECK-NEXT: and.16b v1, v1, v3 -; CHECK-NEXT: stp q1, q2, [x1], #64 +; CHECK-NEXT: cmgt.16b 
v7, v1, v4 +; CHECK-NEXT: tbl.16b v16, { v1, v2 }, v0 +; CHECK-NEXT: tbl.16b v17, { v1, v2 }, v3 +; CHECK-NEXT: sshll2.8h v20, v7, #0 +; CHECK-NEXT: tbl.16b v18, { v1, v2 }, v5 +; CHECK-NEXT: sshll2.4s v21, v20, #0 +; CHECK-NEXT: sshll.4s v20, v20, #0 +; CHECK-NEXT: tbl.16b v19, { v1, v2 }, v6 +; CHECK-NEXT: sshll.8h v7, v7, #0 +; CHECK-NEXT: and.16b v16, v16, v21 +; CHECK-NEXT: and.16b v17, v17, v20 +; CHECK-NEXT: stp q17, q16, [x1, #32] +; CHECK-NEXT: sshll2.4s v16, v7, #0 +; CHECK-NEXT: sshll.4s v7, v7, #0 +; CHECK-NEXT: and.16b v16, v18, v16 +; CHECK-NEXT: and.16b v7, v19, v7 +; CHECK-NEXT: stp q7, q16, [x1], #64 ; CHECK-NEXT: b.ne LBB7_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh7 +; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh6 +; CHECK-NEXT: .loh AdrpLdr Lloh1, Lloh5 +; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh4 entry: br label %loop @@ -243,24 +265,24 @@ define void @extension_in_loop_as_shuffle_v16i8_to_v16i32(i8* %src, i32* %dst) { ; CHECK-LABEL: extension_in_loop_as_shuffle_v16i8_to_v16i32: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh0: +; CHECK-NEXT: Lloh8: ; CHECK-NEXT: adrp x9, lCPI8_0@PAGE -; CHECK-NEXT: Lloh1: +; CHECK-NEXT: Lloh9: ; CHECK-NEXT: adrp x10, lCPI8_1@PAGE -; CHECK-NEXT: Lloh2: +; CHECK-NEXT: Lloh10: ; CHECK-NEXT: adrp x11, lCPI8_2@PAGE -; CHECK-NEXT: Lloh3: +; CHECK-NEXT: Lloh11: ; CHECK-NEXT: adrp x12, lCPI8_3@PAGE ; CHECK-NEXT: movi.2d v1, #0xffffffffffffffff ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: movi.2d v3, #0000000000000000 -; CHECK-NEXT: Lloh4: +; CHECK-NEXT: Lloh12: ; CHECK-NEXT: ldr q0, [x9, lCPI8_0@PAGEOFF] -; CHECK-NEXT: Lloh5: +; CHECK-NEXT: Lloh13: ; CHECK-NEXT: ldr q2, [x10, lCPI8_1@PAGEOFF] -; CHECK-NEXT: Lloh6: +; CHECK-NEXT: Lloh14: ; CHECK-NEXT: ldr q5, [x11, lCPI8_2@PAGEOFF] -; CHECK-NEXT: Lloh7: +; CHECK-NEXT: Lloh15: ; CHECK-NEXT: ldr q6, [x12, lCPI8_3@PAGEOFF] ; CHECK-NEXT: LBB8_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -287,10 +309,10 @@ ; CHECK-NEXT: b.ne 
LBB8_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret -; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh7 -; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh6 -; CHECK-NEXT: .loh AdrpLdr Lloh1, Lloh5 -; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh4 +; CHECK-NEXT: .loh AdrpLdr Lloh11, Lloh15 +; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh14 +; CHECK-NEXT: .loh AdrpLdr Lloh9, Lloh13 +; CHECK-NEXT: .loh AdrpLdr Lloh8, Lloh12 entry: br label %loop @@ -317,24 +339,24 @@ define void @shuffle_in_loop_is_no_extend_v16i8_to_v16i32(i8* %src, i32* %dst) { ; CHECK-LABEL: shuffle_in_loop_is_no_extend_v16i8_to_v16i32: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh8: +; CHECK-NEXT: Lloh16: ; CHECK-NEXT: adrp x9, lCPI9_0@PAGE -; CHECK-NEXT: Lloh9: +; CHECK-NEXT: Lloh17: ; CHECK-NEXT: adrp x10, lCPI9_1@PAGE -; CHECK-NEXT: Lloh10: +; CHECK-NEXT: Lloh18: ; CHECK-NEXT: adrp x11, lCPI9_2@PAGE -; CHECK-NEXT: Lloh11: +; CHECK-NEXT: Lloh19: ; CHECK-NEXT: adrp x12, lCPI9_3@PAGE ; CHECK-NEXT: movi.2d v2, #0000000000000000 ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: movi.2d v5, #0xffffffffffffffff -; CHECK-NEXT: Lloh12: +; CHECK-NEXT: Lloh20: ; CHECK-NEXT: ldr q0, [x9, lCPI9_0@PAGEOFF] -; CHECK-NEXT: Lloh13: +; CHECK-NEXT: Lloh21: ; CHECK-NEXT: ldr q4, [x10, lCPI9_1@PAGEOFF] -; CHECK-NEXT: Lloh14: +; CHECK-NEXT: Lloh22: ; CHECK-NEXT: ldr q6, [x11, lCPI9_2@PAGEOFF] -; CHECK-NEXT: Lloh15: +; CHECK-NEXT: Lloh23: ; CHECK-NEXT: ldr q7, [x12, lCPI9_3@PAGEOFF] ; CHECK-NEXT: LBB9_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -362,10 +384,10 @@ ; CHECK-NEXT: b.ne LBB9_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret -; CHECK-NEXT: .loh AdrpLdr Lloh11, Lloh15 -; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh14 -; CHECK-NEXT: .loh AdrpLdr Lloh9, Lloh13 -; CHECK-NEXT: .loh AdrpLdr Lloh8, Lloh12 +; CHECK-NEXT: .loh AdrpLdr Lloh19, Lloh23 +; CHECK-NEXT: .loh AdrpLdr Lloh18, Lloh22 +; CHECK-NEXT: .loh AdrpLdr Lloh17, Lloh21 +; CHECK-NEXT: .loh AdrpLdr Lloh16, Lloh20 entry: br label %loop diff --git 
a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -9,23 +9,42 @@ define void @zext_v16i8_to_v16i32_in_loop(i8* %src, i32* %dst) { ; CHECK-LABEL: zext_v16i8_to_v16i32_in_loop: ; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: Lloh0: +; CHECK-NEXT: adrp x9, lCPI0_0@PAGE +; CHECK-NEXT: Lloh1: +; CHECK-NEXT: adrp x10, lCPI0_1@PAGE +; CHECK-NEXT: Lloh2: +; CHECK-NEXT: adrp x11, lCPI0_2@PAGE +; CHECK-NEXT: Lloh3: +; CHECK-NEXT: adrp x12, lCPI0_3@PAGE +; CHECK-NEXT: movi.2d v3, #0000000000000000 ; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: Lloh4: +; CHECK-NEXT: ldr q0, [x9, lCPI0_0@PAGEOFF] +; CHECK-NEXT: Lloh5: +; CHECK-NEXT: ldr q1, [x10, lCPI0_1@PAGEOFF] +; CHECK-NEXT: Lloh6: +; CHECK-NEXT: ldr q4, [x11, lCPI0_2@PAGEOFF] +; CHECK-NEXT: Lloh7: +; CHECK-NEXT: ldr q5, [x12, lCPI0_3@PAGEOFF] ; CHECK-NEXT: LBB0_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q0, [x0, x8] +; CHECK-NEXT: ldr q2, [x0, x8] ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: ushll2.8h v1, v0, #0 -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: ushll2.4s v2, v1, #0 -; CHECK-NEXT: ushll.4s v1, v1, #0 -; CHECK-NEXT: ushll2.4s v3, v0, #0 -; CHECK-NEXT: ushll.4s v0, v0, #0 -; CHECK-NEXT: stp q1, q2, [x1, #32] -; CHECK-NEXT: stp q0, q3, [x1], #64 +; CHECK-NEXT: tbl.16b v6, { v2, v3 }, v5 +; CHECK-NEXT: tbl.16b v7, { v2, v3 }, v4 +; CHECK-NEXT: tbl.16b v16, { v2, v3 }, v1 +; CHECK-NEXT: tbl.16b v17, { v2, v3 }, v0 +; CHECK-NEXT: stp q7, q6, [x1, #32] +; CHECK-NEXT: stp q17, q16, [x1], #64 ; CHECK-NEXT: b.ne LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh7 +; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh6 +; CHECK-NEXT: .loh AdrpLdr Lloh1, Lloh5 +; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh4 entry: br label %loop diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll 
b/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll @@ -15,10 +15,11 @@ ; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 [[IV]] ; CHECK-NEXT: [[SRC_GEP_CAST:%.*]] = bitcast i8* [[SRC_GEP]] to <16 x i8>* ; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[SRC_GEP_CAST]], align 16 -; CHECK-NEXT: [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <64 x i32> <i32 0, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 2, i32 16, i32 16, i32 16, i32 3, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 5, i32 16, i32 16, i32 16, i32 6, i32 16, i32 16, i32 16, i32 7, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 9, i32 16, i32 16, i32 16, i32 10, i32 16, i32 16, i32 16, i32 11, i32 16, i32 16, i32 16, i32 12, i32 16, i32 16, i32 16, i32 13, i32 16, i32 16, i32 16, i32 14, i32 16, i32 16, i32 16, i32 15, i32 16, i32 16, i32 16> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <64 x i8> [[TMP0]] to <16 x i32> ; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[IV]] ; CHECK-NEXT: [[DST_GEP_CAST:%.*]] = bitcast i32* [[DST_GEP]] to <16 x i32>* -; CHECK-NEXT: store <16 x i32> [[EXT]], <16 x i32>* [[DST_GEP_CAST]], align 64 +; CHECK-NEXT: store <16 x i32> [[TMP1]], <16 x i32>* [[DST_GEP_CAST]], align 64 ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128 ; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]