Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -376,6 +376,9 @@
                       SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
     bool splitBranchCondition(Function &F);
     bool simplifyOffsetableRelocate(Instruction &I);
+
+    bool sinkOps(Instruction *I, std::initializer_list<Use *> Uses);
+    bool tryToSinkFreeOperands(Instruction *I);
   };
 
 } // end anonymous namespace
@@ -1735,6 +1738,7 @@
     InsertedInsts.insert(ExtVal);
     return true;
   }
+  case Intrinsic::launder_invariant_group:
   case Intrinsic::strip_invariant_group: {
     Value *ArgVal = II->getArgOperand(0);
 
@@ -5958,6 +5962,93 @@
   return MadeChange;
 }
 
+// Clone the instructions defining \p Uses into the block of \p I, directly
+// before \p I, so instruction selection can fold them into their user. The
+// originals are erased if sinking leaves them without uses.
+bool CodeGenPrepare::sinkOps(Instruction *I,
+                             std::initializer_list<Use *> Uses) {
+  BasicBlock *DefBB = I->getParent();
+
+  bool Changed = false;
+  SmallVector<Use *, 4> ToReplace;
+  for (Use *U : Uses) {
+    auto *UI = cast<Instruction>(U->get());
+    // Skip operands already in I's block, PHI nodes and instructions this
+    // pass inserted itself.
+    if (UI->getParent() == DefBB || isa<PHINode>(UI) || InsertedInsts.count(UI))
+      continue;
+    ToReplace.push_back(U);
+  }
+
+  SmallPtrSet<Instruction *, 4> MaybeDead;
+  for (Use *U : ToReplace) {
+    auto *UI = cast<Instruction>(U->get());
+    Instruction *NI = UI->clone();
+    MaybeDead.insert(UI);
+    LLVM_DEBUG(dbgs() << "Sinking " << *UI << " to user " << *I << "\n");
+    NI->insertBefore(I);
+    InsertedInsts.insert(NI);
+    U->set(NI);
+    Changed = true;
+  }
+
+  // Erase the original instructions if sinking made them dead.
+  if (!MaybeDead.empty())
+    for (auto *I : MaybeDead)
+      if (!I->hasNUsesOrMore(1))
+        I->eraseFromParent();
+
+  return Changed;
+}
+
+bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
+  if (!I->getType()->isVectorTy())
+    return false;
+
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    auto areAllUsesInSameBB = [](Value *V) {
+      BasicBlock *DefBB = cast<Instruction>(*V->use_begin())->getParent();
+      return all_of(V->uses(), [DefBB](Use &U) {
+        return cast<Instruction>(U.getUser())->getParent() == DefBB;
+      });
+    };
+
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::aarch64_neon_umull: {
+      auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
+        auto *FullVT = dyn_cast<VectorType>(FullV->getType());
+        auto *HalfVT = dyn_cast<VectorType>(HalfV->getType());
+        return FullVT->getBitWidth() == 2 * HalfVT->getBitWidth();
+      };
+
+      // Sink shufflevectors that extract one half of each operand vector;
+      // ISel can then select umull/umull2 directly on the wide registers.
+      Constant *M1, *M2;
+      Value *S1Op1, *S2Op1;
+      if (!match(II->getOperand(0),
+                 m_ShuffleVector(m_Value(S1Op1), m_Undef(), m_Constant(M1))) ||
+          !match(II->getOperand(1),
+                 m_ShuffleVector(m_Value(S2Op1), m_Undef(), m_Constant(M2))) ||
+          !areTypesHalfed(S1Op1, II->getOperand(0)) ||
+          !areTypesHalfed(S2Op1, II->getOperand(1)))
+        return false;
+      return sinkOps(II, {&II->getOperandUse(0), &II->getOperandUse(1)});
+    }
+    default:
+      return false;
+    }
+  }
+
+  switch (I->getOpcode()) {
+  case Instruction::Sub:
+  case Instruction::Add: {
+    auto areExtDoubled = [](Instruction *Ext) {
+      return Ext->getType()->getScalarSizeInBits() ==
+             2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
+    };
+
+    // Sink zext/sext operands that exactly double the element width; they
+    // can be folded into widening adds/subs, e.g. uaddl/usubl on AArch64.
+    if (!match(I->getOperand(0), m_ZExtOrSExt(m_Value())) ||
+        !match(I->getOperand(1), m_ZExtOrSExt(m_Value())) ||
+        !areExtDoubled(cast<Instruction>(I->getOperand(0))) ||
+        !areExtDoubled(cast<Instruction>(I->getOperand(1))))
+      return false;
+    return sinkOps(I, {&I->getOperandUse(0), &I->getOperandUse(1)});
+  }
+  default:
+    return false;
+  }
+  return false;
+}
+
 bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
   if (!TLI || !DL)
     return false;
@@ -6772,6 +6863,9 @@
     return false;
   }
 
+  if (tryToSinkFreeOperands(I))
+    return true;
+
   if (CallInst *CI = dyn_cast<CallInst>(I))
     return optimizeCallInst(CI, ModifiedDT);
 
Index: test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
===================================================================
--- /dev/null
+++ test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -codegenprepare -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown"
+
+define <8 x i16> @sink_zext(<8 x i8> %a, <8 x i8> %b, i1 %c) {
+; CHECK-LABEL: @sink_zext(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ZB_1:%.*]] = zext <8 x i8> [[B:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = zext <8 x i8> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]]
+; CHECK-NEXT:    ret <8 x i16> [[RES_1]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[ZB_2:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+; CHECK-NEXT:    [[RES_2:%.*]] = sub <8 x i16> [[TMP1]], [[ZB_2]]
+; CHECK-NEXT:    ret <8 x i16> [[RES_2]]
+;
+entry:
+  %za = zext <8 x i8> %a to <8 x i16>
+  br i1 %c, label %if.then, label %if.else
+
+if.then:
+  %zb.1 = zext <8 x i8> %b to <8 x i16>
+  %res.1 = add <8 x i16> %za, %zb.1
+  ret <8 x i16> %res.1
+
+if.else:
+  %zb.2 = zext <8 x i8> %b to <8 x i16>
+  %res.2 = sub <8 x i16> %za, %zb.2
+  ret <8 x i16> %res.2
+}
+
+define <8 x i16> @sink_sext(<8 x i8> %a, <8 x i8> %b, i1 %c) {
+; CHECK-LABEL: @sink_sext(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]]
+; CHECK-NEXT:    ret <8 x i16> [[RES_1]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+; CHECK-NEXT:    [[RES_2:%.*]] = sub <8 x i16> [[TMP1]], [[ZB_2]]
+; CHECK-NEXT:    ret <8 x i16> [[RES_2]]
+;
+entry:
+  %za = sext <8 x i8> %a to <8 x i16>
+  br i1 %c, label %if.then, label %if.else
+
+if.then:
+  %zb.1 = sext <8 x i8> %b to <8 x i16>
+  %res.1 = add <8 x i16> %za, %zb.1
+  ret <8 x i16> %res.1
+
+if.else:
+  %zb.2 = sext <8 x i8> %b to <8 x i16>
+  %res.2 = sub <8 x i16> %za, %zb.2
+  ret <8 x i16> %res.2
+}
+
+define <8 x i16> @do_not_sink_nonfree_zext(<8 x i8> %a, <8 x i8> %b, i1 %c) {
+; CHECK-LABEL: @do_not_sink_nonfree_zext(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]]
+; CHECK-NEXT:    ret <8 x i16> [[RES_1]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[ZB_2]]
+;
+entry:
+  %za = sext <8 x i8> %a to <8 x i16>
+  br i1 %c, label %if.then, label %if.else
+
+if.then:
+  %zb.1 = sext <8 x i8> %b to <8 x i16>
+  %res.1 = add <8 x i16> %za, %zb.1
+  ret <8 x i16> %res.1
+
+if.else:
+  %zb.2 = sext <8 x i8> %b to <8 x i16>
+  ret <8 x i16> %zb.2
+}
+
+define <8 x i16> @do_not_sink_nonfree_sext(<8 x i8> %a, <8 x i8> %b, i1 %c) {
+; CHECK-LABEL: @do_not_sink_nonfree_sext(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]]
+; CHECK-NEXT:    ret <8 x i16> [[RES_1]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[ZB_2]]
+;
+entry:
+  %za = sext <8 x i8> %a to <8 x i16>
+  br i1 %c, label %if.then, label %if.else
+
+if.then:
+  %zb.1 = sext <8 x i8> %b to <8 x i16>
+  %res.1 = add <8 x i16> %za, %zb.1
+  ret <8 x i16> %res.1
+
+if.else:
+  %zb.2 = sext <8 x i8> %b to <8 x i16>
+  ret <8 x i16> %zb.2
+}
+
+define void @test(i32 %bg, <16 x i8>* %src0, <16 x i8>* %src1, <8 x i16>* %dst0, <8 x i16>* %dst1) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[L_SRC0:%.*]] = load <16 x i8>, <16 x i8>* [[SRC0:%.*]]
+; CHECK-NEXT:    br i1 undef, label [[BB2:%.*]], label [[EXIT:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[L_SRC1:%.*]] = load <16 x i8>, <16 x i8>* [[SRC1:%.*]]
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i8> [[L_SRC1]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[L_SRC0]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[VMULL0:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[TMP0]], <8 x i8> [[S2]])
+; CHECK-NEXT:    [[S4:%.*]] = shufflevector <16 x i8> [[L_SRC1]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[L_SRC0]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[VMULL1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[S4]])
+; CHECK-NEXT:    store <8 x i16> [[VMULL0]], <8 x i16>* [[DST0:%.*]], align 16
+; CHECK-NEXT:    store <8 x i16> [[VMULL1]], <8 x i16>* [[DST1:%.*]], align 16
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %l.src0 = load <16 x i8>, <16 x i8>* %src0
+  %s1 = shufflevector <16 x i8> %l.src0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s3 = shufflevector <16 x i8> %l.src0, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  br i1 undef, label %bb2, label %exit
+
+bb2:
+  %l.src1 = load <16 x i8>, <16 x i8>* %src1
+  %s2 = shufflevector <16 x i8> %l.src1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %vmull0 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s1, <8 x i8> %s2) #3
+  %s4 = shufflevector <16 x i8> %l.src1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vmull1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s3, <8 x i8> %s4) #3
+  store <8 x i16> %vmull0, <8 x i16>* %dst0, align 16
+  store <8 x i16> %vmull1, <8 x i16>* %dst1, align 16
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) #2
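For context, the effect of the patch in IR terms, sketched from the @sink_zext test above:
SelectionDAG operates on a single basic block at a time, so an extension defined in a
different block than its user cannot be combined into a widening instruction during
instruction selection. Starting from

  entry:
    %za = zext <8 x i8> %a to <8 x i16>
    br i1 %c, label %if.then, label %if.else

  if.then:
    %zb.1 = zext <8 x i8> %b to <8 x i16>
    %res.1 = add <8 x i16> %za, %zb.1
    ret <8 x i16> %res.1
  ...

tryToSinkFreeOperands clones %za into if.then (and likewise into if.else), so ISel sees
both extensions next to the add and can select a single widening add (uaddl on AArch64)
in each block. The clones are recorded in InsertedInsts so they are not visited again,
and the original zext is erased once sinking leaves it with no remaining uses.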