Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2270,6 +2270,16 @@
     return false;
   }

+  /// Return true if sinking I's operands to the same basic block as I is
+  /// profitable, e.g. because the operands can be folded into a target
+  /// instruction during instruction selection. After calling the function
+  /// \p Ops contains the Uses to sink ordered by dominance (dominating users
+  /// come first).
+  virtual bool shouldSinkOperands(Instruction *I,
+                                  SmallVectorImpl<Use *> &Ops) const {
+    return false;
+  }
+
   /// Return true if the target supplies and combines to a paired load
   /// two loaded values of type LoadedType next to each other in memory.
   /// RequiredAlignment gives the minimal alignment constraints that must be met
Index: llvm/lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -375,6 +375,8 @@
                           SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
    bool splitBranchCondition(Function &F);
    bool simplifyOffsetableRelocate(Instruction &I);
+
+    bool tryToSinkFreeOperands(Instruction *I);
  };

} // end anonymous namespace
@@ -1734,6 +1736,7 @@
      InsertedInsts.insert(ExtVal);
      return true;
    }
+
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group: {
      Value *ArgVal = II->getArgOperand(0);
@@ -5971,6 +5974,48 @@
  return MadeChange;
}

+bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
+  // If the operands of I can be folded into a target instruction together with
+  // I, duplicate and sink them.
+  SmallVector<Use *, 4> OpsToSink;
+  if (!TLI || !TLI->shouldSinkOperands(I, OpsToSink))
+    return false;
+
+  // OpsToSink can contain multiple uses in a use chain (e.g.
+  // (%u1 with %u1 = shufflevector), (%u2 with %u2 = zext %u1)). The dominating
+  // uses must come first, which means they are sunk first, temporarily creating
+  // invalid IR. This will be fixed once their dominated users are sunk and
+  // updated.
+  BasicBlock *TargetBB = I->getParent();
+  bool Changed = false;
+  SmallVector<Use *, 4> ToReplace;
+  for (Use *U : OpsToSink) {
+    auto *UI = cast<Instruction>(U->get());
+    if (UI->getParent() == TargetBB || isa<PHINode>(UI))
+      continue;
+    ToReplace.push_back(U);
+  }
+
+  SmallPtrSet<Instruction *, 4> MaybeDead;
+  for (Use *U : ToReplace) {
+    auto *UI = cast<Instruction>(U->get());
+    Instruction *NI = UI->clone();
+    MaybeDead.insert(UI);
+    LLVM_DEBUG(dbgs() << "Sinking " << *UI << " to user " << *I << "\n");
+    NI->insertBefore(I);
+    InsertedInsts.insert(NI);
+    U->set(NI);
+    Changed = true;
+  }
+
+  // Remove instructions that are dead after sinking.
+  for (auto *I : MaybeDead)
+    if (!I->hasNUsesOrMore(1))
+      I->eraseFromParent();
+
+  return Changed;
+}
+
bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
  if (!TLI || !DL)
    return false;
@@ -6785,6 +6830,9 @@
    return false;
  }

+  if (tryToSinkFreeOperands(I))
+    return true;
+
  if (CallInst *CI = dyn_cast<CallInst>(I))
    return optimizeCallInst(CI, ModifiedDT);

Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -327,6 +327,9 @@
  bool isZExtFree(EVT VT1, EVT VT2) const override;
  bool isZExtFree(SDValue Val, EVT VT2) const override;

+  bool shouldSinkOperands(Instruction *I,
+                          SmallVectorImpl<Use *> &Ops) const override;
+
  bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override;

  unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -54,9 +54,11 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/OperandTraits.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Use.h"
 #include "llvm/IR/Value.h"
@@ -86,6 +88,7 @@
 #include <vector>

using namespace llvm;
+using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64-lower"
@@ -8282,6 +8285,110 @@
  return true;
}

+/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
+/// or upper half of the vector elements.
+static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
+  auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
+    auto *FullVT = cast<VectorType>(FullV->getType());
+    auto *HalfVT = cast<VectorType>(HalfV->getType());
+    return FullVT->getBitWidth() == 2 * HalfVT->getBitWidth();
+  };
+
+  auto extractHalf = [](Value *FullV, Value *HalfV) {
+    auto *FullVT = dyn_cast<VectorType>(FullV->getType());
+    auto *HalfVT = dyn_cast<VectorType>(HalfV->getType());
+    return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
+  };
+
+  Constant *M1, *M2;
+  Value *S1Op1, *S2Op1;
+  if (!match(Op1, m_ShuffleVector(m_Value(S1Op1), m_Undef(), m_Constant(M1))) ||
+      !match(Op2, m_ShuffleVector(m_Value(S2Op1), m_Undef(), m_Constant(M2))))
+    return false;
+
+  // Check that the operands are half as wide as the result and we extract
+  // half of the elements of the input vectors.
+  if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) ||
+      !extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2))
+    return false;
+
+  // Check the mask extracts either the lower or upper half of vector
+  // elements.
+  int M1Start = -1;
+  int M2Start = -1;
+  int NumElements = cast<VectorType>(Op1->getType())->getNumElements() * 2;
+  if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) ||
+      !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) ||
+      M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2)))
+    return false;
+
+  return true;
+}
+
+/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
+/// of the vector elements.
+static bool areExtractExts(Value *Ext1, Value *Ext2) {
+  auto areExtDoubled = [](Instruction *Ext) {
+    return Ext->getType()->getScalarSizeInBits() ==
+           2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
+  };
+
+  if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
+      !match(Ext2, m_ZExtOrSExt(m_Value())) ||
+      !areExtDoubled(cast<Instruction>(Ext1)) ||
+      !areExtDoubled(cast<Instruction>(Ext2)))
+    return false;
+
+  return true;
+}
+
+/// Check if sinking \p I's operands to I's basic block is profitable, because
+/// the operands can be folded into a target instruction, e.g.
+/// shufflevector extracts and/or sext/zext can be folded into (u,s)subl(2).
+bool AArch64TargetLowering::shouldSinkOperands(
+    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
+  if (!I->getType()->isVectorTy())
+    return false;
+
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::aarch64_neon_umull:
+      if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
+        return false;
+      Ops.push_back(&II->getOperandUse(0));
+      Ops.push_back(&II->getOperandUse(1));
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  switch (I->getOpcode()) {
+  case Instruction::Sub:
+  case Instruction::Add: {
+    if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
+      return false;
+
+    // If the exts' operands extract either the lower or upper elements, we
+    // can sink them too.
+    auto Ext1 = cast<Instruction>(I->getOperand(0));
+    auto Ext2 = cast<Instruction>(I->getOperand(1));
+    if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
+      Ops.push_back(&Ext1->getOperandUse(0));
+      Ops.push_back(&Ext2->getOperandUse(0));
+    }
+
+    Ops.push_back(&I->getOperandUse(0));
+    Ops.push_back(&I->getOperandUse(1));
+
+    return true;
+  }
+  default:
+    return false;
+  }
+  return false;
+}
+
bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
                                          unsigned &RequiredAligment) const {
  if (!LoadedType.isSimple() ||
Index: llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
@@ -0,0 +1,236 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -codegenprepare -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown"
+
+define <8 x i16> @sink_zext(<8 x i8> %a, <8 x i8> %b, i1 %c) {
+; CHECK-LABEL: @sink_zext(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ZB_1:%.*]] = zext <8 x i8> [[B:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = zext <8 x i8> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]]
+; CHECK-NEXT:    ret <8 x i16> [[RES_1]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[ZB_2:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+; CHECK-NEXT:    [[RES_2:%.*]] = sub <8 x i16> [[TMP1]], [[ZB_2]]
+; CHECK-NEXT:    ret <8 x i16> [[RES_2]]
+;
+entry:
+  %za = zext <8 x i8> %a to <8 x i16>
+  br i1 %c, label %if.then, label %if.else
+
+if.then:
+  %zb.1 = zext <8 x i8> %b to <8 x i16>
+  %res.1 = add <8 x i16> %za, %zb.1
+  ret <8 x i16> %res.1
+
+if.else:
+  %zb.2 = zext <8 x i8> %b to <8 x i16>
+  %res.2 = sub <8 x i16> %za, %zb.2
+  ret <8 x i16> %res.2
+}
+
+define <8 x i16> @sink_sext(<8 x i8> %a, <8 x i8> %b, i1 %c) {
+; CHECK-LABEL: @sink_sext(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]]
+; CHECK-NEXT:    ret <8 x i16> [[RES_1]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+; CHECK-NEXT:    [[RES_2:%.*]] = sub <8 x i16> [[TMP1]], [[ZB_2]]
+; CHECK-NEXT:    ret <8 x i16> [[RES_2]]
+;
+entry:
+  %za = sext <8 x i8> %a to <8 x i16>
+  br i1 %c, label %if.then, label %if.else
+
+if.then:
+  %zb.1 = sext <8 x i8> %b to <8 x i16>
+  %res.1 = add <8 x i16> %za, %zb.1
+  ret <8 x i16> %res.1
+
+if.else:
+  %zb.2 = sext <8 x i8> %b to <8 x i16>
+  %res.2 = sub <8 x i16> %za, %zb.2
+  ret <8 x i16> %res.2
+}
+
+define <8 x i16> @do_not_sink_nonfree_zext(<8 x i8> %a, <8 x i8> %b, i1 %c) {
+; CHECK-LABEL: @do_not_sink_nonfree_zext(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]]
+; CHECK-NEXT:    ret <8 x i16> [[RES_1]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[ZB_2]]
+;
+entry:
+  %za = sext <8 x i8> %a to <8 x i16>
+  br i1 %c, label %if.then, label %if.else
+
+if.then:
+  %zb.1 = sext <8 x i8> %b to <8 x i16>
+  %res.1 = add <8 x i16> %za, %zb.1
+  ret <8 x i16> %res.1
+
+if.else:
+  %zb.2 = sext <8 x i8> %b to <8 x i16>
+  ret <8 x i16> %zb.2
+}
+
+define <8 x i16> @do_not_sink_nonfree_sext(<8 x i8> %a, <8 x i8> %b, i1 %c) {
+; CHECK-LABEL: @do_not_sink_nonfree_sext(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]]
+; CHECK-NEXT:    ret <8 x i16> [[RES_1]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[ZB_2]]
+;
+entry:
+  %za = sext <8 x i8> %a to <8 x i16>
+  br i1 %c, label %if.then, label %if.else
+
+if.then:
+  %zb.1 = sext <8 x i8> %b to <8 x i16>
+  %res.1 = add <8 x i16> %za, %zb.1
+  ret <8 x i16> %res.1
+
+if.else:
+  %zb.2 = sext <8 x i8> %b to <8 x i16>
+  ret <8 x i16> %zb.2
+}
+
+; The masks used are suitable for umull, sink shufflevector to users.
+define <8 x i16> @sink_shufflevector_umull(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @sink_shufflevector_umull(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[VMULL0:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[TMP0]], <8 x i8> [[S2]])
+; CHECK-NEXT:    ret <8 x i16> [[VMULL0]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[VMULL1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[S4]])
+; CHECK-NEXT:    ret <8 x i16> [[VMULL1]]
+;
+entry:
+  %s1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s3 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  %s2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %vmull0 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s1, <8 x i8> %s2) #3
+  ret <8 x i16> %vmull0
+
+if.else:
+  %s4 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vmull1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s3, <8 x i8> %s4) #3
+  ret <8 x i16> %vmull1
+}
+
+; Both exts and their shufflevector operands can be sunk.
+define <8 x i16> @sink_shufflevector_ext_subadd(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %s1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %z1 = zext <8 x i8> %s1 to <8 x i16>
+  %s3 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %z3 = sext <8 x i8> %s3 to <8 x i16>
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  %s2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %z2 = zext <8 x i8> %s2 to <8 x i16>
+  %res1 = add <8 x i16> %z1, %z2
+  ret <8 x i16> %res1
+
+if.else:
+  %s4 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %z4 = sext <8 x i8> %s4 to <8 x i16>
+  %res2 = sub <8 x i16> %z3, %z4
+  ret <8 x i16> %res2
+}
+
+
+declare void @user1(<8 x i16>)
+
+; Both exts and their shufflevector operands can be sunk.
+define <8 x i16> @sink_shufflevector_ext_subadd_multiuse(<16 x i8> %a, <16 x i8> %b) {
+entry:
+  %s1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %z1 = zext <8 x i8> %s1 to <8 x i16>
+  %s3 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %z3 = sext <8 x i8> %s3 to <8 x i16>
+  call void @user1(<8 x i16> %z3)
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  %s2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %z2 = zext <8 x i8> %s2 to <8 x i16>
+  %res1 = add <8 x i16> %z1, %z2
+  ret <8 x i16> %res1
+
+if.else:
+  %s4 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %z4 = sext <8 x i8> %s4 to <8 x i16>
+  %res2 = sub <8 x i16> %z3, %z4
+  ret <8 x i16> %res2
+}
+
+
+; The masks used are not suitable for umull (they do not extract a low or high half), do not sink.
+define <8 x i16> @no_sink_shufflevector_umull(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @no_sink_shufflevector_umull(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> undef, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[VMULL0:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[S1]], <8 x i8> [[S2]])
+; CHECK-NEXT:    ret <8 x i16> [[VMULL0]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> undef, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; CHECK-NEXT:    [[VMULL1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[S3]], <8 x i8> [[S4]])
+; CHECK-NEXT:    ret <8 x i16> [[VMULL1]]
+;
+entry:
+  %s1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %s3 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  %s2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %vmull0 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s1, <8 x i8> %s2) #3
+  ret <8 x i16> %vmull0
+
+if.else:
+  %s4 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+  %vmull1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s3, <8 x i8> %s4) #3
+  ret <8 x i16> %vmull1
+}
+
+
+; Function Attrs: nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) #2
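Note: as a minimal sketch of how another backend could opt in to the new hook (not part of this patch; "FooTargetLowering" is a hypothetical target used only for illustration), an override could report the zero-extended operands of a vector add as free to sink, so CodeGenPrepare clones the zexts next to the add:

// Hypothetical override for an assumed FooTargetLowering; sketch only.
bool FooTargetLowering::shouldSinkOperands(Instruction *I,
                                           SmallVectorImpl<Use *> &Ops) const {
  using namespace llvm::PatternMatch;
  // Only consider vector adds whose operands are both zero-extends, on the
  // assumption that the target can fold the extends into a widening add.
  if (!I->getType()->isVectorTy() || I->getOpcode() != Instruction::Add)
    return false;
  if (!match(I->getOperand(0), m_ZExt(m_Value())) ||
      !match(I->getOperand(1), m_ZExt(m_Value())))
    return false;
  // Report both operand uses; tryToSinkFreeOperands clones each defining
  // zext into I's block and rewrites these uses to the clones.
  Ops.push_back(&I->getOperandUse(0));
  Ops.push_back(&I->getOperandUse(1));
  return true;
}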