diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10572,6 +10572,24 @@
     return true;
   }
+  case Instruction::Mul: {
+    for (auto OpIdx : enumerate(I->operands())) {
+      Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
+      // Make sure we are not already sinking this operand.
+      if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
+        continue;
+
+      Instruction *Shuffle = Op;
+
+      // The splat source is not always an instruction (it can be a constant
+      // or a function argument), so only queue it for sinking when it is one.
+      if (isa<Instruction>(Shuffle->getOperandUse(0)))
+        Ops.push_back(&Shuffle->getOperandUse(0));
+      Ops.push_back(&OpIdx.value());
+    }
+    return true;
+  }
   default:
     return false;
   }
@@ -14966,6 +14984,50 @@
   return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
 }
+static SDValue performDUPSextCombine(SDNode *N, SelectionDAG &DAG,
+                                     bool Signed) {
+  SDValue Operand = N->getOperand(0);
+  SDValue ExtOperand = Operand.getOperand(0);
+
+  EVT NType = N->getValueType(0);
+
+  // Cannot operate on non-vector dups.
+  if (!NType.isVector())
+    return SDValue();
+
+  EVT VectorElementType = NType.getVectorElementType();
+
+  // Cannot operate on non-integer elements.
+  if (!VectorElementType.isInteger())
+    return SDValue();
+
+  unsigned Size = VectorElementType.getSizeInBits();
+
+  // Cannot operate on an element size of 8 bits: there is no narrower
+  // element type to dup at.
+  if (Size == 8)
+    return SDValue();
+
+  EVT HalfType = MVT::getIntegerVT(Size / 2);
+  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfType,
+                                NType.getVectorNumElements());
+
+  SDValue DupNode = DAG.getNode(N->getOpcode(), SDLoc(N), HalfVT, ExtOperand);
+  SDValue ExtNode =
+      DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
+                  SDLoc(Operand.getNode()), NType, DupNode);
+
+  return ExtNode; // sext/zext(dup(...))
+}
+
+static SDValue performDUPCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 SelectionDAG &DAG) {
+  SDValue Operand = N->getOperand(0);
+  unsigned Opcode = Operand.getOpcode();
+  bool IsSext = Opcode == ISD::SIGN_EXTEND ||
+                Opcode == ISD::SIGN_EXTEND_INREG || Opcode == ISD::AssertSext;
+  if (IsSext || Opcode == ISD::ZERO_EXTEND || Opcode == ISD::AssertZext)
+    return performDUPSextCombine(N, DAG, IsSext);
+
+  return performPostLD1Combine(N, DCI, false);
+}
+
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -15026,7 +15088,7 @@
   case AArch64ISD::CSEL:
     return performCONDCombine(N, DCI, DAG, 2, 3);
   case AArch64ISD::DUP:
-    return performPostLD1Combine(N, DCI, false);
+    return performDUPCombine(N, DCI, DAG);
   case AArch64ISD::NVCAST:
     return performNVCASTCombine(N);
   case AArch64ISD::UZP1:
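Editorial note, not part of the patch: the two changes above cooperate. The
shouldSinkOperands case sinks a widening mul's splat operands into the user
block so the dup stays next to the multiply, and the DUP combine rewrites
dup-of-extend as extend-of-a-narrower-dup so instruction selection can match
smull/umull. A reduced, hypothetical IR reproducer of the targeted pattern
(function and value names are illustrative):

define <4 x i32> @mul_by_splat(<4 x i16>* %p, i16 %s) {
  ; Splat of a sign-extended scalar...
  %conv = sext i16 %s to i32
  %ins = insertelement <4 x i32> undef, i32 %conv, i32 0
  %splat = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer
  ; ...multiplied by a sign-extended vector load.
  %v = load <4 x i16>, <4 x i16>* %p, align 2
  %ext = sext <4 x i16> %v to <4 x i32>
  %mul = mul nsw <4 x i32> %splat, %ext
  ret <4 x i32> %mul
}

With the combine, the dup is expected to be formed at <4 x i16> and
sign-extended afterwards, which ISel can fold with the mul into
smull v.4s, v.4h, v.4h, as the tests below check.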
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-smull.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-smull.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -o - | FileCheck %s
+
+define void @matrix_mul_signed(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
+; CHECK-LABEL: matrix_mul_signed:
+; CHECK:       // %bb.0: // %vector.header
+; CHECK-NEXT:    sxth w9, w3
+; CHECK-NEXT:    //
+; CHECK-NEXT:    and x8, x0, #0xfffffff8
+; CHECK-NEXT:    dup v0.4h, w9
+; CHECK-NEXT:  .LBB0_1: // %vector.body
+; CHECK-NEXT:    //
+; CHECK-NEXT:    add x9, x2, w0, sxtw #1
+; CHECK-NEXT:    ldp d1, d2, [x9]
+; CHECK-NEXT:    add x9, x1, w0, sxtw #2
+; CHECK-NEXT:    subs x8, x8, #8
+; CHECK-NEXT:    add w0, w0, #8
+; CHECK-NEXT:    smull v1.4s, v0.4h, v1.4h
+; CHECK-NEXT:    smull v2.4s, v0.4h, v2.4h
+; CHECK-NEXT:    stp q1, q2, [x9]
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  // %bb.2: // %for.end12
+; CHECK-NEXT:    ret
+vector.header:
+  %conv4 = sext i16 %val to i32
+  %wide.trip.count = sext i32 %N to i64
+  %0 = add nsw i64 %wide.trip.count, -1
+  %min.iters.check = icmp ult i32 %N, 8
+  %1 = trunc i64 %0 to i32
+  %2 = icmp ugt i64 %0, 4294967295
+  %n.vec = and i64 %wide.trip.count, 4294967288
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
+  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
+  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.header, %vector.body
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
+  %3 = trunc i64 %index to i32
+  %4 = add i32 %N, %3
+  %5 = sext i32 %4 to i64
+  %6 = getelementptr inbounds i16, i16* %A, i64 %5
+  %7 = bitcast i16* %6 to <4 x i16>*
+  %wide.load = load <4 x i16>, <4 x i16>* %7, align 2
+  %8 = getelementptr inbounds i16, i16* %6, i64 4
+  %9 = bitcast i16* %8 to <4 x i16>*
+  %wide.load30 = load <4 x i16>, <4 x i16>* %9, align 2
+  %10 = sext <4 x i16> %wide.load to <4 x i32>
+  %11 = sext <4 x i16> %wide.load30 to <4 x i32>
+  %12 = mul nsw <4 x i32> %broadcast.splat, %10
+  %13 = mul nsw <4 x i32> %broadcast.splat32, %11
+  %14 = getelementptr inbounds i32, i32* %C, i64 %5
+  %15 = bitcast i32* %14 to <4 x i32>*
+  store <4 x i32> %12, <4 x i32>* %15, align 4
+  %16 = getelementptr inbounds i32, i32* %14, i64 4
+  %17 = bitcast i32* %16 to <4 x i32>*
+  store <4 x i32> %13, <4 x i32>* %17, align 4
+  %index.next = add i64 %index, 8
+  %18 = icmp eq i64 %index.next, %n.vec
+  br i1 %18, label %for.end12, label %vector.body
+
+for.end12:                                        ; preds = %vector.body
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -o - | FileCheck %s
+
+define void @matrix_mul_unsigned(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
+; CHECK-LABEL: matrix_mul_unsigned:
+; CHECK:       // %bb.0: // %vector.header
+; CHECK-NEXT:    and w9, w3, #0xffff
+; CHECK-NEXT:    //
+; CHECK-NEXT:    and x8, x0, #0xfffffff8
+; CHECK-NEXT:    dup v0.4h, w9
+; CHECK-NEXT:  .LBB0_1: // %vector.body
+; CHECK-NEXT:    //
+; CHECK-NEXT:    add x9, x2, w0, uxtw #1
+; CHECK-NEXT:    ldp d1, d2, [x9]
+; CHECK-NEXT:    add x9, x1, w0, uxtw #2
+; CHECK-NEXT:    subs x8, x8, #8
+; CHECK-NEXT:    add w0, w0, #8
+; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
+; CHECK-NEXT:    stp q1, q2, [x9]
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  // %bb.2: // %for.end12
+; CHECK-NEXT:    ret
+vector.header:
+  %conv4 = zext i16 %val to i32
+  %wide.trip.count = zext i32 %N to i64
+  %0 = add nsw i64 %wide.trip.count, -1
+  %min.iters.check = icmp ult i32 %N, 8
+  %1 = trunc i64 %0 to i32
+  %2 = icmp ugt i64 %0, 4294967295
+  %n.vec = and i64 %wide.trip.count, 4294967288
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
+  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
+  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.header, %vector.body
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
+  %3 = trunc i64 %index to i32
+  %4 = add i32 %N, %3
+  %5 = zext i32 %4 to i64
+  %6 = getelementptr inbounds i16, i16* %A, i64 %5
+  %7 = bitcast i16* %6 to <4 x i16>*
+  %wide.load = load <4 x i16>, <4 x i16>* %7, align 2
+  %8 = getelementptr inbounds i16, i16* %6, i64 4
+  %9 = bitcast i16* %8 to <4 x i16>*
+  %wide.load30 = load <4 x i16>, <4 x i16>* %9, align 2
+  %10 = zext <4 x i16> %wide.load to <4 x i32>
+  %11 = zext <4 x i16> %wide.load30 to <4 x i32>
+  %12 = mul nuw nsw <4 x i32> %broadcast.splat, %10
+  %13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11
+  %14 = getelementptr inbounds i32, i32* %C, i64 %5
+  %15 = bitcast i32* %14 to <4 x i32>*
+  store <4 x i32> %12, <4 x i32>* %15, align 4
+  %16 = getelementptr inbounds i32, i32* %14, i64 4
+  %17 = bitcast i32* %16 to <4 x i32>*
+  store <4 x i32> %13, <4 x i32>* %17, align 4
+  %index.next = add i64 %index, 8
+  %18 = icmp eq i64 %index.next, %n.vec
+  br i1 %18, label %for.end12, label %vector.body
+
+for.end12:                                        ; preds = %vector.body
+  ret void
+}
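Editorial note: the CHECK lines in both new tests were produced by the script
named in their NOTE lines; if a later revision of the patch changes the
generated code, they can be regenerated with:

  llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/AArch64/aarch64-matrix-smull.ll llvm/test/CodeGen/AArch64/aarch64-matrix-umull.ll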