Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -22107,8 +22107,9 @@ // 128bit vector version. if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) { EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext()); - if (SDNode *LN = DCI.DAG.getNodeIfExists( - N->getOpcode(), DCI.DAG.getVTList(LVT), {N->getOperand(0)})) { + SmallVector<SDValue> Ops(N->ops()); + if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(), + DCI.DAG.getVTList(LVT), Ops)) { SDLoc DL(N); return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0), DCI.DAG.getConstant(0, DL, MVT::i64)); @@ -23043,6 +23044,10 @@ case AArch64ISD::CSEL: return performCSELCombine(N, DCI, DAG); case AArch64ISD::DUP: + case AArch64ISD::DUPLANE8: + case AArch64ISD::DUPLANE16: + case AArch64ISD::DUPLANE32: + case AArch64ISD::DUPLANE64: return performDUPCombine(N, DCI); case AArch64ISD::DUPLANE128: return performDupLane128Combine(N, DAG); Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -137,6 +137,10 @@ BinOpFrag<(extract_subvector (v8i16 (AArch64duplane16 (v8i16 node:$LHS), node:$RHS)), (i64 4))>; def extract_high_dup_v4i32 : BinOpFrag<(extract_subvector (v4i32 (AArch64duplane32 (v4i32 node:$LHS), node:$RHS)), (i64 2))>; +def extract_low_dup_v8i16 : + BinOpFrag<(extract_subvector (v8i16 (AArch64duplane16 (v8i16 node:$LHS), node:$RHS)), (i64 0))>; +def extract_low_dup_v4i32 : + BinOpFrag<(extract_subvector (v4i32 (AArch64duplane32 (v4i32 node:$LHS), node:$RHS)), (i64 0))>; //===----------------------------------------------------------------------===// // Asm Operand Classes. 
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6947,6 +6947,13 @@ defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl", TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", AArch64smull>; +def : Pat<(v4i32 (AArch64smull (v4i16 V64:$Rn), + (extract_low_dup_v8i16 (v8i16 V128:$Rm), VectorIndexS:$idx))), + (SMULLv4i16_indexed V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; +def : Pat<(v2i64 (AArch64smull (v2i32 V64:$Rn), + (extract_low_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx))), + (SMULLv2i32_indexed V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; + defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", int_aarch64_neon_sqadd>; defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", @@ -6956,6 +6963,13 @@ defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh", int_aarch64_neon_sqrdmlsh>; defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>; +def : Pat<(v4i32 (int_aarch64_neon_sqdmull (v4i16 V64:$Rn), + (extract_low_dup_v8i16 (v8i16 V128:$Rm), VectorIndexS:$idx))), + (SQDMULLv4i16_indexed V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; +def : Pat<(v2i64 (int_aarch64_neon_sqdmull (v2i32 V64:$Rn), + (extract_low_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx))), + (SQDMULLv2i32_indexed V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; + defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal", TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl", Index: llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll =================================================================== --- llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -657,20 +657,19 @@ define void 
@sink_v16s16_8(i32 *%p, i32 *%d, i64 %n, <16 x i8> %a) { ; CHECK-LABEL: sink_v16s16_8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: dup v1.8b, v0.b[10] ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: dup v0.16b, v0.b[10] ; CHECK-NEXT: .LBB9_1: // %loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q2, [x0] +; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: add x8, x8, #8 ; CHECK-NEXT: subs x2, x2, #8 -; CHECK-NEXT: smull2 v3.8h, v2.16b, v0.16b -; CHECK-NEXT: smull v2.8h, v2.8b, v1.8b -; CHECK-NEXT: cmlt v3.8h, v3.8h, #0 +; CHECK-NEXT: smull2 v2.8h, v1.16b, v0.16b +; CHECK-NEXT: smull v1.8h, v1.8b, v0.8b ; CHECK-NEXT: cmlt v2.8h, v2.8h, #0 -; CHECK-NEXT: uzp1 v2.16b, v2.16b, v3.16b -; CHECK-NEXT: str q2, [x0], #32 +; CHECK-NEXT: cmlt v1.8h, v1.8h, #0 +; CHECK-NEXT: uzp1 v1.16b, v1.16b, v2.16b +; CHECK-NEXT: str q1, [x0], #32 ; CHECK-NEXT: b.ne .LBB9_1 ; CHECK-NEXT: // %bb.2: // %exit ; CHECK-NEXT: ret