Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22107,15 +22107,19 @@
   // 128bit vector version.
   if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
     EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
-    if (SDNode *LN = DCI.DAG.getNodeIfExists(
-            N->getOpcode(), DCI.DAG.getVTList(LVT), {N->getOperand(0)})) {
+    SmallVector<SDValue> Ops(N->ops());
+    if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
+                                             DCI.DAG.getVTList(LVT), Ops)) {
       SDLoc DL(N);
       return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
                              DCI.DAG.getConstant(0, DL, MVT::i64));
     }
   }
 
-  return performPostLD1Combine(N, DCI, false);
+  if (N->getOpcode() == AArch64ISD::DUP)
+    return performPostLD1Combine(N, DCI, false);
+
+  return SDValue();
 }
 
 /// Get rid of unnecessary NVCASTs (that don't change the type).
@@ -23043,6 +23047,10 @@
   case AArch64ISD::CSEL:
     return performCSELCombine(N, DCI, DAG);
   case AArch64ISD::DUP:
+  case AArch64ISD::DUPLANE8:
+  case AArch64ISD::DUPLANE16:
+  case AArch64ISD::DUPLANE32:
+  case AArch64ISD::DUPLANE64:
     return performDUPCombine(N, DCI);
   case AArch64ISD::DUPLANE128:
     return performDupLane128Combine(N, DAG);
Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -138,6 +138,15 @@
 def extract_high_dup_v4i32 :
    BinOpFrag<(extract_subvector (v4i32 (AArch64duplane32 (v4i32 node:$LHS), node:$RHS)), (i64 2))>;
 
+def dup_v8i16 :
+    PatFrags<(ops node:$LHS, node:$RHS),
+        [(v4i16 (extract_subvector (v8i16 (AArch64duplane16 (v8i16 node:$LHS), node:$RHS)), (i64 0))),
+         (v4i16 (AArch64duplane16 (v8i16 node:$LHS), node:$RHS))]>;
+def dup_v4i32 :
+    PatFrags<(ops node:$LHS, node:$RHS),
+        [(v2i32 (extract_subvector (v4i32 (AArch64duplane32 (v4i32 node:$LHS), node:$RHS)), (i64 0))),
+         (v2i32 (AArch64duplane32 (v4i32 node:$LHS), node:$RHS))]>;
+
 //===----------------------------------------------------------------------===//
 // Asm Operand Classes.
 //
@@ -8960,7 +8969,7 @@
                                       asm, ".4s", ".4s", ".4h", ".h",
     [(set (v4i32 V128:$Rd),
         (OpNode (v4i16 V64:$Rn),
-            (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+            (dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> {
     bits<3> idx;
     let Inst{11} = idx{2};
     let Inst{21} = idx{1};
@@ -8987,7 +8996,7 @@
                                       asm, ".2d", ".2d", ".2s", ".s",
     [(set (v2i64 V128:$Rd),
         (OpNode (v2i32 V64:$Rn),
-            (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+            (dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> {
     bits<2> idx;
     let Inst{11} = idx{1};
     let Inst{21} = idx{0};
@@ -9145,7 +9154,7 @@
                                       asm, ".4s", ".4s", ".4h", ".h",
     [(set (v4i32 V128:$Rd),
         (OpNode (v4i16 V64:$Rn),
-            (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+            (dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> {
     bits<3> idx;
     let Inst{11} = idx{2};
     let Inst{21} = idx{1};
@@ -9172,7 +9181,7 @@
                                       asm, ".2d", ".2d", ".2s", ".s",
     [(set (v2i64 V128:$Rd),
         (OpNode (v2i32 V64:$Rn),
-            (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+            (dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> {
     bits<2> idx;
     let Inst{11} = idx{1};
     let Inst{21} = idx{0};
Index: llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
===================================================================
--- llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -657,20 +657,19 @@
 define void @sink_v16s16_8(i32 *%p, i32 *%d, i64 %n, <16 x i8> %a) {
 ; CHECK-LABEL: sink_v16s16_8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    dup v1.8b, v0.b[10]
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    dup v0.16b, v0.b[10]
 ; CHECK-NEXT:  .LBB9_1: // %loop
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q2, [x0]
+; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    add x8, x8, #8
 ; CHECK-NEXT:    subs x2, x2, #8
-; CHECK-NEXT:    smull2 v3.8h, v2.16b, v0.16b
-; CHECK-NEXT:    smull v2.8h, v2.8b, v1.8b
-; CHECK-NEXT:    cmlt v3.8h, v3.8h, #0
+; CHECK-NEXT:    smull2 v2.8h, v1.16b, v0.16b
+; CHECK-NEXT:    smull v1.8h, v1.8b, v0.8b
 ; CHECK-NEXT:    cmlt v2.8h, v2.8h, #0
-; CHECK-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    str q2, [x0], #32
+; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
+; CHECK-NEXT:    uzp1 v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    str q1, [x0], #32
 ; CHECK-NEXT:    b.ne .LBB9_1
 ; CHECK-NEXT:  // %bb.2: // %exit
 ; CHECK-NEXT:    ret
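For context, a reduced hand-written sketch of the IR shape this change improves; the function name mul_by_lane and the lane index are illustrative assumptions, not taken from the patch. When both the low and high halves of a widening multiply consume the same duplicated lane, the extended performDUPCombine lets the 64-bit half reuse the bottom of the single 128-bit duplicate, which is why the extra dup v1.8b, v0.b[10] disappears in the sink_v16s16_8 checks above.

; Assumed illustration: multiply every byte of %x by lane 10 of %y, widening to i16.
define <16 x i16> @mul_by_lane(<16 x i8> %x, <16 x i8> %y) {
  ; Splat lane 10 of %y across all 16 byte lanes; expected to lower to a
  ; 128-bit lane duplicate (AArch64duplane8).
  %lane = shufflevector <16 x i8> %y, <16 x i8> poison, <16 x i32> <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  ; The v16i16 multiply is split by type legalization into a low-half smull and a
  ; high-half smull2; with this patch both halves can share one dup of the lane.
  %xs = sext <16 x i8> %x to <16 x i16>
  %ls = sext <16 x i8> %lane to <16 x i16>
  %mul = mul <16 x i16> %xs, %ls
  ret <16 x i16> %mul
}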