Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22107,15 +22107,19 @@
   // 128bit vector version.
   if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
     EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
-    if (SDNode *LN = DCI.DAG.getNodeIfExists(
-            N->getOpcode(), DCI.DAG.getVTList(LVT), {N->getOperand(0)})) {
+    SmallVector<SDValue> Ops(N->ops());
+    if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
+                                             DCI.DAG.getVTList(LVT), Ops)) {
       SDLoc DL(N);
       return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
                              DCI.DAG.getConstant(0, DL, MVT::i64));
     }
   }
 
-  return performPostLD1Combine(N, DCI, false);
+  if (N->getOpcode() == AArch64ISD::DUP)
+    return performPostLD1Combine(N, DCI, false);
+
+  return SDValue();
 }
 
 /// Get rid of unnecessary NVCASTs (that don't change the type).
@@ -23043,6 +23047,10 @@
   case AArch64ISD::CSEL:
     return performCSELCombine(N, DCI, DAG);
   case AArch64ISD::DUP:
+  case AArch64ISD::DUPLANE8:
+  case AArch64ISD::DUPLANE16:
+  case AArch64ISD::DUPLANE32:
+  case AArch64ISD::DUPLANE64:
     return performDUPCombine(N, DCI);
   case AArch64ISD::DUPLANE128:
     return performDupLane128Combine(N, DAG);
Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -137,6 +137,10 @@
    BinOpFrag<(extract_subvector (v8i16 (AArch64duplane16 (v8i16 node:$LHS), node:$RHS)), (i64 4))>;
 def extract_high_dup_v4i32 :
    BinOpFrag<(extract_subvector (v4i32 (AArch64duplane32 (v4i32 node:$LHS), node:$RHS)), (i64 2))>;
+def extract_low_dup_v8i16 :
+   BinOpFrag<(extract_subvector (v8i16 (AArch64duplane16 (v8i16 node:$LHS), node:$RHS)), (i64 0))>;
+def extract_low_dup_v4i32 :
+   BinOpFrag<(extract_subvector (v4i32 (AArch64duplane32 (v4i32 node:$LHS), node:$RHS)), (i64 0))>;
 
 //===----------------------------------------------------------------------===//
 // Asm Operand Classes.
@@ -9021,6 +9025,13 @@
     let Inst{11} = idx{1};
     let Inst{21} = idx{0};
   }
+
+  def : Pat<(v4i32 (OpNode (v4i16 V64:$Rn),
+                           (extract_low_dup_v8i16 (v8i16 V128:$Rm), VectorIndexS:$idx))),
+            (!cast<Instruction>(NAME # "v4i16_indexed") V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(v2i64 (OpNode (v2i32 V64:$Rn),
+                           (extract_low_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx))),
+            (!cast<Instruction>(NAME # "v2i32_indexed") V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
 }
 
 multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
@@ -9138,7 +9149,6 @@
 
 multiclass SIMDVectorIndexedLongSD<bit U, bits<4> opc, string asm,
                                    SDPatternOperator OpNode> {
-  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
   def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
                                       V128, V64,
                                       V128_lo, VectorIndexH,
                                       asm,
@@ -9150,6 +9160,9 @@
     let Inst{11} = idx{2};
     let Inst{21} = idx{1};
     let Inst{20} = idx{0};
+    let mayLoad = 0;
+    let mayStore = 0;
+    let hasSideEffects = 0;
   }
 
   def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
@@ -9164,6 +9177,9 @@
     let Inst{11} = idx{2};
     let Inst{21} = idx{1};
     let Inst{20} = idx{0};
+    let mayLoad = 0;
+    let mayStore = 0;
+    let hasSideEffects = 0;
   }
 
   def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
@@ -9176,6 +9192,9 @@
     bits<2> idx;
     let Inst{11} = idx{1};
     let Inst{21} = idx{0};
+    let mayLoad = 0;
+    let mayStore = 0;
+    let hasSideEffects = 0;
   }
 
   def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
@@ -9188,8 +9207,17 @@
     bits<2> idx;
     let Inst{11} = idx{1};
     let Inst{21} = idx{0};
+    let mayLoad = 0;
+    let mayStore = 0;
+    let hasSideEffects = 0;
   }
-  }
+
+  def : Pat<(v4i32 (OpNode (v4i16 V64:$Rn),
+                           (extract_low_dup_v8i16 (v8i16 V128:$Rm), VectorIndexS:$idx))),
+            (!cast<Instruction>(NAME # "v4i16_indexed") V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(v2i64 (OpNode (v2i32 V64:$Rn),
+                           (extract_low_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx))),
+            (!cast<Instruction>(NAME # "v2i32_indexed") V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
 }
 
 multiclass SIMDVectorIndexedLongSDTied<bit U, bits<4> opc, string asm,
Index: llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
===================================================================
--- llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -657,20 +657,19 @@
 define void @sink_v16s16_8(i32 *%p, i32 *%d, i64 %n, <16 x i8> %a) {
 ; CHECK-LABEL: sink_v16s16_8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    dup v1.8b, v0.b[10]
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    dup v0.16b, v0.b[10]
 ; CHECK-NEXT:  .LBB9_1: // %loop
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q2, [x0]
+; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    add x8, x8, #8
 ; CHECK-NEXT:    subs x2, x2, #8
-; CHECK-NEXT:    smull2 v3.8h, v2.16b, v0.16b
-; CHECK-NEXT:    smull v2.8h, v2.8b, v1.8b
-; CHECK-NEXT:    cmlt v3.8h, v3.8h, #0
+; CHECK-NEXT:    smull2 v2.8h, v1.16b, v0.16b
+; CHECK-NEXT:    smull v1.8h, v1.8b, v0.8b
 ; CHECK-NEXT:    cmlt v2.8h, v2.8h, #0
-; CHECK-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    str q2, [x0], #32
+; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
+; CHECK-NEXT:    uzp1 v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    str q1, [x0], #32
 ; CHECK-NEXT:    b.ne .LBB9_1
 ; CHECK-NEXT:    // %bb.2: // %exit
 ; CHECK-NEXT:    ret