diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22107,15 +22107,19 @@
   // 128bit vector version.
   if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
     EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
-    if (SDNode *LN = DCI.DAG.getNodeIfExists(
-            N->getOpcode(), DCI.DAG.getVTList(LVT), {N->getOperand(0)})) {
+    SmallVector<SDValue> Ops(N->ops());
+    if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
+                                             DCI.DAG.getVTList(LVT), Ops)) {
       SDLoc DL(N);
       return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
                              DCI.DAG.getConstant(0, DL, MVT::i64));
     }
   }
 
-  return performPostLD1Combine(N, DCI, false);
+  if (N->getOpcode() == AArch64ISD::DUP)
+    return performPostLD1Combine(N, DCI, false);
+
+  return SDValue();
 }
 
 /// Get rid of unnecessary NVCASTs (that don't change the type).
@@ -23043,6 +23047,10 @@
   case AArch64ISD::CSEL:
     return performCSELCombine(N, DCI, DAG);
   case AArch64ISD::DUP:
+  case AArch64ISD::DUPLANE8:
+  case AArch64ISD::DUPLANE16:
+  case AArch64ISD::DUPLANE32:
+  case AArch64ISD::DUPLANE64:
     return performDUPCombine(N, DCI);
   case AArch64ISD::DUPLANE128:
     return performDupLane128Combine(N, DAG);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -138,6 +138,23 @@
 def extract_high_dup_v4i32 :
    BinOpFrag<(extract_subvector (v4i32 (AArch64duplane32 (v4i32 node:$LHS), node:$RHS)), (i64 2))>;
 
+def dup_v8i16 :
+    PatFrags<(ops node:$LHS, node:$RHS),
+             [(v4i16 (extract_subvector (v8i16 (AArch64duplane16 (v8i16 node:$LHS), node:$RHS)), (i64 0))),
+              (v4i16 (AArch64duplane16 (v8i16 node:$LHS), node:$RHS))]>;
+def dup_v4i32 :
+    PatFrags<(ops node:$LHS, node:$RHS),
+             [(v2i32 (extract_subvector (v4i32 (AArch64duplane32 (v4i32 node:$LHS), node:$RHS)), (i64 0))),
+              (v2i32 (AArch64duplane32 (v4i32 node:$LHS), node:$RHS))]>;
+def dup_v8f16 :
+    PatFrags<(ops node:$LHS, node:$RHS),
+             [(v4f16 (extract_subvector (v8f16 (AArch64duplane16 (v8f16 node:$LHS), node:$RHS)), (i64 0))),
+              (v4f16 (AArch64duplane16 (v8f16 node:$LHS), node:$RHS))]>;
+def dup_v4f32 :
+    PatFrags<(ops node:$LHS, node:$RHS),
+             [(v2f32 (extract_subvector (v4f32 (AArch64duplane32 (v4f32 node:$LHS), node:$RHS)), (i64 0))),
+              (v2f32 (AArch64duplane32 (v4f32 node:$LHS), node:$RHS))]>;
+
 //===----------------------------------------------------------------------===//
 // Asm Operand Classes.
 //
@@ -8443,7 +8460,7 @@
                                            asm, ".4h", ".4h", ".4h", ".h",
     [(set (v4f16 V64:$Rd),
         (OpNode (v4f16 V64:$Rn),
-         (v4f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+         (dup_v8f16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx)))]> {
     bits<3> idx;
     let Inst{11} = idx{2};
     let Inst{21} = idx{1};
@@ -8470,7 +8487,7 @@
                                            asm, ".2s", ".2s", ".2s", ".s",
     [(set (v2f32 V64:$Rd),
         (OpNode (v2f32 V64:$Rn),
-         (v2f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> {
+         (dup_v4f32 (v4f32 V128:$Rm), VectorIndexS:$idx)))]> {
     bits<2> idx;
     let Inst{11} = idx{1};
     let Inst{21} = idx{0};
@@ -8781,7 +8798,7 @@
                                            asm, ".4h", ".4h", ".4h", ".h",
     [(set (v4i16 V64:$Rd),
         (OpNode (v4i16 V64:$Rn),
-         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+         (dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> {
     bits<3> idx;
     let Inst{11} = idx{2};
     let Inst{21} = idx{1};
@@ -8807,7 +8824,7 @@
                                            asm, ".2s", ".2s", ".2s", ".s",
     [(set (v2i32 V64:$Rd),
         (OpNode (v2i32 V64:$Rn),
-         (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+         (dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> {
     bits<2> idx;
     let Inst{11} = idx{1};
     let Inst{21} = idx{0};
@@ -8855,7 +8872,7 @@
                                            asm, ".4h", ".4h", ".4h", ".h",
     [(set (v4i16 V64:$Rd),
         (OpNode (v4i16 V64:$Rn),
-         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+         (dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> {
     bits<3> idx;
     let Inst{11} = idx{2};
     let Inst{21} = idx{1};
@@ -8881,7 +8898,7 @@
                                            asm, ".2s", ".2s", ".2s", ".s",
     [(set (v2i32 V64:$Rd),
         (OpNode (v2i32 V64:$Rn),
-         (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+         (dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> {
     bits<2> idx;
     let Inst{11} = idx{1};
     let Inst{21} = idx{0};
@@ -8907,7 +8924,7 @@
                                            asm, ".4h", ".4h", ".4h", ".h",
     [(set (v4i16 V64:$dst),
         (OpNode (v4i16 V64:$Rd),(v4i16 V64:$Rn),
-         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+         (dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> {
     bits<3> idx;
     let Inst{11} = idx{2};
     let Inst{21} = idx{1};
@@ -8933,7 +8950,7 @@
                                            asm, ".2s", ".2s", ".2s", ".s",
     [(set (v2i32 V64:$dst),
         (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
-         (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+         (dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> {
     bits<2> idx;
     let Inst{11} = idx{1};
     let Inst{21} = idx{0};
@@ -8960,7 +8977,7 @@
                                            asm, ".4s", ".4s", ".4h", ".h",
     [(set (v4i32 V128:$Rd),
         (OpNode (v4i16 V64:$Rn),
-         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+         (dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> {
     bits<3> idx;
     let Inst{11} = idx{2};
     let Inst{21} = idx{1};
@@ -8987,7 +9004,7 @@
                                            asm, ".2d", ".2d", ".2s", ".s",
     [(set (v2i64 V128:$Rd),
         (OpNode (v2i32 V64:$Rn),
-         (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+         (dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> {
     bits<2> idx;
     let Inst{11} = idx{1};
     let Inst{21} = idx{0};
@@ -9033,8 +9050,8 @@
           (Accum (v4i32 V128:$Rd),
                  (v4i32 (int_aarch64_neon_sqdmull
                             (v4i16 V64:$Rn),
-                            (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
-                                                      VectorIndexH:$idx))))))]> {
+                            (dup_v8i16 (v8i16 V128_lo:$Rm),
+                                       VectorIndexH:$idx)))))]> {
     bits<3> idx;
     let Inst{11} = idx{2};
     let Inst{21} = idx{1};
@@ -9064,8 +9081,7 @@
           (Accum (v2i64 V128:$Rd),
                  (v2i64 (int_aarch64_neon_sqdmull
                             (v2i32 V64:$Rn),
-                            (v2i32 (AArch64duplane32 (v4i32 V128:$Rm),
-                                                      VectorIndexS:$idx))))))]> {
+                            (dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))))]> {
     bits<2> idx;
     let Inst{11} = idx{1};
     let Inst{21} = idx{0};
@@ -9110,9 +9126,8 @@
             (i32 (vector_extract
                        (v4i32 (int_aarch64_neon_sqdmull
                                   (v4i16 V64:$Rn),
-                                  (v4i16 (AArch64duplane16
-                                             (v8i16 V128_lo:$Rm),
-                                             VectorIndexH:$idx)))),
+                                  (dup_v8i16 (v8i16 V128_lo:$Rm),
+                                             VectorIndexH:$idx))),
                        (i64 0))))),
             (!cast<Instruction>(NAME # v1i32_indexed)
                        FPR32Op:$Rd,
@@ -9145,7 +9160,7 @@
                                            asm, ".4s", ".4s", ".4h", ".h",
     [(set (v4i32 V128:$Rd),
         (OpNode (v4i16 V64:$Rn),
-         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+         (dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> {
     bits<3> idx;
     let Inst{11} = idx{2};
     let Inst{21} = idx{1};
@@ -9172,7 +9187,7 @@
                                            asm, ".2d", ".2d", ".2s", ".s",
     [(set (v2i64 V128:$Rd),
         (OpNode (v2i32 V64:$Rn),
-         (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+         (dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> {
     bits<2> idx;
     let Inst{11} = idx{1};
     let Inst{21} = idx{0};
@@ -9201,7 +9216,7 @@
                                            asm, ".4s", ".4s", ".4h", ".h",
     [(set (v4i32 V128:$dst),
         (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn),
-         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+         (dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> {
     bits<3> idx;
     let Inst{11} = idx{2};
     let Inst{21} = idx{1};
@@ -9228,7 +9243,7 @@
                                            asm, ".2d", ".2d", ".2s", ".s",
     [(set (v2i64 V128:$dst),
         (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn),
-         (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+         (dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> {
     bits<2> idx;
     let Inst{11} = idx{1};
     let Inst{21} = idx{0};
@@ -10850,8 +10865,8 @@
                                            asm, ".4h", ".4h", ".4h", ".h",
     [(set (v4i16 V64:$dst),
           (v4i16 (op (v4i16 V64:$Rd), (v4i16 V64:$Rn),
-                     (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
-                                               VectorIndexH:$idx)))))]> {
+                     (dup_v8i16 (v8i16 V128_lo:$Rm),
+                                VectorIndexH:$idx))))]> {
     bits<3> idx;
     let Inst{11} = idx{2};
     let Inst{21} = idx{1};
@@ -10876,8 +10891,7 @@
                                            asm, ".2s", ".2s", ".2s", ".s",
     [(set (v2i32 V64:$dst),
           (v2i32 (op (v2i32 V64:$Rd), (v2i32 V64:$Rn),
-                     (v2i32 (AArch64duplane32 (v4i32 V128:$Rm),
-                                               VectorIndexS:$idx)))))]> {
+                     (dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
     bits<2> idx;
     let Inst{11} = idx{1};
     let Inst{21} = idx{0};
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -657,20 +657,19 @@
 define void @sink_v16s16_8(i32 *%p, i32 *%d, i64 %n, <16 x i8> %a) {
 ; CHECK-LABEL: sink_v16s16_8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    dup v1.8b, v0.b[10]
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    dup v0.16b, v0.b[10]
 ; CHECK-NEXT:  .LBB9_1: // %loop
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q2, [x0]
+; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    add x8, x8, #8
 ; CHECK-NEXT:    subs x2, x2, #8
-; CHECK-NEXT:    smull2 v3.8h, v2.16b, v0.16b
-; CHECK-NEXT:    smull v2.8h, v2.8b, v1.8b
-; CHECK-NEXT:    cmlt v3.8h, v3.8h, #0
+; CHECK-NEXT:    smull2 v2.8h, v1.16b, v0.16b
+; CHECK-NEXT:    smull v1.8h, v1.8b, v0.8b
 ; CHECK-NEXT:    cmlt v2.8h, v2.8h, #0
-; CHECK-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    str q2, [x0], #32
+; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
+; CHECK-NEXT:    uzp1 v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    str q1, [x0], #32
 ; CHECK-NEXT:    b.ne .LBB9_1
 ; CHECK-NEXT:  // %bb.2: // %exit
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/duplane-index-patfrags.ll b/llvm/test/CodeGen/AArch64/duplane-index-patfrags.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/duplane-index-patfrags.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+fullfp16 < %s | FileCheck %s
+
+define <8 x half> @sel.v8f16.fmul(ptr %p, ptr %q, <8 x half> %a, <8 x half> %b, <4 x half> %c) {
+; CHECK-LABEL: sel.v8f16.fmul:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmul v1.8h, v1.8h, v0.h[0]
+; CHECK-NEXT:    fmul v2.4h, v2.4h, v0.h[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    str d2, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> zeroinitializer
+  %splat2 = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> zeroinitializer
+
+  %r = fmul <8 x half> %b, %splat
+  %r2 = fmul <4 x half> %c, %splat2
+  store <4 x half> %r2, ptr %p
+  ret <8 x half> %r
+}
+
+define <4 x float> @sel.v4f32.fmul(ptr %p, ptr %q, <4 x float> %a, <4 x float> %b, <2 x float> %c) {
+; CHECK-LABEL: sel.v4f32.fmul:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmul v1.4s, v1.4s, v0.s[0]
+; CHECK-NEXT:    fmul v2.2s, v2.2s, v0.s[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    str d2, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <4 x float> %a, <4 x float> poison, <4 x i32> zeroinitializer
+  %splat2 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> zeroinitializer
+
+  %r = fmul <4 x float> %b, %splat
+  %r2 = fmul <2 x float> %c, %splat2
+  store <2 x float> %r2, ptr %p
+  ret <4 x float> %r
+}
+
+define <8 x i16> @sel.v8i16.mul(ptr %p, ptr %q, <8 x i16> %a, <8 x i16> %b, <4 x i16> %c) {
+; CHECK-LABEL: sel.v8i16.mul:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul v1.8h, v1.8h, v0.h[0]
+; CHECK-NEXT:    mul v2.4h, v2.4h, v0.h[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    str d2, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> zeroinitializer
+  %splat2 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> zeroinitializer
+
+  %r = mul <8 x i16> %b, %splat
+  %r2 = mul <4 x i16> %c, %splat2
+  store <4 x i16> %r2, ptr %p
+  ret <8 x i16> %r
+}
+
+define <4 x i32> @sel.v4i32.mul(ptr %p, ptr %q, <4 x i32> %a, <4 x i32> %b, <2 x i32> %c) {
+; CHECK-LABEL: sel.v4i32.mul:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul v1.4s, v1.4s, v0.s[0]
+; CHECK-NEXT:    mul v2.2s, v2.2s, v0.s[0]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    str d2, [x0]
+; CHECK-NEXT:    ret
+  %splat = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> zeroinitializer
+  %splat2 = shufflevector <4 x i32> %a, <4 x i32> poison, <2 x i32> zeroinitializer
+
+  %r = mul <4 x i32> %b, %splat
+  %r2 = mul <2 x i32> %c, %splat2
+  store <2 x i32> %r2, ptr %p
+  ret <4 x i32> %r
+}
+
+define <4 x i64> @sel.v4i32.smull(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) {
+; CHECK-LABEL: sel.v4i32.smull:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smull2 v2.2d, v1.4s, v0.s[0]
+; CHECK-NEXT:    smull v0.2d, v1.2s, v0.s[0]
+; CHECK-NEXT:    mov v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %ext = sext <4 x i32> %a to <4 x i64>
+  %splat = shufflevector <4 x i64> %ext, <4 x i64> poison, <4 x i32> zeroinitializer
+  %d = sext <4 x i32> %b to <4 x i64>
+  %r = mul <4 x i64> %d, %splat
+  ret <4 x i64> %r
+}
+
+define <4 x i64> @sel.v4i32.umull(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) {
+; CHECK-LABEL: sel.v4i32.umull:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umull2 v2.2d, v1.4s, v0.s[0]
+; CHECK-NEXT:    umull v0.2d, v1.2s, v0.s[0]
+; CHECK-NEXT:    mov v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %ext = zext <4 x i32> %a to <4 x i64>
+  %splat = shufflevector <4 x i64> %ext, <4 x i64> poison, <4 x i32> zeroinitializer
+  %d = zext <4 x i32> %b to <4 x i64>
+  %r = mul <4 x i64> %d, %splat
+  ret <4 x i64> %r
+}
+
+define <4 x i32> @sel.v4i32.sqdmull(<8 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: sel.v4i32.sqdmull:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    sqdmull v2.4s, v0.4h, v1.h[0]
+; CHECK-NEXT:    sqdmlal2 v2.4s, v0.8h, v1.h[0]
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %c = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %d = shufflevector <4 x i16> %b, <4 x i16> poison, <4 x i32> zeroinitializer
+  %e = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %c, <4 x i16> %d)
+  %f = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %g = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %f, <4 x i16> %d)
+  %h = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %e, <4 x i32> %g)
+  ret <4 x i32> %h
+}
+
+declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)