Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9722,12 +9722,13 @@ DAG.getConstant(NumElems, dl, MVT::i64)); } -static bool isEssentiallyExtractSubvector(SDValue N) { - if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR) - return true; - - return N.getOpcode() == ISD::BITCAST && - N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR; +static bool isEssentiallyExtractHighSubvector(SDValue N) { + if (N.getOpcode() == ISD::BITCAST) + N = N.getOperand(0); + if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR) + return false; + return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() == + N.getOperand(0).getValueType().getVectorNumElements() / 2; } /// Helper structure to keep track of ISD::SET_CC operands. @@ -9894,13 +9895,13 @@ // It's not worth doing if at least one of the inputs isn't already an // extract, but we don't know which it'll be so we have to try both. - if (isEssentiallyExtractSubvector(LHS.getOperand(0))) { + if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) { RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); if (!RHS.getNode()) return SDValue(); RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); - } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) { + } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) { LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); if (!LHS.getNode()) return SDValue(); @@ -9933,11 +9934,11 @@ // Either node could be a DUP, but it's not worth doing both of them (you'd // just as well use the non-high version) so look for a corresponding extract // operation on the other "wing". 
- if (isEssentiallyExtractSubvector(LHS)) { + if (isEssentiallyExtractHighSubvector(LHS)) { RHS = tryExtendDUPToExtractHigh(RHS, DAG); if (!RHS.getNode()) return SDValue(); - } else if (isEssentiallyExtractSubvector(RHS)) { + } else if (isEssentiallyExtractHighSubvector(RHS)) { LHS = tryExtendDUPToExtractHigh(LHS, DAG); if (!LHS.getNode()) return SDValue(); Index: test/CodeGen/AArch64/arm64-vabs.ll =================================================================== --- test/CodeGen/AArch64/arm64-vabs.ll +++ test/CodeGen/AArch64/arm64-vabs.ll @@ -885,6 +885,20 @@ define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { ; CHECK-LABEL: uabdl_from_extract_dup: ; CHECK-NOT: ext.16b +; CHECK: uabdl.2d + %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 + %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 + + %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + + %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind + %res1 = zext <2 x i32> %res to <2 x i64> + ret <2 x i64> %res1 +} + +define <2 x i64> @uabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { +; CHECK-LABEL: uabdl2_from_extract_dup: +; CHECK-NOT: ext.16b ; CHECK: uabdl2.2d %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 @@ -899,6 +913,20 @@ define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { ; CHECK-LABEL: sabdl_from_extract_dup: ; CHECK-NOT: ext.16b +; CHECK: sabdl.2d + %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 + %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 + + %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + + %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind + %res1 = zext <2 x i32> %res to <2 x i64> + ret <2 x i64> %res1 +} + +define <2 x i64> @sabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { +; 
CHECK-LABEL: sabdl2_from_extract_dup: +; CHECK-NOT: ext.16b ; CHECK: sabdl2.2d %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 Index: test/CodeGen/AArch64/arm64-vadd.ll =================================================================== --- test/CodeGen/AArch64/arm64-vadd.ll +++ test/CodeGen/AArch64/arm64-vadd.ll @@ -738,6 +738,22 @@ declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone +define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) { +; CHECK-LABEL: uaddl_duprhs +; CHECK-NOT: ext.16b +; CHECK: uaddl.2d + %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 + %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 + + %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + + %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64> + %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64> + + %res = add <2 x i64> %lhs.ext, %rhs.ext + ret <2 x i64> %res +} + define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) { ; CHECK-LABEL: uaddl2_duprhs ; CHECK-NOT: ext.16b @@ -754,6 +770,22 @@ ret <2 x i64> %res } +define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs) { +; CHECK-LABEL: saddl_duplhs +; CHECK-NOT: ext.16b +; CHECK: saddl.2d + %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0 + %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1 + + %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + + %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64> + %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64> + + %res = add <2 x i64> %lhs.ext, %rhs.ext + ret <2 x i64> %res +} + define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) { ; CHECK-LABEL: saddl2_duplhs ; CHECK-NOT: ext.16b @@ -770,6 +802,22 @@ ret <2 x i64> %res } +define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) { +; CHECK-LABEL: 
usubl_duprhs +; CHECK-NOT: ext.16b +; CHECK: usubl.2d + %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 + %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 + + %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + + %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64> + %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64> + + %res = sub <2 x i64> %lhs.ext, %rhs.ext + ret <2 x i64> %res +} + define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) { ; CHECK-LABEL: usubl2_duprhs ; CHECK-NOT: ext.16b @@ -786,8 +834,24 @@ ret <2 x i64> %res } +define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs) { +; CHECK-LABEL: ssubl_duplhs: +; CHECK-NOT: ext.16b +; CHECK: ssubl.2d + %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0 + %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1 + + %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + + %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64> + %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64> + + %res = sub <2 x i64> %lhs.ext, %rhs.ext + ret <2 x i64> %res +} + define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) { -; CHECK-LABEL: ssubl2_duplhs +; CHECK-LABEL: ssubl2_duplhs: ; CHECK-NOT: ext.16b ; CHECK: ssubl2.2d %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0 Index: test/CodeGen/AArch64/arm64-vmul.ll =================================================================== --- test/CodeGen/AArch64/arm64-vmul.ll +++ test/CodeGen/AArch64/arm64-vmul.ll @@ -1338,6 +1338,19 @@ ret <4 x i32> %vmull2.i } +define <4 x i32> @foo6a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: foo6a: +; CHECK-NEXT: smull.4s v0, v1, v2[1] +; CHECK-NEXT: ret +entry: + %0 = bitcast <8 x i16> %b to <2 x i64> + %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0> + %1 = bitcast <1 x i64> %shuffle.i to <4 x i16> + %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> + %vmull2.i = tail call <4 x i32> 
@llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind + ret <4 x i32> %vmull2.i +} + define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: foo7: ; CHECK-NEXT: smull2.2d v0, v1, v2[1] @@ -1351,6 +1364,20 @@ ret <2 x i64> %vmull2.i } +define <2 x i64> @foo7a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: foo7a: +; CHECK-NEXT: smull.2d v0, v1, v2[1] +; CHECK-NEXT: ret +entry: + %0 = bitcast <4 x i32> %b to <2 x i64> + %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0> + %1 = bitcast <1 x i64> %shuffle.i to <2 x i32> + %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind + ret <2 x i64> %vmull2.i +} + + define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: foo8: ; CHECK-NEXT: umull2.4s v0, v1, v2[1] @@ -1364,6 +1391,19 @@ ret <4 x i32> %vmull2.i } +define <4 x i32> @foo8a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: foo8a: +; CHECK-NEXT: umull.4s v0, v1, v2[1] +; CHECK-NEXT: ret +entry: + %0 = bitcast <8 x i16> %b to <2 x i64> + %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0> + %1 = bitcast <1 x i64> %shuffle.i to <4 x i16> + %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> + %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind + ret <4 x i32> %vmull2.i +} + define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: foo9: ; CHECK-NEXT: umull2.2d v0, v1, v2[1] @@ -1377,6 +1417,19 @@ ret <2 x i64> %vmull2.i } +define <2 x i64> @foo9a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: foo9a: +; CHECK-NEXT: umull.2d v0, v1, v2[1] +; 
CHECK-NEXT: ret +entry: + %0 = bitcast <4 x i32> %b to <2 x i64> + %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0> + %1 = bitcast <1 x i64> %shuffle.i to <2 x i32> + %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind + ret <2 x i64> %vmull2.i +} + define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind { ; CHECK-LABEL: bar0: ; CHECK: smlal2.8h v0, v1, v2 @@ -1667,6 +1720,24 @@ ret <2 x i64> %vmull2.i } +define <4 x i32> @vmull_low_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp { +entry: +; CHECK: vmull_low_n_s16_test +; CHECK-NOT: ext +; CHECK: smull.4s +; CHECK-NEXT: ret + %conv = trunc i32 %d to i16 + %0 = bitcast <8 x i16> %b to <2 x i64> + %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0> + %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16> + %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0 + %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1 + %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2 + %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3 + %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind + ret <4 x i32> %vmull2.i.i +} + define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp { entry: ; CHECK: vmull_high_n_s16_test @@ -1804,8 +1875,21 @@ ret <2 x i64> %sum } -define <2 x i64> @mull_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { -; CHECK-LABEL: mull_from_extract_dup: +define <2 x i64> @mull_from_extract_dup_low(<4 x i32> %lhs, i32 %rhs) { +; CHECK-LABEL: mull_from_extract_dup_low: +; CHECK-NOT: ext +; CHECK: sqdmull.2d + %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 + %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 + + 
%lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + + %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind + ret <2 x i64> %res +} + +define <2 x i64> @mull_from_extract_dup_high(<4 x i32> %lhs, i32 %rhs) { +; CHECK-LABEL: mull_from_extract_dup_high: ; CHECK-NOT: ext ; CHECK: sqdmull2.2d %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 @@ -1817,8 +1901,21 @@ ret <2 x i64> %res } -define <8 x i16> @pmull_from_extract_dup(<16 x i8> %lhs, i8 %rhs) { -; CHECK-LABEL: pmull_from_extract_dup: +define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) { +; CHECK-LABEL: pmull_from_extract_dup_low: +; CHECK-NOT: ext +; CHECK: pmull.8h + %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0 + %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + + %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + + %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind + ret <8 x i16> %res +} + +define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) { +; CHECK-LABEL: pmull_from_extract_dup_high: ; CHECK-NOT: ext ; CHECK: pmull2.8h %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0 @@ -1830,8 +1927,20 @@ ret <8 x i16> %res } -define <8 x i16> @pmull_from_extract_duplane(<16 x i8> %lhs, <8 x i8> %rhs) { -; CHECK-LABEL: pmull_from_extract_duplane: +define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs) { +; CHECK-LABEL: pmull_from_extract_duplane_low: +; CHECK-NOT: ext +; CHECK: pmull.8h + + %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + + %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind + ret <8 x i16> %res +} + +define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) { +; CHECK-LABEL: 
pmull_from_extract_duplane_high: ; CHECK-NOT: ext ; CHECK: pmull2.8h @@ -1842,8 +1951,20 @@ ret <8 x i16> %res } -define <2 x i64> @sqdmull_from_extract_duplane(<4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK-LABEL: sqdmull_from_extract_duplane: +define <2 x i64> @sqdmull_from_extract_duplane_low(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK-LABEL: sqdmull_from_extract_duplane_low: +; CHECK-NOT: ext +; CHECK: sqdmull.2d + + %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0> + + %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind + ret <2 x i64> %res +} + +define <2 x i64> @sqdmull_from_extract_duplane_high(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK-LABEL: sqdmull_from_extract_duplane_high: ; CHECK-NOT: ext ; CHECK: sqdmull2.2d @@ -1854,8 +1975,21 @@ ret <2 x i64> %res } -define <2 x i64> @sqdmlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK-LABEL: sqdmlal_from_extract_duplane: +define <2 x i64> @sqdmlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK-LABEL: sqdmlal_from_extract_duplane_low: +; CHECK-NOT: ext +; CHECK: sqdmlal.2d + + %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0> + + %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind + %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res) + ret <2 x i64> %sum +} + +define <2 x i64> @sqdmlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK-LABEL: sqdmlal_from_extract_duplane_high: ; CHECK-NOT: ext ; CHECK: sqdmlal2.2d @@ -1867,8 +2001,21 @@ ret <2 x i64> %sum } -define <2 x i64> @umlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { -; CHECK-LABEL: 
umlal_from_extract_duplane: +define <2 x i64> @umlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK-LABEL: umlal_from_extract_duplane_low: +; CHECK-NOT: ext +; CHECK: umlal.2d + + %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0> + + %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind + %sum = add <2 x i64> %accum, %res + ret <2 x i64> %sum +} + +define <2 x i64> @umlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK-LABEL: umlal_from_extract_duplane_high: ; CHECK-NOT: ext ; CHECK: umlal2.2d