diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -159,6 +159,22 @@ return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift); } + /* Matches an EXTRACT_SUBVECTOR that takes the high 64-bit half of a 128-bit vector, optionally looking through one BITCAST on little-endian subtargets. On success, Res is set to the 128-bit source vector and true is returned; otherwise false is returned and Res is left unmodified. */ bool SelectExtractHigh(SDValue N, SDValue &Res) { + if (Subtarget->isLittleEndian() && N->getOpcode() == ISD::BITCAST) + N = N->getOperand(0); /* continue matching on the node underneath the bitcast */ + /* NOTE(review): `isa` carries no template argument in this text; presumably it was lost in extraction (operand 1 is read as a constant below) - confirm against upstream. */ if (N->getOpcode() != ISD::EXTRACT_SUBVECTOR || + !isa(N->getOperand(1))) + return false; + EVT VT = N->getValueType(0); + EVT LVT = N->getOperand(0).getValueType(); + unsigned Index = N->getConstantOperandVal(1); + /* Accept only a 64-bit result taken from a 128-bit source, starting at the element index equal to the result's element count. */ if (!VT.is64BitVector() || !LVT.is128BitVector() || + Index != VT.getVectorNumElements()) + return false; + Res = N->getOperand(0); + return true; + } + bool SelectDupZeroOrUndef(SDValue N) { switch(N->getOpcode()) { case ISD::UNDEF: diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -109,15 +109,19 @@ class BinOpFrag : PatFrag<(ops node:$LHS, node:$RHS), res>; class UnOpFrag : PatFrag<(ops node:$LHS), res>; -// Helper fragment for an extract of the high portion of a 128-bit vector. +// Helper fragment for an extract of the high portion of a 128-bit vector. The +// ComplexPattern matches both extract_subvector and bitcast(extract_subvector(..)). 
def extract_high_v16i8 : - UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>; + ComplexPattern; def extract_high_v8i16 : - UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>; + ComplexPattern; def extract_high_v4i32 : - UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>; -def extract_high_v2i64 : - UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>; + ComplexPattern; + +def extract_high_dup_v8i16 : + BinOpFrag<(extract_subvector (v8i16 (AArch64duplane16 (v8i16 node:$LHS), node:$RHS)), (i64 4))>; +def extract_high_dup_v4i32 : + BinOpFrag<(extract_subvector (v4i32 (AArch64duplane32 (v4i32 node:$LHS), node:$RHS)), (i64 2))>; //===----------------------------------------------------------------------===// // Asm Operand Classes. @@ -6509,8 +6513,8 @@ asm#"2", ".1q", ".2d", ".2d", []>; } - def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)), - (v8i8 (extract_high_v16i8 V128:$Rm)))), + def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn))), + (v8i8 (extract_high_v16i8 (v16i8 V128:$Rm))))), (!cast(NAME#"v16i8") V128:$Rn, V128:$Rm)>; } @@ -6523,8 +6527,8 @@ def v8i16_v4i32 : BaseSIMDDifferentThreeVector; + [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_v8i16 (v8i16 V128:$Rm))))]>; def v2i32_v2i64 : BaseSIMDDifferentThreeVector; + [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_v4i32 (v4i32 V128:$Rm))))]>; } multiclass SIMDLongThreeVectorBHSabdl opc, string asm, @@ -6547,8 +6551,8 @@ V128, V128, V128, asm#"2", ".8h", ".16b", ".16b", [(set (v8i16 V128:$Rd), - (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn), - (extract_high_v16i8 V128:$Rm)))))]>; + (zext (v8i8 (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)), + (extract_high_v16i8 (v16i8 V128:$Rm))))))]>; def v4i16_v4i32 : BaseSIMDDifferentThreeVector; + (zext (v4i16 (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_v8i16 (v8i16 V128:$Rm))))))]>; def v2i32_v2i64 : 
BaseSIMDDifferentThreeVector; + (zext (v2i32 (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_v4i32 (v4i32 V128:$Rm))))))]>; } multiclass SIMDLongThreeVectorTiedBHSabal opc, @@ -6587,8 +6591,8 @@ asm#"2", ".8h", ".16b", ".16b", [(set (v8i16 V128:$dst), (add (v8i16 V128:$Rd), - (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn), - (extract_high_v16i8 V128:$Rm))))))]>; + (zext (v8i8 (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)), + (extract_high_v16i8 (v16i8 V128:$Rm)))))))]>; def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + (zext (v4i16 (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_v8i16 (v8i16 V128:$Rm)))))))]>; def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; + (zext (v2i32 (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_v4i32 (v4i32 V128:$Rm)))))))]>; } multiclass SIMDLongThreeVectorBHS opc, string asm, @@ -6626,8 +6630,8 @@ def v16i8_v8i16 : BaseSIMDDifferentThreeVector; + [(set (v8i16 V128:$Rd), (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)), + (extract_high_v16i8 (v16i8 V128:$Rm))))]>; def v4i16_v4i32 : BaseSIMDDifferentThreeVector; + [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_v8i16 (v8i16 V128:$Rm))))]>; def v2i32_v2i64 : BaseSIMDDifferentThreeVector; + [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_v4i32 (v4i32 V128:$Rm))))]>; } multiclass SIMDLongThreeVectorTiedBHS opc, @@ -6661,8 +6665,8 @@ asm#"2", ".8h", ".16b", ".16b", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), - (extract_high_v16i8 V128:$Rn), - (extract_high_v16i8 V128:$Rm)))]>; + (extract_high_v16i8 (v16i8 V128:$Rn)), + (extract_high_v16i8 (v16i8 V128:$Rm))))]>; def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_v8i16 (v8i16 V128:$Rm))))]>; def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; + (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_v4i32 (v4i32 V128:$Rm))))]>; } multiclass 
SIMDLongThreeVectorSQDMLXTiedHS opc, string asm, @@ -6703,8 +6707,8 @@ asm#"2", ".4s", ".8h", ".8h", [(set (v4i32 V128:$dst), (Accum (v4i32 V128:$Rd), - (v4i32 (int_aarch64_neon_sqdmull (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 V128:$Rm)))))]>; + (v4i32 (int_aarch64_neon_sqdmull (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_v8i16 (v8i16 V128:$Rm))))))]>; def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; + (v2i64 (int_aarch64_neon_sqdmull (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_v4i32 (v4i32 V128:$Rm))))))]>; } multiclass SIMDWideThreeVectorBHS opc, string asm, @@ -6731,7 +6735,7 @@ V128, V128, V128, asm#"2", ".8h", ".8h", ".16b", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), - (extract_high_v16i8 V128:$Rm)))]>; + (extract_high_v16i8 (v16i8 V128:$Rm))))]>; def v4i16_v4i32 : BaseSIMDDifferentThreeVector; + (extract_high_v8i16 (v8i16 V128:$Rm))))]>; def v2i32_v2i64 : BaseSIMDDifferentThreeVector; + (extract_high_v4i32 (v4i32 V128:$Rm))))]>; } //---------------------------------------------------------------------------- @@ -8768,9 +8772,8 @@ V128_lo, VectorIndexH, asm#"2", ".4s", ".4s", ".8h", ".h", [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { + (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> { bits<3> idx; let Inst{11} = idx{2}; @@ -8795,9 +8798,8 @@ V128, VectorIndexS, asm#"2", ".2d", ".2d", ".4s", ".s", [(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { + (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; @@ -8860,10 +8862,8 @@ [(set (v4i32 V128:$dst), (Accum (v4i32 V128:$Rd), (v4i32 (int_aarch64_neon_sqdmull - (extract_high_v8i16 
V128:$Rn), - (extract_high_v8i16 - (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))))]> { + (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; @@ -8892,10 +8892,8 @@ [(set (v2i64 V128:$dst), (Accum (v2i64 V128:$Rd), (v2i64 (int_aarch64_neon_sqdmull - (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 - (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { + (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; @@ -8948,9 +8946,8 @@ V128_lo, VectorIndexH, asm#"2", ".4s", ".4s", ".8h", ".h", [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { + (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> { bits<3> idx; let Inst{11} = idx{2}; @@ -8975,9 +8972,8 @@ V128, VectorIndexS, asm#"2", ".2d", ".2d", ".4s", ".s", [(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { + (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; @@ -9007,9 +9003,8 @@ asm#"2", ".4s", ".4s", ".8h", ".h", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), - (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { + (extract_high_v8i16 (v8i16 V128:$Rn)), + (extract_high_dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; @@ -9034,9 +9029,8 @@ asm#"2", ".2d", ".2d", ".4s", ".s", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), - 
(extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { + (extract_high_v4i32 (v4i32 V128:$Rn)), + (extract_high_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; @@ -9721,7 +9715,7 @@ V128, V128, vecshiftL8, asm#"2", ".8h", ".16b", [(set (v8i16 V128:$Rd), - (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> { + (OpNode (extract_high_v16i8 (v16i8 V128:$Rn)), vecshiftL8:$imm))]> { bits<3> imm; let Inst{18-16} = imm; } @@ -9737,7 +9731,7 @@ V128, V128, vecshiftL16, asm#"2", ".4s", ".8h", [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> { + (OpNode (extract_high_v8i16 (v8i16 V128:$Rn)), vecshiftL16:$imm))]> { bits<4> imm; let Inst{19-16} = imm; @@ -9754,7 +9748,7 @@ V128, V128, vecshiftL32, asm#"2", ".2d", ".4s", [(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> { + (OpNode (extract_high_v4i32 (v4i32 V128:$Rn)), vecshiftL32:$imm))]> { bits<5> imm; let Inst{20-16} = imm; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4250,25 +4250,25 @@ (zext (v8i8 V64:$opB))), (AArch64vashr v8i16:$src, (i32 15))))), (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; -def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 V128:$opA)), - (zext (extract_high_v16i8 V128:$opB))))), +def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 (v16i8 V128:$opA))), + (zext (extract_high_v16i8 (v16i8 V128:$opB)))))), (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), - (v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)), - (zext (extract_high_v16i8 V128:$opB))), + (v8i16 (add (sub (zext (extract_high_v16i8 (v16i8 V128:$opA))), + (zext (extract_high_v16i8 (v16i8 V128:$opB)))), (AArch64vashr v8i16:$src, 
(i32 15))))), (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; def : Pat<(abs (v4i32 (sub (zext (v4i16 V64:$opA)), (zext (v4i16 V64:$opB))))), (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>; -def : Pat<(abs (v4i32 (sub (zext (extract_high_v8i16 V128:$opA)), - (zext (extract_high_v8i16 V128:$opB))))), +def : Pat<(abs (v4i32 (sub (zext (extract_high_v8i16 (v8i16 V128:$opA))), + (zext (extract_high_v8i16 (v8i16 V128:$opB)))))), (UABDLv8i16_v4i32 V128:$opA, V128:$opB)>; def : Pat<(abs (v2i64 (sub (zext (v2i32 V64:$opA)), (zext (v2i32 V64:$opB))))), (UABDLv2i32_v2i64 V64:$opA, V64:$opB)>; -def : Pat<(abs (v2i64 (sub (zext (extract_high_v4i32 V128:$opA)), - (zext (extract_high_v4i32 V128:$opB))))), +def : Pat<(abs (v2i64 (sub (zext (extract_high_v4i32 (v4i32 V128:$opA))), + (zext (extract_high_v4i32 (v4i32 V128:$opB)))))), (UABDLv4i32_v2i64 V128:$opA, V128:$opB)>; defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", abs>; @@ -4439,15 +4439,15 @@ multiclass SIMDVectorLShiftLongBySizeBHSPats { def : Pat<(AArch64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)), (SHLLv8i8 V64:$Rn)>; - def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)), + def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 (v16i8 V128:$Rn)))), (i32 8)), (SHLLv16i8 V128:$Rn)>; def : Pat<(AArch64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)), (SHLLv4i16 V64:$Rn)>; - def : Pat<(AArch64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)), + def : Pat<(AArch64vshl (v4i32 (ext (extract_high_v8i16 (v8i16 V128:$Rn)))), (i32 16)), (SHLLv8i16 V128:$Rn)>; def : Pat<(AArch64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)), (SHLLv2i32 V64:$Rn)>; - def : Pat<(AArch64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)), + def : Pat<(AArch64vshl (v2i64 (ext (extract_high_v4i32 (v4i32 V128:$Rn)))), (i32 32)), (SHLLv4i32 V128:$Rn)>; } diff --git a/llvm/test/CodeGen/AArch64/highextractbitcast.ll b/llvm/test/CodeGen/AArch64/highextractbitcast.ll --- a/llvm/test/CodeGen/AArch64/highextractbitcast.ll +++ 
b/llvm/test/CodeGen/AArch64/highextractbitcast.ll @@ -37,9 +37,7 @@ define <4 x i32> @test_smull_high_s16_bitcasta1(<2 x i64> %aa, <8 x i16> %b) #0 { ; CHECK-LE-LABEL: test_smull_high_s16_bitcasta1: ; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-LE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-LE-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test_smull_high_s16_bitcasta1: @@ -63,9 +61,7 @@ define <4 x i32> @test_smull_high_s16_bitcastb1(<8 x i16> %a, <16 x i8> %bb) #0 { ; CHECK-LE-LABEL: test_smull_high_s16_bitcastb1: ; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-LE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-LE-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test_smull_high_s16_bitcastb1: @@ -89,9 +85,7 @@ define <4 x i32> @test_smull_high_s16_bitcasta2(<2 x i64> %a, <8 x i16> %b) #0 { ; CHECK-LE-LABEL: test_smull_high_s16_bitcasta2: ; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-LE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-LE-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test_smull_high_s16_bitcasta2: @@ -117,9 +111,7 @@ define <4 x i32> @test_smull_high_s16_bitcastb2(<8 x i16> %a, <16 x i8> %b) #0 { ; CHECK-LE-LABEL: test_smull_high_s16_bitcastb2: ; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-LE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-LE-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test_smull_high_s16_bitcastb2: @@ -374,9 +366,7 @@ define <4 x i32> @test_umull_high_s16_bitcasta1(<2 x i64> %aa, <8 x i16> %b) #0 { ; CHECK-LE-LABEL: test_umull_high_s16_bitcasta1: ; CHECK-LE: // %bb.0: // %entry -; 
CHECK-LE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-LE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-LE-NEXT: umull v0.4s, v0.4h, v1.4h +; CHECK-LE-NEXT: umull2 v0.4s, v0.8h, v1.8h ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test_umull_high_s16_bitcasta1: @@ -400,9 +390,7 @@ define <8 x i16> @test_vabdl_high_u82(<16 x i8> %a, <8 x i16> %bb) { ; CHECK-LE-LABEL: test_vabdl_high_u82: ; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-LE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-LE-NEXT: uabdl v0.8h, v0.8b, v1.8b +; CHECK-LE-NEXT: uabdl2 v0.8h, v0.16b, v1.16b ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test_vabdl_high_u82: @@ -427,9 +415,7 @@ define <8 x i16> @test_vabdl_high_s82(<16 x i8> %a, <8 x i16> %bb) { ; CHECK-LE-LABEL: test_vabdl_high_s82: ; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-LE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-LE-NEXT: sabdl v0.8h, v0.8b, v1.8b +; CHECK-LE-NEXT: sabdl2 v0.8h, v0.16b, v1.16b ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test_vabdl_high_s82: @@ -454,9 +440,7 @@ define <4 x i32> @test_vqdmlal_high_s16_bitcast(<4 x i32> %a, <8 x i16> %b, <16 x i8> %cc) { ; CHECK-LE-LABEL: test_vqdmlal_high_s16_bitcast: ; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-LE-NEXT: ext v2.16b, v2.16b, v2.16b, #8 -; CHECK-LE-NEXT: sqdmlal v0.4s, v1.4h, v2.4h +; CHECK-LE-NEXT: sqdmlal2 v0.4s, v1.8h, v2.8h ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test_vqdmlal_high_s16_bitcast: @@ -510,9 +494,7 @@ define <8 x i16> @test_pmull_high_p8_64(<2 x i64> %aa, <2 x i64> %bb) { ; CHECK-LE-LABEL: test_pmull_high_p8_64: ; CHECK-LE: // %bb.0: // %entry -; CHECK-LE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-LE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-LE-NEXT: pmull v0.8h, v0.8b, v1.8b +; CHECK-LE-NEXT: pmull2 v0.8h, v0.16b, v1.16b ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test_pmull_high_p8_64: @@ -568,8 +550,7 @@ define <2 x i64> 
@hadd32_zext_asr(<16 x i8> %src1a) { ; CHECK-LE-LABEL: hadd32_zext_asr: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #1 +; CHECK-LE-NEXT: ushll2 v0.2d, v0.4s, #1 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: hadd32_zext_asr: