diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10327,18 +10327,6 @@
   return EltType.getSizeInBits() / 8;
 }
 
-/// NarrowVector - Given a value in the V128 register class, produce the
-/// equivalent value in the V64 register class.
-static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
-  EVT VT = V128Reg.getValueType();
-  unsigned WideSize = VT.getVectorNumElements();
-  MVT EltTy = VT.getVectorElementType().getSimpleVT();
-  MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
-  SDLoc DL(V128Reg);
-
-  return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
-}
-
 // Gather data to see if the operation can be modelled as a
 // shuffle in combination with VEXTs.
 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
@@ -12594,7 +12582,6 @@
                                    Subtarget->forceStreamingCompatibleSVE()))
     return LowerFixedLengthInsertVectorElt(Op, DAG);
 
-  // Check for non-constant or out of range lane.
   EVT VT = Op.getOperand(0).getValueType();
 
   if (VT.getScalarType() == MVT::i1) {
@@ -12613,31 +12600,12 @@
     return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
   }
 
+  // Check for non-constant or out of range lane.
   ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
   if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
     return SDValue();
 
-  // Insertion/extraction are legal for V128 types.
-  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
-      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
-      VT == MVT::v8f16 || VT == MVT::v8bf16)
-    return Op;
-
-  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
-      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
-      VT != MVT::v4bf16)
-    return SDValue();
-
-  // For V64 types, we perform insertion by expanding the value
-  // to a V128 type and perform the insertion on that.
-  SDLoc DL(Op);
-  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
-  EVT WideTy = WideVec.getValueType();
-
-  SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
-                             Op.getOperand(1), Op.getOperand(2));
-  // Re-narrow the resultant vector.
-  return NarrowVector(Node, DAG);
+  return Op;
 }
 
 SDValue
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5932,14 +5932,15 @@
                             (i64 0)),
             dsub)>;
 
-def : Pat<(vector_insert (v8f16 v8f16:$Rn), (f16 fpimm0),
-            (i64 VectorIndexH:$imm)),
+def : Pat<(vector_insert (v8f16 V128:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)),
           (INSvi16gpr V128:$Rn, VectorIndexH:$imm, WZR)>;
-def : Pat<(vector_insert v4f32:$Rn, (f32 fpimm0),
-            (i64 VectorIndexS:$imm)),
+def : Pat<(vector_insert (v4f16 V64:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)),
+          (EXTRACT_SUBREG (INSvi16gpr (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexH:$imm, WZR), dsub)>;
+def : Pat<(vector_insert (v4f32 V128:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm)),
           (INSvi32gpr V128:$Rn, VectorIndexS:$imm, WZR)>;
-def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0),
-            (i64 VectorIndexD:$imm)),
+def : Pat<(vector_insert (v2f32 V64:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm)),
+          (EXTRACT_SUBREG (INSvi32gpr (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexS:$imm, WZR), dsub)>;
+def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0), (i64 VectorIndexD:$imm)),
           (INSvi64gpr V128:$Rn, VectorIndexS:$imm, XZR)>;
 
 def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
@@ -5988,6 +5989,22 @@
             (v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)),
             (i64 0))>;
 
+def : Pat<(v2i32 (vector_insert (v2i32 V64:$Rn), (i32 GPR32:$Rm), (i64 VectorIndexS:$imm))),
+          (EXTRACT_SUBREG
+            (INSvi32gpr (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+                        VectorIndexS:$imm, GPR32:$Rm),
+            dsub)>;
+def : Pat<(v4i16 (vector_insert (v4i16 V64:$Rn), (i32 GPR32:$Rm), (i64 VectorIndexH:$imm))),
+          (EXTRACT_SUBREG
+            (INSvi16gpr (v8i16 (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+                        VectorIndexH:$imm, GPR32:$Rm),
+            dsub)>;
+def : Pat<(v8i8 (vector_insert (v8i8 V64:$Rn), (i32 GPR32:$Rm), (i64 VectorIndexB:$imm))),
+          (EXTRACT_SUBREG
+            (INSvi8gpr (v16i8 (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+                       VectorIndexB:$imm, GPR32:$Rm),
+            dsub)>;
+
 // Copy an element at a constant index in one vector into a constant indexed
 // element of another.
 // FIXME refactor to a shared class/dev parameterized on vector type, vector
@@ -6051,10 +6068,20 @@
 
 defm : Neon_INS_elt_pattern;
 defm : Neon_INS_elt_pattern;
+defm : Neon_INS_elt_pattern;
+defm : Neon_INS_elt_pattern;
+defm : Neon_INS_elt_pattern;
+defm : Neon_INS_elt_pattern;
+
 // Insert from bitcast
 // vector_insert(bitcast(f32 src), n, lane) -> INSvi32lane(src, lane, INSERT_SUBREG(-, n), 0)
 def : Pat<(v4i32 (vector_insert v4i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))), imm:$Immd)),
           (INSvi32lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$Sn, ssub), 0)>;
+def : Pat<(v2i32 (vector_insert v2i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))), imm:$Immd)),
+          (EXTRACT_SUBREG
+            (INSvi32lane (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), V64:$src, dsub)),
+                         imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$Sn, ssub), 0),
+            dsub)>;
 def : Pat<(v2i64 (vector_insert v2i64:$src, (i64 (bitconvert (f64 FPR64:$Sn))), imm:$Immd)),
           (INSvi64lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$Sn, dsub), 0)>;
@@ -7283,12 +7310,22 @@
 // In this case, the index must be adjusted to match LD1 type.
 //
 class Ld1Lane128IdxOpPat<SDPatternOperator scalar_load, Operand
                          VecIndex, ValueType VTy, ValueType STy,
                          Instruction LD1, SDNodeXForm IdxOp>
   : Pat<(vector_insert (VTy VecListOne128:$Rd),
                        (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
         (LD1 VecListOne128:$Rd, (IdxOp VecIndex:$idx), GPR64sp:$Rn)>;
 
+class Ld1Lane64IdxOpPat<SDPatternOperator scalar_load, Operand
+                        VecIndex, ValueType VTy, ValueType STy,
+                        Instruction LD1, SDNodeXForm IdxOp>
+  : Pat<(vector_insert (VTy VecListOne64:$Rd),
+                       (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
+        (EXTRACT_SUBREG
+          (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub),
+               (IdxOp VecIndex:$idx), GPR64sp:$Rn),
+          dsub)>;
+
 def VectorIndexStoH : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
 }]>;
@@ -7303,6 +7340,10 @@
 def : Ld1Lane128IdxOpPat;
 def : Ld1Lane128IdxOpPat;
+def : Ld1Lane64IdxOpPat;
+def : Ld1Lane64IdxOpPat;
+def : Ld1Lane64IdxOpPat;
+
 // Same as above, but the first element is populated using
 // scalar_to_vector + insert_subvector instead of insert_vector_elt.
 let Predicates = [NotInStreamingSVEMode] in {
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
--- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
@@ -5,12 +5,12 @@
 ; CHECK-LABEL: vector_deinterleave_v2f16_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    dup v1.2s, v0.s[1]
-; CHECK-NEXT:    mov v2.16b, v0.16b
-; CHECK-NEXT:    mov v2.h[1], v1.h[0]
+; CHECK-NEXT:    dup v2.2s, v0.s[1]
+; CHECK-NEXT:    mov v1.16b, v2.16b
 ; CHECK-NEXT:    mov v1.h[0], v0.h[1]
+; CHECK-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT:    fmov d0, d2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
   %retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec)
   ret {<2 x half>, <2 x half>} %retval
diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -1021,8 +1021,8 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    mov v0.h[2], v1.h[1]
 ; CHECK-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-NEXT:    mov v0.h[2], v1.h[1]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
   %c = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>