Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10357,18 +10357,6 @@
   return EltType.getSizeInBits() / 8;
 }
 
-/// NarrowVector - Given a value in the V128 register class, produce the
-/// equivalent value in the V64 register class.
-static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
-  EVT VT = V128Reg.getValueType();
-  unsigned WideSize = VT.getVectorNumElements();
-  MVT EltTy = VT.getVectorElementType().getSimpleVT();
-  MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
-  SDLoc DL(V128Reg);
-
-  return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
-}
-
 // Gather data to see if the operation can be modelled as a
 // shuffle in combination with VEXTs.
 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
@@ -12624,7 +12612,6 @@
         Subtarget->forceStreamingCompatibleSVE()))
     return LowerFixedLengthInsertVectorElt(Op, DAG);
 
-  // Check for non-constant or out of range lane.
   EVT VT = Op.getOperand(0).getValueType();
 
   if (VT.getScalarType() == MVT::i1) {
@@ -12643,31 +12630,12 @@
     return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
   }
 
+  // Check for non-constant or out of range lane.
   ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
   if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
     return SDValue();
 
-  // Insertion/extraction are legal for V128 types.
-  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
-      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
-      VT == MVT::v8f16 || VT == MVT::v8bf16)
-    return Op;
-
-  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
-      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
-      VT != MVT::v4bf16)
-    return SDValue();
-
-  // For V64 types, we perform insertion by expanding the value
-  // to a V128 type and perform the insertion on that.
-  SDLoc DL(Op);
-  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
-  EVT WideTy = WideVec.getValueType();
-
-  SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
-                             Op.getOperand(1), Op.getOperand(2));
-  // Re-narrow the resultant vector.
-  return NarrowVector(Node, DAG);
+  return Op;
 }
 
 SDValue
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5932,14 +5932,15 @@
               (i64 0)),
             dsub)>;
 
-def : Pat<(vector_insert (v8f16 v8f16:$Rn), (f16 fpimm0),
-          (i64 VectorIndexH:$imm)),
+def : Pat<(vector_insert (v8f16 V128:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)),
           (INSvi16gpr V128:$Rn, VectorIndexH:$imm, WZR)>;
-def : Pat<(vector_insert v4f32:$Rn, (f32 fpimm0),
-          (i64 VectorIndexS:$imm)),
+def : Pat<(vector_insert (v4f16 V64:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)),
+          (EXTRACT_SUBREG (INSvi16gpr (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexH:$imm, WZR), dsub)>;
+def : Pat<(vector_insert (v4f32 V128:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm)),
           (INSvi32gpr V128:$Rn, VectorIndexS:$imm, WZR)>;
-def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0),
-          (i64 VectorIndexD:$imm)),
+def : Pat<(vector_insert (v2f32 V64:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm)),
+          (EXTRACT_SUBREG (INSvi32gpr (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexS:$imm, WZR), dsub)>;
+def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0), (i64 VectorIndexD:$imm)),
           (INSvi64gpr V128:$Rn, VectorIndexS:$imm, XZR)>;
 
 def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
@@ -5988,6 +5989,22 @@
           (v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)),
           (i64 0))>;
 
+def : Pat<(v2i32 (vector_insert (v2i32 V64:$Rn), (i32 GPR32:$Rm), (i64 VectorIndexS:$imm))),
+          (EXTRACT_SUBREG
+            (INSvi32gpr (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+                        VectorIndexS:$imm, GPR32:$Rm),
+            dsub)>;
+def : Pat<(v4i16 (vector_insert (v4i16 V64:$Rn), (i32 GPR32:$Rm), (i64 VectorIndexH:$imm))),
+          (EXTRACT_SUBREG
+            (INSvi16gpr (v8i16 (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+                        VectorIndexH:$imm, GPR32:$Rm),
+            dsub)>;
+def : Pat<(v8i8 (vector_insert (v8i8 V64:$Rn), (i32 GPR32:$Rm), (i64 VectorIndexB:$imm))),
+          (EXTRACT_SUBREG
+            (INSvi8gpr (v16i8 (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+                       VectorIndexB:$imm, GPR32:$Rm),
+            dsub)>;
+
 // Copy an element at a constant index in one vector into a constant indexed
 // element of another.
 // FIXME refactor to a shared class/dev parameterized on vector type, vector
@@ -6051,10 +6068,20 @@
 defm : Neon_INS_elt_pattern;
 defm : Neon_INS_elt_pattern;
 
+defm : Neon_INS_elt_pattern;
+defm : Neon_INS_elt_pattern;
+defm : Neon_INS_elt_pattern;
+defm : Neon_INS_elt_pattern;
+
 // Insert from bitcast
 // vector_insert(bitcast(f32 src), n, lane) -> INSvi32lane(src, lane, INSERT_SUBREG(-, n), 0)
 def : Pat<(v4i32 (vector_insert v4i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))), imm:$Immd)),
           (INSvi32lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$Sn, ssub), 0)>;
+def : Pat<(v2i32 (vector_insert v2i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))), imm:$Immd)),
+          (EXTRACT_SUBREG
+            (INSvi32lane (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), V64:$src, dsub)),
+                         imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$Sn, ssub), 0),
+            dsub)>;
 def : Pat<(v2i64 (vector_insert v2i64:$src, (i64 (bitconvert (f64 FPR64:$Sn))), imm:$Immd)),
           (INSvi64lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$Sn, dsub), 0)>;
 
@@ -7283,12 +7310,22 @@
 // In this case, the index must be adjusted to match LD1 type.
 //
 class Ld1Lane128IdxOpPat<SDPatternOperator scalar_load, Operand
                         VecIndex, ValueType VTy, ValueType STy,
                         Instruction LD1, SDNodeXForm IdxOp>
   : Pat<(vector_insert (VTy VecListOne128:$Rd),
                        (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
         (LD1 VecListOne128:$Rd, (IdxOp VecIndex:$idx), GPR64sp:$Rn)>;
 
+class Ld1Lane64IdxOpPat<SDPatternOperator scalar_load, Operand
+                        VecIndex, ValueType VTy, ValueType STy,
+                        Instruction LD1, SDNodeXForm IdxOp>
+  : Pat<(vector_insert (VTy VecListOne64:$Rd),
+                       (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
+        (EXTRACT_SUBREG
+            (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub),
+                 (IdxOp VecIndex:$idx), GPR64sp:$Rn),
+            dsub)>;
+
 def VectorIndexStoH : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64);
 }]>;
@@ -7303,6 +7340,10 @@
 def : Ld1Lane128IdxOpPat;
 def : Ld1Lane128IdxOpPat;
 
+def : Ld1Lane64IdxOpPat;
+def : Ld1Lane64IdxOpPat;
+def : Ld1Lane64IdxOpPat;
+
 // Same as above, but the first element is populated using
 // scalar_to_vector + insert_subvector instead of insert_vector_elt.
 let Predicates = [NotInStreamingSVEMode] in {
Index: llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
===================================================================
--- llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
+++ llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
@@ -5,12 +5,12 @@
 ; CHECK-LABEL: vector_deinterleave_v2f16_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    dup v1.2s, v0.s[1]
-; CHECK-NEXT:    mov v2.16b, v0.16b
-; CHECK-NEXT:    mov v2.h[1], v1.h[0]
+; CHECK-NEXT:    dup v2.2s, v0.s[1]
+; CHECK-NEXT:    mov v1.16b, v2.16b
 ; CHECK-NEXT:    mov v1.h[0], v0.h[1]
+; CHECK-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT:    fmov d0, d2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
   %retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec)
   ret {<2 x half>, <2 x half>} %retval
Index: llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
===================================================================
--- llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -1021,8 +1021,8 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    mov v0.h[2], v1.h[1]
 ; CHECK-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-NEXT:    mov v0.h[2], v1.h[1]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
   %c = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
Index: llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
===================================================================
--- llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
+++ llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
@@ -106,44 +106,47 @@
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    smov w9, v0.h[0]
 ; CHECK-NEXT:    mov w8, #37253
-; CHECK-NEXT:    movk w8, #44150, lsl #16
 ; CHECK-NEXT:    smov w10, v0.h[1]
+; CHECK-NEXT:    movk w8, #44150, lsl #16
 ; CHECK-NEXT:    smov w11, v0.h[2]
-; CHECK-NEXT:    smov w12, v0.h[3]
-; CHECK-NEXT:    mov w14, #95
+; CHECK-NEXT:    smov w14, v0.h[3]
+; CHECK-NEXT:    mov w12, #95
 ; CHECK-NEXT:    smull x13, w9, w8
 ; CHECK-NEXT:    smull x15, w10, w8
 ; CHECK-NEXT:    lsr x13, x13, #32
 ; CHECK-NEXT:    smull x16, w11, w8
-; CHECK-NEXT:    add w13, w13, w9
 ; CHECK-NEXT:    lsr x15, x15, #32
-; CHECK-NEXT:    asr w17, w13, #6
-; CHECK-NEXT:    add w15, w15, w10
-; CHECK-NEXT:    add w13, w17, w13, lsr #31
-; CHECK-NEXT:    asr w17, w15, #6
-; CHECK-NEXT:    add w15, w17, w15, lsr #31
-; CHECK-NEXT:    smull x8, w12, w8
-; CHECK-NEXT:    msub w9, w13, w14, w9
+; CHECK-NEXT:    add w13, w13, w9
+; CHECK-NEXT:    add w10, w15, w10
+; CHECK-NEXT:    asr w15, w13, #6
+; CHECK-NEXT:    add w13, w15, w13, lsr #31
+; CHECK-NEXT:    umov w15, v0.h[1]
+; CHECK-NEXT:    smull x8, w14, w8
 ; CHECK-NEXT:    lsr x16, x16, #32
-; CHECK-NEXT:    add w16, w16, w11
-; CHECK-NEXT:    msub w10, w15, w14, w10
-; CHECK-NEXT:    asr w17, w16, #6
+; CHECK-NEXT:    add w11, w16, w11
+; CHECK-NEXT:    asr w16, w10, #6
+; CHECK-NEXT:    msub w9, w13, w12, w9
+; CHECK-NEXT:    add w10, w16, w10, lsr #31
 ; CHECK-NEXT:    lsr x8, x8, #32
-; CHECK-NEXT:    fmov s1, w13
-; CHECK-NEXT:    add w16, w17, w16, lsr #31
-; CHECK-NEXT:    fmov s0, w9
-; CHECK-NEXT:    add w8, w8, w12
-; CHECK-NEXT:    asr w9, w8, #6
-; CHECK-NEXT:    add w8, w9, w8, lsr #31
-; CHECK-NEXT:    msub w9, w16, w14, w11
+; CHECK-NEXT:    umov w16, v0.h[2]
+; CHECK-NEXT:    add w8, w8, w14
+; CHECK-NEXT:    asr w14, w11, #6
+; CHECK-NEXT:    add w11, w14, w11, lsr #31
+; CHECK-NEXT:    msub w14, w10, w12, w15
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    umov w9, v0.h[3]
+; CHECK-NEXT:    fmov s0, w13
+; CHECK-NEXT:    asr w13, w8, #6
+; CHECK-NEXT:    add w8, w13, w8, lsr #31
+; CHECK-NEXT:    msub w13, w11, w12, w16
+; CHECK-NEXT:    mov v1.h[1], w14
 ; CHECK-NEXT:    mov v0.h[1], w10
-; CHECK-NEXT:    mov v1.h[1], w15
-; CHECK-NEXT:    msub w10, w8, w14, w12
-; CHECK-NEXT:    mov v0.h[2], w9
-; CHECK-NEXT:    mov v1.h[2], w16
-; CHECK-NEXT:    mov v0.h[3], w10
-; CHECK-NEXT:    mov v1.h[3], w8
-; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    msub w9, w8, w12, w9
+; CHECK-NEXT:    mov v1.h[2], w13
+; CHECK-NEXT:    mov v0.h[2], w11
+; CHECK-NEXT:    mov v1.h[3], w9
+; CHECK-NEXT:    mov v0.h[3], w8
+; CHECK-NEXT:    add v0.4h, v1.4h, v0.4h
 ; CHECK-NEXT:    ret
   %1 = srem <4 x i16> %x,
   %2 = sdiv <4 x i16> %x,