diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9252,6 +9252,26 @@ if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) return SDValue(); + SDLoc DL(Op); + auto *CV = dyn_cast<ConstantFPSDNode>(Op.getOperand(1)); + if (CV && CV->isZero()) { + using RowTy = MVT[3]; + RowTy ConversionTbl[] = {{MVT::v4f16, MVT::v4i16, MVT::i32}, + {MVT::v8f16, MVT::v8i16, MVT::i32}, + {MVT::v2f32, MVT::v2i32, MVT::i32}, + {MVT::v4f32, MVT::v4i32, MVT::i32}, + {MVT::v2f64, MVT::v2i64, MVT::i64}}; + for (auto &KV : ConversionTbl) { + if (VT != KV[0]) + continue; + // When inserting 0.0 into a float vector, we can use the GPR variants of + // INS which use wzr/xzr to generate more compact code. + return DAG.getBitcast( + VT, DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, KV[1], + DAG.getBitcast(KV[1], Op.getOperand(0)), + DAG.getConstant(0, DL, KV[2]), Op.getOperand(2))); + } + } // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || @@ -9266,7 +9286,6 @@ // For V64 types, we perform insertion by expanding the value // to a V128 type and perform the insertion on that. 
- SDLoc DL(Op); SDValue WideVec = WidenVector(Op.getOperand(0), DAG); EVT WideTy = WideVec.getValueType(); diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll --- a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll @@ -60,28 +60,23 @@ ret <8 x i16> %vecinit5 } -; TODO: This should jsut be a mov.s v0[3], wzr define <4 x half> @test_insert_v4f16_f16_zero(<4 x half> %a) { ; CHECK-LABEL: test_insert_v4f16_f16_zero: ; CHECK: bb.0: -; CHECK-NEXT: adrp x8, .LCPI4_0 ; CHECK-NEXT: kill -; CHECK-NEXT: add x8, x8, :lo12:.LCPI4_0 -; CHECK-NEXT: ld1.h { v0 }[0], [x8] +; CHECK-NEXT: mov.h v0[2], wzr ; CHECK-NEXT: kill ; CHECK-NEXT: ret entry: - %vecinit5 = insertelement <4 x half> %a, half 0.000000e+00, i32 0 + %vecinit5 = insertelement <4 x half> %a, half 0.000000e+00, i32 2 ret <4 x half> %vecinit5 } define <8 x half> @test_insert_v8f16_f16_zero(<8 x half> %a) { ; CHECK-LABEL: test_insert_v8f16_f16_zero: ; CHECK: bb.0: -; CHECK-NEXT: adrp x8, .LCPI5_0 -; CHECK-NEXT: add x8, x8, :lo12:.LCPI5_0 -; CHECK-NEXT: ld1.h { v0 }[6], [x8] +; CHECK-NEXT: mov.h v0[6], wzr ; CHECK-NEXT: ret entry: @@ -93,8 +88,7 @@ ; CHECK-LABEL: test_insert_v2f32_f32_zero: ; CHECK: bb.0: ; CHECK-NEXT: // kill -; CHECK-NEXT: fmov s1, wzr -; CHECK-NEXT: mov.s v0[0], v1[0] +; CHECK-NEXT: mov.s v0[0], wzr ; CHECK-NEXT: // kill ; CHECK-NEXT: ret @@ -106,8 +100,7 @@ define <4 x float> @test_insert_v4f32_f32_zero(<4 x float> %a) { ; CHECK-LABEL: test_insert_v4f32_f32_zero: ; CHECK: bb.0: -; CHECK-NEXT: fmov s1, wzr -; CHECK-NEXT: mov.s v0[3], v1[0] +; CHECK-NEXT: mov.s v0[3], wzr ; CHECK-NEXT: ret entry: @@ -118,8 +111,7 @@ define <2 x double> @test_insert_v2f64_f64_zero(<2 x double> %a) { ; CHECK-LABEL: test_insert_v2f64_f64_zero: ; CHECK: bb.0: -; CHECK-NEXT: fmov d1, xzr -; CHECK-NEXT: mov.d v0[1], v1[0] +; CHECK-NEXT: mov.d v0[1], xzr ; CHECK-NEXT: ret entry: diff --git 
a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll @@ -47,8 +47,7 @@ define float @test_v3f32(<3 x float> %a) nounwind { ; CHECK-LABEL: test_v3f32: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov s1, wzr -; CHECK-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NEXT: mov v0.s[3], wzr ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: faddp s0, v0.2s