Index: llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
+++ llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
@@ -6336,6 +6336,10 @@
   let Inst{19} = lane{0};
 }
 
+// f16 lanes are 16 bits wide (lane index 0-3 in a D register), so use the 16-bit VDUP.
+def : Pat<(v4f16 (NEONvduplane (v4f16 DPR:$Vm), imm:$lane)),
+          (VDUPLN16d DPR:$Vm, imm:$lane)>;
+
 def : Pat<(v2f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)),
           (VDUPLN32d DPR:$Vm, imm:$lane)>;
 
@@ -6350,6 +6354,10 @@
           (v8i16 (VDUPLN16q (v4i16 (EXTRACT_SUBREG QPR:$src,
                                   (DSubReg_i16_reg imm:$lane))),
                            (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v8f16 (NEONvduplane (v8f16 QPR:$src), imm:$lane)),
+          (v8f16 (VDUPLN16q (v4f16 (EXTRACT_SUBREG QPR:$src,
+                                  (DSubReg_i16_reg imm:$lane))),
+                           (SubReg_i16_lane imm:$lane)))>;
 def : Pat<(v4i32 (NEONvduplane (v4i32 QPR:$src), imm:$lane)),
           (v4i32 (VDUPLN32q (v2i32 (EXTRACT_SUBREG QPR:$src,
                                   (DSubReg_i32_reg imm:$lane))),
@@ -6359,12 +6367,18 @@
                                   (DSubReg_i32_reg imm:$lane))),
                            (SubReg_i32_lane imm:$lane)))>;
 
+def : Pat<(v4f16 (NEONvdup HPR:$src)),
+          (v4f16 (VDUPLN16d (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)),
+                             HPR:$src, ssub_0), (i32 0)))>;
 def : Pat<(v2f32 (NEONvdup (f32 SPR:$src))),
           (v2f32 (VDUPLN32d (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
                              SPR:$src, ssub_0), (i32 0)))>;
 def : Pat<(v4f32 (NEONvdup (f32 SPR:$src))),
           (v4f32 (VDUPLN32q (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
                              SPR:$src, ssub_0), (i32 0)))>;
+def : Pat<(v8f16 (NEONvdup HPR:$src)),
+          (v8f16 (VDUPLN16q (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)),
+                             HPR:$src, ssub_0), (i32 0)))>;
 
 // VMOVN    : Vector Narrowing Move
 defm VMOVN    : N2VN_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVN,
Index: llvm/trunk/test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll
+++ llvm/trunk/test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll
@@ -1120,58 +1120,78 @@
 ;  %.fca.0.1.insert = insertvalue %struct.float16x8x2_t %.fca.0.0.insert, <8 x half> %vtrn1.i, 0, 1
 ;  ret %struct.float16x8x2_t %.fca.0.1.insert
 ;}
-;
-;define dso_local <4 x half> @test_vmov_n_f16(float %a.coerce) {
-;entry:
-;  %0 = bitcast float %a.coerce to i32
-;  %tmp.0.extract.trunc = trunc i32 %0 to i16
-;  %1 = bitcast i16 %tmp.0.extract.trunc to half
-;  %vecinit = insertelement <4 x half> undef, half %1, i32 0
-;  %vecinit4 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer
-;  ret <4 x half> %vecinit4
-;}
-;
-;define dso_local <8 x half> @test_vmovq_n_f16(float %a.coerce) {
-;entry:
-;  %0 = bitcast float %a.coerce to i32
-;  %tmp.0.extract.trunc = trunc i32 %0 to i16
-;  %1 = bitcast i16 %tmp.0.extract.trunc to half
-;  %vecinit = insertelement <8 x half> undef, half %1, i32 0
-;  %vecinit8 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer
-;  ret <8 x half> %vecinit8
-;}
-;
-;define dso_local <4 x half> @test_vdup_n_f16(float %a.coerce) {
-;entry:
-;  %0 = bitcast float %a.coerce to i32
-;  %tmp.0.extract.trunc = trunc i32 %0 to i16
-;  %1 = bitcast i16 %tmp.0.extract.trunc to half
-;  %vecinit = insertelement <4 x half> undef, half %1, i32 0
-;  %vecinit4 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer
-;  ret <4 x half> %vecinit4
-;}
-;
-;define dso_local <8 x half> @test_vdupq_n_f16(float %a.coerce) {
-;entry:
-;  %0 = bitcast float %a.coerce to i32
-;  %tmp.0.extract.trunc = trunc i32 %0 to i16
-;  %1 = bitcast i16 %tmp.0.extract.trunc to half
-;  %vecinit = insertelement <8 x half> undef, half %1, i32 0
-;  %vecinit8 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer
-;  ret <8 x half> %vecinit8
-;}
-;
-;define dso_local <4 x half> @test_vdup_lane_f16(<4 x half> %a) {
-;entry:
-;  %shuffle = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-;  ret <4 x half> %shuffle
-;}
-;
-;define dso_local <8 x half> @test_vdupq_lane_f16(<4 x half> %a) {
-;entry:
-;  %shuffle = shufflevector <4 x half> %a, <4 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-;  ret <8 x half> %shuffle
-;}
+
+define dso_local <4 x half> @test_vmov_n_f16(float %a.coerce) {
+; CHECK-LABEL: test_vmov_n_f16:
+; CHECK:         vdup.16 d0, d0[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %vecinit = insertelement <4 x half> undef, half %1, i32 0
+  %vecinit4 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer
+  ret <4 x half> %vecinit4
+}
+
+define dso_local <8 x half> @test_vmovq_n_f16(float %a.coerce) {
+; CHECK-LABEL: test_vmovq_n_f16:
+; CHECK:         vdup.16 q0, d0[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %vecinit = insertelement <8 x half> undef, half %1, i32 0
+  %vecinit8 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer
+  ret <8 x half> %vecinit8
+}
+
+define dso_local <4 x half> @test_vdup_n_f16(float %a.coerce) {
+; CHECK-LABEL: test_vdup_n_f16:
+; CHECK:         vdup.16 d0, d0[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %vecinit = insertelement <4 x half> undef, half %1, i32 0
+  %vecinit4 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer
+  ret <4 x half> %vecinit4
+}
+
+define dso_local <8 x half> @test_vdupq_n_f16(float %a.coerce) {
+; CHECK-LABEL: test_vdupq_n_f16:
+; CHECK:         vdup.16 q0, d0[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to half
+  %vecinit = insertelement <8 x half> undef, half %1, i32 0
+  %vecinit8 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer
+  ret <8 x half> %vecinit8
+}
+
+define dso_local <4 x half> @test_vdup_lane_f16(<4 x half> %a) {
+; CHECK-LABEL: test_vdup_lane_f16:
+; CHECK:         vdup.16 d0, d0[3]
+; CHECK-NEXT:    bx lr
+entry:
+  %shuffle = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x half> %shuffle
+}
+
+define dso_local <8 x half> @test_vdupq_lane_f16(<4 x half> %a) {
+; CHECK-LABEL: test_vdupq_lane_f16:
+; CHECK:         vdup.16 q0, d0[3]
+; CHECK-NEXT:    bx lr
+entry:
+  %shuffle = shufflevector <4 x half> %a, <4 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ret <8 x half> %shuffle
+}
+
+; FIXME (PR38404)
 ;
 ;define dso_local <4 x half> @test_vext_f16(<4 x half> %a, <4 x half> %b) {
 ;entry: