Index: llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
+++ llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
@@ -1117,6 +1117,12 @@
 def VLD1LNq16Pseudo : VLD1QLNPseudo<v8i16, extloadi16>;
 def VLD1LNq32Pseudo : VLD1QLNPseudo<v4i32, load>;
 
+def : Pat<(vector_insert (v4f16 DPR:$src),
+                         (f16 (load addrmode6:$addr)), imm:$lane),
+          (VLD1LNd16 addrmode6:$addr, DPR:$src, imm:$lane)>;
+def : Pat<(vector_insert (v8f16 QPR:$src),
+                         (f16 (load addrmode6:$addr)), imm:$lane),
+          (VLD1LNq16Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
 def : Pat<(vector_insert (v2f32 DPR:$src),
                          (f32 (load addrmode6:$addr)), imm:$lane),
           (VLD1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>;
@@ -2175,6 +2181,11 @@
 def : Pat<(store (extractelt (v4f32 QPR:$src), imm:$lane), addrmode6:$addr),
           (VST1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
 
+def : Pat<(store (extractelt (v4f16 DPR:$src), imm:$lane), addrmode6:$addr),
+          (VST1LNd16 addrmode6:$addr, DPR:$src, imm:$lane)>;
+def : Pat<(store (extractelt (v8f16 QPR:$src), imm:$lane), addrmode6:$addr),
+          (VST1LNq16Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
+
 // ...with address register writeback:
 class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
                PatFrag StoreOp, SDNode ExtractOp, Operand AdrMode>
@@ -2504,6 +2515,13 @@
                                    MVT::i32);
 }]>;
 
+// Extract S sub-registers of Q/D registers containing a given f16 lane.
+def SSubReg_f16_reg : SDNodeXForm<imm, [{
+  assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue()/2, SDLoc(N),
+                                   MVT::i32);
+}]>;
+
 // Translate lane numbers from Q registers to D subregs.
 def SubReg_i8_lane : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() & 7, SDLoc(N), MVT::i32);
@@ -6223,6 +6241,32 @@
 def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2),
           (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
 
+def imm_even : ImmLeaf<i32, [{ return ((unsigned)Imm) % 2 == 0; }]>;
+def imm_odd : ImmLeaf<i32, [{ return ((unsigned)Imm) % 2 == 1; }]>;
+
+def : Pat<(extractelt (v4f16 DPR:$src), imm_even:$lane),
+          (EXTRACT_SUBREG
+            (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)),
+            (SSubReg_f16_reg imm_even:$lane))>;
+
+def : Pat<(extractelt (v4f16 DPR:$src), imm_odd:$lane),
+          (COPY_TO_REGCLASS
+            (VMOVH (EXTRACT_SUBREG
+              (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)),
+              (SSubReg_f16_reg imm_odd:$lane))),
+            HPR)>;
+
+def : Pat<(extractelt (v8f16 QPR:$src), imm_even:$lane),
+          (EXTRACT_SUBREG
+            (v4f32 (COPY_TO_REGCLASS (v8f16 QPR:$src), QPR_VFP2)),
+            (SSubReg_f16_reg imm_even:$lane))>;
+
+def : Pat<(extractelt (v8f16 QPR:$src), imm_odd:$lane),
+          (COPY_TO_REGCLASS
+            (VMOVH (EXTRACT_SUBREG
+              (v4f32 (COPY_TO_REGCLASS (v8f16 QPR:$src), QPR_VFP2)),
+              (SSubReg_f16_reg imm_odd:$lane))),
+            HPR)>;
 
 // VMOV : Vector Set Lane (move ARM core register to scalar)
 
@@ -6281,6 +6325,15 @@
 def : Pat<(insertelt (v4f32 QPR:$src1), SPR:$src2, imm:$src3),
           (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2)),
                          SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
+def : Pat<(insertelt (v4f16 DPR:$src1), HPR:$src2, imm:$lane),
+          (v4f16 (VSETLNi16 DPR:$src1, (VMOVRH $src2), imm:$lane))>;
+def : Pat<(insertelt (v8f16 QPR:$src1), HPR:$src2, imm:$lane),
+          (v8f16 (INSERT_SUBREG QPR:$src1,
+                   (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
+                                       (DSubReg_i16_reg imm:$lane))),
+                     (VMOVRH $src2), (SubReg_i16_lane imm:$lane))),
+                   (DSubReg_i16_reg imm:$lane)))>;
+
 //def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
 //          (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
 def : Pat<(v2f64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
Index: llvm/trunk/test/CodeGen/ARM/fp16-insert-extract.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/fp16-insert-extract.ll
+++ llvm/trunk/test/CodeGen/ARM/fp16-insert-extract.ll
@@ -0,0 +1,72 @@
+; RUN: llc -mtriple=arm-eabi -mattr=+armv8.2-a,+fullfp16,+neon -float-abi=hard -O1 < %s | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+armv8.2-a,+fullfp16,+neon -float-abi=soft -O1 < %s | FileCheck %s
+
+define float @test_vget_lane_f16_1(<4 x half> %a) nounwind {
+; CHECK-LABEL: test_vget_lane_f16_1:
+; CHECK: vmovx.f16 s0, s0
+; CHECK-NEXT: vcvtb.f32.f16 s0, s0
+entry:
+  %elt = extractelement <4 x half> %a, i32 1
+  %conv = fpext half %elt to float
+  ret float %conv
+}
+
+define float @test_vget_lane_f16_2(<4 x half> %a) nounwind {
+; CHECK-LABEL: test_vget_lane_f16_2:
+; CHECK-NOT: vmovx.f16
+; CHECK: vcvtb.f32.f16 s0, s1
+entry:
+  %elt = extractelement <4 x half> %a, i32 2
+  %conv = fpext half %elt to float
+  ret float %conv
+}
+
+define float @test_vget_laneq_f16_6(<8 x half> %a) nounwind {
+; CHECK-LABEL: test_vget_laneq_f16_6:
+; CHECK-NOT: vmovx.f16
+; CHECK: vcvtb.f32.f16 s0, s3
+entry:
+  %elt = extractelement <8 x half> %a, i32 6
+  %conv = fpext half %elt to float
+  ret float %conv
+}
+
+define float @test_vget_laneq_f16_7(<8 x half> %a) nounwind {
+; CHECK-LABEL: test_vget_laneq_f16_7:
+; CHECK: vmovx.f16 s0, s3
+; CHECK: vcvtb.f32.f16 s0, s0
+entry:
+  %elt = extractelement <8 x half> %a, i32 7
+  %conv = fpext half %elt to float
+  ret float %conv
+}
+
+define <4 x half> @test_vset_lane_f16(<4 x half> %a, float %fb) nounwind {
+; CHECK-LABEL: test_vset_lane_f16:
+; CHECK: vmov.f16 r[[GPR:[0-9]+]], s{{[0-9]+}}
+; CHECK: vmov.16 d{{[0-9]+}}[3], r[[GPR]]
+entry:
+  %b = fptrunc float %fb to half
+  %x = insertelement <4 x half> %a, half %b, i32 3
+  ret <4 x half> %x
+}
+
+define <8 x half> @test_vset_laneq_f16_1(<8 x half> %a, float %fb) nounwind {
+; CHECK-LABEL: test_vset_laneq_f16_1:
+; CHECK: vmov.f16 r[[GPR:[0-9]+]], s{{[0-9]+}}
+; CHECK: vmov.16 d{{[0-9]+}}[1], r[[GPR]]
+entry:
+  %b = fptrunc float %fb to half
+  %x = insertelement <8 x half> %a, half %b, i32 1
+  ret <8 x half> %x
+}
+
+define <8 x half> @test_vset_laneq_f16_7(<8 x half> %a, float %fb) nounwind {
+; CHECK-LABEL: test_vset_laneq_f16_7:
+; CHECK: vmov.f16 r[[GPR:[0-9]+]], s{{[0-9]+}}
+; CHECK: vmov.16 d{{[0-9]+}}[3], r[[GPR]]
+entry:
+  %b = fptrunc float %fb to half
+  %x = insertelement <8 x half> %a, half %b, i32 7
+  ret <8 x half> %x
+}
Index: llvm/trunk/test/CodeGen/ARM/fp16-vldlane-vstlane.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/fp16-vldlane-vstlane.ll
+++ llvm/trunk/test/CodeGen/ARM/fp16-vldlane-vstlane.ll
@@ -0,0 +1,56 @@
+; RUN: llc -mtriple=arm-eabi -mattr=+armv8.2-a,+fullfp16,+neon -float-abi=hard -O1 < %s | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+armv8.2-a,+fullfp16,+neon -float-abi=soft -O1 < %s | FileCheck %s
+
+define <4 x half> @vld1d_lane_f16(half* %pa, <4 x half> %v4) nounwind {
+; CHECK-LABEL: vld1d_lane_f16:
+; CHECK: vld1.16 {d{{[0-9]+}}[3]}, [r0:16]
+entry:
+  %a = load half, half* %pa
+  %res = insertelement <4 x half> %v4, half %a, i32 3
+  ret <4 x half> %res
+}
+
+define <8 x half> @vld1q_lane_f16_1(half* %pa, <8 x half> %v8) nounwind {
+; CHECK-LABEL: vld1q_lane_f16_1:
+; CHECK: vld1.16 {d{{[0-9]+}}[1]}, [r0:16]
+entry:
+  %a = load half, half* %pa
+  %res = insertelement <8 x half> %v8, half %a, i32 1
+  ret <8 x half> %res
+}
+
+define <8 x half> @vld1q_lane_f16_7(half* %pa, <8 x half> %v8) nounwind {
+; CHECK-LABEL: vld1q_lane_f16_7:
+; CHECK: vld1.16 {d{{[0-9]+}}[3]}, [r0:16]
+entry:
+  %a = load half, half* %pa
+  %res = insertelement <8 x half> %v8, half %a, i32 7
+  ret <8 x half> %res
+}
+
+define void @vst1d_lane_f16(half* %pa, <4 x half> %v4) nounwind {
+; CHECK-LABEL: vst1d_lane_f16:
+; CHECK: vst1.16 {d{{[0-9]+}}[3]}, [r0:16]
+entry:
+  %a = extractelement <4 x half> %v4, i32 3
+  store half %a, half* %pa
+  ret void
+}
+
+define void @vst1q_lane_f16_7(half* %pa, <8 x half> %v8) nounwind {
+; CHECK-LABEL: vst1q_lane_f16_7:
+; CHECK: vst1.16 {d{{[0-9]+}}[3]}, [r0:16]
+entry:
+  %a = extractelement <8 x half> %v8, i32 7
+  store half %a, half* %pa
+  ret void
+}
+
+define void @vst1q_lane_f16_1(half* %pa, <8 x half> %v8) nounwind {
+; CHECK-LABEL: vst1q_lane_f16_1:
+; CHECK: vst1.16 {d{{[0-9]+}}[1]}, [r0:16]
+entry:
+  %a = extractelement <8 x half> %v8, i32 1
+  store half %a, half* %pa
+  ret void
+}