Skip to content

Commit 08da01b

Browse files
committedJun 4, 2019
[ARM] Add FP16 vector insert/extract patterns
This change adds two FP16 extraction and two insertion patterns (one per possible vector length).

Extractions are handled by copying a Q/D register into a register of one of the VFP2 register classes, where single FP32 sub-registers can be accessed. The extraction of an even lane is then a simple sub-register extraction (because we don't care about the top parts of registers for FP16 operations). Odd lanes need an additional VMOVX instruction.

Unfortunately, insertions cannot be handled in the same way, because:
* There is no instruction to insert FP16 into an even lane (VINS only works with odd lanes)
* The patterns for odd lanes would have the form of a DAG (not a tree), and would not be implementable in pure tablegen

Because of this, insertions are handled in the same way as 16-bit integer insertions (with conversions between FP registers and GPRs using VMOVRH instructions).

Without these patterns the ARM backend would sometimes fail during instruction selection.

This patch also adds patterns which combine:
* an FP16 element extraction and a store into a single VST1 instruction
* an FP16 load and insertion into a single VLD1 instruction

Differential Revision: https://reviews.llvm.org/D62651
llvm-svn: 362482
1 parent 4ef0f82 commit 08da01b

File tree

3 files changed

+181
-0
lines changed

3 files changed

+181
-0
lines changed
 

‎llvm/lib/Target/ARM/ARMInstrNEON.td

+53
Original file line numberDiff line numberDiff line change
@@ -1117,6 +1117,12 @@ def VLD1LNq8Pseudo : VLD1QLNPseudo<v16i8, extloadi8>;
11171117
def VLD1LNq16Pseudo : VLD1QLNPseudo<v8i16, extloadi16>;
11181118
def VLD1LNq32Pseudo : VLD1QLNPseudo<v4i32, load>;
11191119

1120+
// Fold an f16 scalar load that is inserted into lane $lane of a v4f16 D
// register into a single VLD1LNd16 (load one 16-bit lane).
def : Pat<(vector_insert (v4f16 DPR:$src),
1121+
(f16 (load addrmode6:$addr)), imm:$lane),
1122+
(VLD1LNd16 addrmode6:$addr, DPR:$src, imm:$lane)>;
1123+
// Same folding for a v8f16 Q register: the load+insert is selected to the
// VLD1LNq16Pseudo used by the v8i16 case.
def : Pat<(vector_insert (v8f16 QPR:$src),
1124+
(f16 (load addrmode6:$addr)), imm:$lane),
1125+
(VLD1LNq16Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
11201126
def : Pat<(vector_insert (v2f32 DPR:$src),
11211127
(f32 (load addrmode6:$addr)), imm:$lane),
11221128
(VLD1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>;
@@ -2175,6 +2181,11 @@ def : Pat<(store (extractelt (v2f32 DPR:$src), imm:$lane), addrmode6:$addr),
21752181
def : Pat<(store (extractelt (v4f32 QPR:$src), imm:$lane), addrmode6:$addr),
21762182
(VST1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
21772183

2184+
// Fold the extraction of f16 lane $lane from a v4f16 D register followed by a
// store into a single VST1LNd16 (store one 16-bit lane).
def : Pat<(store (extractelt (v4f16 DPR:$src), imm:$lane), addrmode6:$addr),
2185+
(VST1LNd16 addrmode6:$addr, DPR:$src, imm:$lane)>;
2186+
// Same folding for a v8f16 Q register, selected to VST1LNq16Pseudo.
def : Pat<(store (extractelt (v8f16 QPR:$src), imm:$lane), addrmode6:$addr),
2187+
(VST1LNq16Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
2188+
21782189
// ...with address register writeback:
21792190
class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
21802191
PatFrag StoreOp, SDNode ExtractOp, Operand AdrMode>
@@ -2504,6 +2515,13 @@ def SSubReg_f32_reg : SDNodeXForm<imm, [{
25042515
MVT::i32);
25052516
}]>;
25062517

2518+
// Map an f16 lane number of a Q/D register to the index of the S sub-register
// that contains that lane. Two f16 lanes share one S register, so lane N maps
// to ssub_0 + N/2 (integer division).
2519+
def SSubReg_f16_reg : SDNodeXForm<imm, [{
2520+
// The arithmetic below relies on ssub_0..ssub_3 being numbered consecutively.
assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering");
2521+
return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue()/2, SDLoc(N),
2522+
MVT::i32);
2523+
}]>;
2524+
25072525
// Translate lane numbers from Q registers to D subregs.
25082526
def SubReg_i8_lane : SDNodeXForm<imm, [{
25092527
return CurDAG->getTargetConstant(N->getZExtValue() & 7, SDLoc(N), MVT::i32);
@@ -6223,6 +6241,32 @@ def : Pat<(extractelt (v4f32 QPR:$src1), imm:$src2),
62236241
def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2),
62246242
(EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
62256243

6244+
// Immediate predicates on the low bit of an i32 constant. The f16 extract
// patterns below use them to tell even lanes from odd lanes.
// imm_even matches immediates whose bit 0 is clear.
def imm_even : ImmLeaf<i32, [{ return (Imm & 1) == 0; }]>;
6245+
// imm_odd matches immediates whose bit 0 is set.
def imm_odd : ImmLeaf<i32, [{ return (Imm & 1) == 1; }]>;
6246+
6247+
// Even f16 lane of a D register: copy the source into DPR_VFP2 so that its S
// sub-registers are addressable, then take the S sub-register holding the
// lane (chosen by SSubReg_f16_reg). No further instruction is emitted.
def : Pat<(extractelt (v4f16 DPR:$src), imm_even:$lane),
6248+
(EXTRACT_SUBREG
6249+
(v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)),
6250+
(SSubReg_f16_reg imm_even:$lane))>;
6251+
6252+
// Odd f16 lane of a D register: take the containing S sub-register as in the
// even case, apply VMOVH to it, and place the result in HPR.
// NOTE(review): VMOVH is presumably the instruction printed as vmovx.f16 (the
// accompanying tests expect vmovx.f16 for odd lanes) -- confirm against its
// definition.
def : Pat<(extractelt (v4f16 DPR:$src), imm_odd:$lane),
6253+
(COPY_TO_REGCLASS
6254+
(VMOVH (EXTRACT_SUBREG
6255+
(v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)),
6256+
(SSubReg_f16_reg imm_odd:$lane))),
6257+
HPR)>;
6258+
6259+
// Even f16 lane of a Q register: copy the source into QPR_VFP2 and take the S
// sub-register holding the lane (chosen by SSubReg_f16_reg).
def : Pat<(extractelt (v8f16 QPR:$src), imm_even:$lane),
6260+
(EXTRACT_SUBREG
6261+
(v4f32 (COPY_TO_REGCLASS (v8f16 QPR:$src), QPR_VFP2)),
6262+
(SSubReg_f16_reg imm_even:$lane))>;
6263+
6264+
// Odd f16 lane of a Q register: take the containing S sub-register of the
// QPR_VFP2 copy, apply VMOVH to it, and place the result in HPR.
// NOTE(review): VMOVH is presumably the instruction printed as vmovx.f16 --
// confirm against its definition.
def : Pat<(extractelt (v8f16 QPR:$src), imm_odd:$lane),
6265+
(COPY_TO_REGCLASS
6266+
(VMOVH (EXTRACT_SUBREG
6267+
(v4f32 (COPY_TO_REGCLASS (v8f16 QPR:$src), QPR_VFP2)),
6268+
(SSubReg_f16_reg imm_odd:$lane))),
6269+
HPR)>;
62266270

62276271
// VMOV : Vector Set Lane (move ARM core register to scalar)
62286272

@@ -6281,6 +6325,15 @@ def : Pat<(v4f32 (insertelt QPR:$src1, SPR:$src2, imm:$src3)),
62816325
(INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2)),
62826326
SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
62836327

6328+
// Insert an f16 scalar into a v4f16 D register by reusing the 16-bit integer
// lane insert: the HPR value is passed through VMOVRH and then written to the
// lane with VSETLNi16.
def : Pat<(insertelt (v4f16 DPR:$src1), HPR:$src2, imm:$lane),
6329+
(v4f16 (VSETLNi16 DPR:$src1, (VMOVRH $src2), imm:$lane))>;
6330+
// Insert an f16 scalar into a v8f16 Q register: extract the D sub-register
// selected by DSubReg_i16_reg, perform the 16-bit lane insert on it (lane
// index remapped by SubReg_i16_lane, value passed through VMOVRH), and put the
// updated D register back into the same sub-register of the Q register.
def : Pat<(insertelt (v8f16 QPR:$src1), HPR:$src2, imm:$lane),
6331+
(v8f16 (INSERT_SUBREG QPR:$src1,
6332+
(v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
6333+
(DSubReg_i16_reg imm:$lane))),
6334+
(VMOVRH $src2), (SubReg_i16_lane imm:$lane))),
6335+
(DSubReg_i16_reg imm:$lane)))>;
6336+
62846337
//def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
62856338
// (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
62866339
def : Pat<(v2f64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
; RUN: llc -mtriple=arm-eabi -mattr=+armv8.2-a,+fullfp16,+neon -float-abi=hard -O1 < %s | FileCheck %s
2+
; RUN: llc -mtriple=arm-eabi -mattr=+armv8.2-a,+fullfp16,+neon -float-abi=soft -O1 < %s | FileCheck %s
3+
4+
; Extract odd lane 1 of a <4 x half> and extend it to float; a vmovx.f16 is
; expected immediately before the conversion.
define float @test_vget_lane_f16_1(<4 x half> %a) nounwind {
5+
; CHECK-LABEL: test_vget_lane_f16_1:
6+
; CHECK: vmovx.f16 s0, s0
7+
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
8+
entry:
9+
%elt = extractelement <4 x half> %a, i32 1
10+
%conv = fpext half %elt to float
11+
ret float %conv
12+
}
13+
14+
; Extract even lane 2 of a <4 x half> and extend it to float; no vmovx.f16 is
; expected and the conversion should read s1 directly.
define float @test_vget_lane_f16_2(<4 x half> %a) nounwind {
15+
; CHECK-LABEL: test_vget_lane_f16_2:
16+
; CHECK-NOT: vmovx.f16
17+
; CHECK: vcvtb.f32.f16 s0, s1
18+
entry:
19+
%elt = extractelement <4 x half> %a, i32 2
20+
%conv = fpext half %elt to float
21+
ret float %conv
22+
}
23+
24+
; Extract even lane 6 of an <8 x half> and extend it to float; no vmovx.f16 is
; expected and the conversion should read s3 directly.
define float @test_vget_laneq_f16_6(<8 x half> %a) nounwind {
25+
; CHECK-LABEL: test_vget_laneq_f16_6:
26+
; CHECK-NOT: vmovx.f16
27+
; CHECK: vcvtb.f32.f16 s0, s3
28+
entry:
29+
%elt = extractelement <8 x half> %a, i32 6
30+
%conv = fpext half %elt to float
31+
ret float %conv
32+
}
33+
34+
; Extract odd lane 7 of an <8 x half> and extend it to float; a vmovx.f16
; reading s3 is expected before the conversion.
define float @test_vget_laneq_f16_7(<8 x half> %a) nounwind {
35+
; CHECK-LABEL: test_vget_laneq_f16_7:
36+
; CHECK: vmovx.f16 s0, s3
37+
; CHECK: vcvtb.f32.f16 s0, s0
38+
entry:
39+
%elt = extractelement <8 x half> %a, i32 7
40+
%conv = fpext half %elt to float
41+
ret float %conv
42+
}
43+
44+
; Insert a half (truncated from float) into lane 3 of a <4 x half>; the value
; is expected to go through a GPR (vmov.f16) and then a vmov.16 lane insert.
define <4 x half> @test_vset_lane_f16(<4 x half> %a, float %fb) nounwind {
45+
; CHECK-LABEL: test_vset_lane_f16:
46+
; CHECK: vmov.f16 r[[GPR:[0-9]+]], s{{[0-9]+}}
47+
; CHECK: vmov.16 d{{[0-9]+}}[3], r[[GPR]]
48+
entry:
49+
%b = fptrunc float %fb to half
50+
%x = insertelement <4 x half> %a, half %b, i32 3
51+
ret <4 x half> %x
52+
}
53+
54+
; Insert a half (truncated from float) into lane 1 of an <8 x half>; expected
; to become a vmov.16 insert into lane 1 of a D register.
define <8 x half> @test_vset_laneq_f16_1(<8 x half> %a, float %fb) nounwind {
55+
; CHECK-LABEL: test_vset_laneq_f16_1:
56+
; CHECK: vmov.f16 r[[GPR:[0-9]+]], s{{[0-9]+}}
57+
; CHECK: vmov.16 d{{[0-9]+}}[1], r[[GPR]]
58+
entry:
59+
%b = fptrunc float %fb to half
60+
%x = insertelement <8 x half> %a, half %b, i32 1
61+
ret <8 x half> %x
62+
}
63+
64+
; Insert a half (truncated from float) into lane 7 of an <8 x half>; expected
; to become a vmov.16 insert into lane 3 of a D register.
define <8 x half> @test_vset_laneq_f16_7(<8 x half> %a, float %fb) nounwind {
65+
; CHECK-LABEL: test_vset_laneq_f16_7:
66+
; CHECK: vmov.f16 r[[GPR:[0-9]+]], s{{[0-9]+}}
67+
; CHECK: vmov.16 d{{[0-9]+}}[3], r[[GPR]]
68+
entry:
69+
%b = fptrunc float %fb to half
70+
%x = insertelement <8 x half> %a, half %b, i32 7
71+
ret <8 x half> %x
72+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
; RUN: llc -mtriple=arm-eabi -mattr=+armv8.2-a,+fullfp16,+neon -float-abi=hard -O1 < %s | FileCheck %s
2+
; RUN: llc -mtriple=arm-eabi -mattr=+armv8.2-a,+fullfp16,+neon -float-abi=soft -O1 < %s | FileCheck %s
3+
4+
; Load a half and insert it into lane 3 of a <4 x half>; expected to be
; selected as a single vld1.16 lane load.
define <4 x half> @vld1d_lane_f16(half* %pa, <4 x half> %v4) nounwind {
5+
; CHECK-LABEL: vld1d_lane_f16:
6+
; CHECK: vld1.16 {d{{[0-9]+}}[3]}, [r0:16]
7+
entry:
8+
%a = load half, half* %pa
9+
%res = insertelement <4 x half> %v4, half %a, i32 3
10+
ret <4 x half> %res
11+
}
12+
13+
; Load a half and insert it into lane 1 of an <8 x half>; expected to be a
; single vld1.16 into lane 1 of a D register.
define <8 x half> @vld1q_lane_f16_1(half* %pa, <8 x half> %v8) nounwind {
14+
; CHECK-LABEL: vld1q_lane_f16_1:
15+
; CHECK: vld1.16 {d{{[0-9]+}}[1]}, [r0:16]
16+
entry:
17+
%a = load half, half* %pa
18+
%res = insertelement <8 x half> %v8, half %a, i32 1
19+
ret <8 x half> %res
20+
}
21+
22+
; Load a half and insert it into lane 7 of an <8 x half>; expected to be a
; single vld1.16 into lane 3 of a D register.
define <8 x half> @vld1q_lane_f16_7(half* %pa, <8 x half> %v8) nounwind {
23+
; CHECK-LABEL: vld1q_lane_f16_7:
24+
; CHECK: vld1.16 {d{{[0-9]+}}[3]}, [r0:16]
25+
entry:
26+
%a = load half, half* %pa
27+
%res = insertelement <8 x half> %v8, half %a, i32 7
28+
ret <8 x half> %res
29+
}
30+
31+
; Extract lane 3 of a <4 x half> and store it; expected to be selected as a
; single vst1.16 lane store.
define void @vst1d_lane_f16(half* %pa, <4 x half> %v4) nounwind {
32+
; CHECK-LABEL: vst1d_lane_f16:
33+
; CHECK: vst1.16 {d{{[0-9]+}}[3]}, [r0:16]
34+
entry:
35+
%a = extractelement <4 x half> %v4, i32 3
36+
store half %a, half* %pa
37+
ret void
38+
}
39+
40+
; Extract lane 7 of an <8 x half> and store it; expected to be a single
; vst1.16 from lane 3 of a D register.
define void @vst1q_lane_f16_7(half* %pa, <8 x half> %v8) nounwind {
41+
; CHECK-LABEL: vst1q_lane_f16_7:
42+
; CHECK: vst1.16 {d{{[0-9]+}}[3]}, [r0:16]
43+
entry:
44+
%a = extractelement <8 x half> %v8, i32 7
45+
store half %a, half* %pa
46+
ret void
47+
}
48+
49+
; Extract lane 1 of an <8 x half> and store it; expected to be a single
; vst1.16 from lane 1 of a D register.
define void @vst1q_lane_f16_1(half* %pa, <8 x half> %v8) nounwind {
50+
; CHECK-LABEL: vst1q_lane_f16_1:
51+
; CHECK: vst1.16 {d{{[0-9]+}}[1]}, [r0:16]
52+
entry:
53+
%a = extractelement <8 x half> %v8, i32 1
54+
store half %a, half* %pa
55+
ret void
56+
}

0 commit comments

Comments
 (0)
Please sign in to comment.