This is an archive of the discontinued LLVM Phabricator instance.

test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll
996	Ah, I now see what you mean. This is the test and IR for the ACLE intrinsic float16x8_t vmulq_lane_f16(float16x8_t a, float16x4_t v, const int lane), but yes, the pattern would also match when the 2nd operand is a v8f16.

SjoerdMeijer added inline comments. Aug 8 2018, 2:23 AM

test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll
996	The shufflevector is creating an 8 x half vector here, with half the elements undef because we pass in a 4 x half, so it actually looks all okay here?

Cheers, shufflevector always confuses me. LGTM.

This revision is now accepted and ready to land. Aug 8 2018, 3:03 AM

Thanks for the reviews!

Closed by commit rL339232: [ARM] FP16: vector VMUL variants (authored by SjoerdMeijer). · Explain Why · Aug 8 2018, 3:28 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Target/

ARM/

ARMInstrNEON.td

16 lines

test/

CodeGen/

ARM/

armv8.2a-fp16-vector-intrinsics.ll

78 lines

Diff 159268

lib/Target/ARM/ARMInstrNEON.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 4,299 Lines • ▼ Show 20 Lines	def : Pat<(v4i32 (mul (v4i32 QPR:$src1),
(DSubReg_i32_reg imm:$lane))),		(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;		(SubReg_i32_lane imm:$lane)))>;
def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),		def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
(v4f32 (NEONvduplane (v4f32 QPR:$src2), imm:$lane)))),		(v4f32 (NEONvduplane (v4f32 QPR:$src2), imm:$lane)))),
(v4f32 (VMULslfq (v4f32 QPR:$src1),		(v4f32 (VMULslfq (v4f32 QPR:$src1),
(v2f32 (EXTRACT_SUBREG QPR:$src2,		(v2f32 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i32_reg imm:$lane))),		(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;		(SubReg_i32_lane imm:$lane)))>;
		def : Pat<(v8f16 (fmul (v8f16 QPR:$src1),
		(v8f16 (NEONvduplane (v8f16 QPR:$src2), imm:$lane)))),
		(v8f16 (VMULslhq(v8f16 QPR:$src1),
		(v4f16 (EXTRACT_SUBREG QPR:$src2,
		(DSubReg_i16_reg imm:$lane))),
		(SubReg_i16_lane imm:$lane)))>;

def : Pat<(v2f32 (fmul DPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),		def : Pat<(v2f32 (fmul DPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
(VMULslfd DPR:$Rn,		(VMULslfd DPR:$Rn,
(INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),		(INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
(i32 0))>;		(i32 0))>;
		def : Pat<(v4f16 (fmul DPR:$Rn, (NEONvdup (f16 HPR:$Rm)))),
		(VMULslhd DPR:$Rn,
		(INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0),
		(i32 0))>;
def : Pat<(v4f32 (fmul QPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),		def : Pat<(v4f32 (fmul QPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
(VMULslfq QPR:$Rn,		(VMULslfq QPR:$Rn,
(INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),		(INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
(i32 0))>;		(i32 0))>;
		def : Pat<(v8f16 (fmul QPR:$Rn, (NEONvdup (f16 HPR:$Rm)))),
		(VMULslhq QPR:$Rn,
		(INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0),
		(i32 0))>;

// VQDMULH : Vector Saturating Doubling Multiply Returning High Half		// VQDMULH : Vector Saturating Doubling Multiply Returning High Half
defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D,		defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D,
IIC_VMULi16Q, IIC_VMULi32Q,		IIC_VMULi16Q, IIC_VMULi32Q,
"vqdmulh", "s", int_arm_neon_vqdmulh, 1>;		"vqdmulh", "s", int_arm_neon_vqdmulh, 1>;
defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D,		defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D,
IIC_VMULi16Q, IIC_VMULi32Q,		IIC_VMULi16Q, IIC_VMULi32Q,
"vqdmulh", "s", int_arm_neon_vqdmulh>;		"vqdmulh", "s", int_arm_neon_vqdmulh>;
▲ Show 20 Lines • Show All 4,225 Lines • Show Last 20 Lines

test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll

	Show First 20 Lines • Show All 830 Lines • ▼ Show 20 Lines
	; CHECK-LABEL: test_vminnmq_f16:			; CHECK-LABEL: test_vminnmq_f16:
	; CHECK: vminnm.f16 q0, q0, q1			; CHECK: vminnm.f16 q0, q0, q1
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%vminnmq_v2.i = tail call <8 x half> @llvm.arm.neon.vminnm.v8f16(<8 x half> %a, <8 x half> %b)			%vminnmq_v2.i = tail call <8 x half> @llvm.arm.neon.vminnm.v8f16(<8 x half> %a, <8 x half> %b)
	ret <8 x half> %vminnmq_v2.i			ret <8 x half> %vminnmq_v2.i
	}			}

	define dso_local <4 x half> @test_vmul_f16(<4 x half> %a, <4 x half> %b) {			define dso_local <4 x half> @test_vmul_f16(<4 x half> %a, <4 x half> %b) {
				SjoerdMeijer (Author): Yes, they are here :)
	; CHECKLABEL: test_vmul_f16:			; CHECKLABEL: test_vmul_f16:
	; CHECK: vmul.f16 d0, d0, d1			; CHECK: vmul.f16 d0, d0, d1
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%mul.i = fmul <4 x half> %a, %b			%mul.i = fmul <4 x half> %a, %b
	ret <4 x half> %mul.i			ret <4 x half> %mul.i
	}			}

	▲ Show 20 Lines • Show All 122 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: vfma.f16 q0, [[Q8]], q2			; CHECK-NEXT: vfma.f16 q0, [[Q8]], q2
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%sub.i = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b			%sub.i = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
	%0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %sub.i, <8 x half> %c, <8 x half> %a)			%0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %sub.i, <8 x half> %c, <8 x half> %a)
	ret <8 x half> %0			ret <8 x half> %0
	}			}

	; FIXME (PR38404)			define dso_local <4 x half> @test_vmul_lane_f16(<4 x half> %a, <4 x half> %b) {
	;			; CHECK-LABEL: test_vmul_lane_f16:
	;define dso_local <4 x half> @test_vmul_lane_f16(<4 x half> %a, <4 x half> %b) {			; CHECK: vmul.f16 d0, d0, d1[3]
	;entry:			; CHECK-NEXT: bx lr
	; %shuffle = shufflevector <4 x half> %b, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>			entry:
	; %mul = fmul <4 x half> %shuffle, %a			%shuffle = shufflevector <4 x half> %b, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
	; ret <4 x half> %mul			%mul = fmul <4 x half> %shuffle, %a
	;}			ret <4 x half> %mul
				}

	;define dso_local <8 x half> @test_vmulq_lane_f16(<8 x half> %a, <4 x half> %b) {			define dso_local <8 x half> @test_vmulq_lane_f16(<8 x half> %a, <4 x half> %b) {
	;entry:			; CHECK-LABEL: test_vmulq_lane_f16:
	; %shuffle = shufflevector <4 x half> %b, <4 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>			; CHECK: vmul.f16 q0, q0, d2[3]
	; %mul = fmul <8 x half> %shuffle, %a			; CHECK-NEXT: bx lr
	; ret <8 x half> %mul			entry:
	;}			%shuffle = shufflevector <4 x half> %b, <4 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
				%mul = fmul <8 x half> %shuffle, %a
				ret <8 x half> %mul
				}
				samparker: Should there not also be a test where both inputs are v8f16?
				SjoerdMeijer (Author): Ah, I now see what you mean. This is the test and IR for the ACLE intrinsic float16x8_t vmulq_lane_f16(float16x8_t a, float16x4_t v, const int lane), but yes, the pattern would also match when the 2nd operand is a v8f16.
				SjoerdMeijer (Author): The shufflevector is creating an 8 x half vector here, with half the elements undef because we pass in a 4 x half, so it actually looks all okay here?

	;define dso_local <4 x half> @test_vmul_n_f16(<4 x half> %a, float %b.coerce) {			define dso_local <4 x half> @test_vmul_n_f16(<4 x half> %a, float %b.coerce) {
	;entry:			; CHECK-LABEL: test_vmul_n_f16:
	; %0 = bitcast float %b.coerce to i32			; CHECK: vmul.f16 d0, d0, d1[0]
	; %tmp.0.extract.trunc = trunc i32 %0 to i16			; CHECK-NEXT: bx lr
	; %1 = bitcast i16 %tmp.0.extract.trunc to half			entry:
	; %vecinit = insertelement <4 x half> undef, half %1, i32 0			%0 = bitcast float %b.coerce to i32
	; %vecinit4 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer			%tmp.0.extract.trunc = trunc i32 %0 to i16
	; %mul = fmul <4 x half> %vecinit4, %a			%1 = bitcast i16 %tmp.0.extract.trunc to half
	; ret <4 x half> %mul			%vecinit = insertelement <4 x half> undef, half %1, i32 0
	;}			%vecinit4 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer
				%mul = fmul <4 x half> %vecinit4, %a
				ret <4 x half> %mul
				}

	;define dso_local <8 x half> @test_vmulq_n_f16(<8 x half> %a, float %b.coerce) {			define dso_local <8 x half> @test_vmulq_n_f16(<8 x half> %a, float %b.coerce) {
	;entry:			; CHECK-LABEL: test_vmulq_n_f16:
	; %0 = bitcast float %b.coerce to i32			; CHECK: vmul.f16 q0, q0, d2[0]
	; %tmp.0.extract.trunc = trunc i32 %0 to i16			; CHECK-NEXT: bx lr
	; %1 = bitcast i16 %tmp.0.extract.trunc to half			entry:
	; %vecinit = insertelement <8 x half> undef, half %1, i32 0			%0 = bitcast float %b.coerce to i32
	; %vecinit8 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer			%tmp.0.extract.trunc = trunc i32 %0 to i16
	; %mul = fmul <8 x half> %vecinit8, %a			%1 = bitcast i16 %tmp.0.extract.trunc to half
	; ret <8 x half> %mul			%vecinit = insertelement <8 x half> undef, half %1, i32 0
	;}			%vecinit8 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer
				%mul = fmul <8 x half> %vecinit8, %a
				ret <8 x half> %mul
				}

	define dso_local <4 x half> @test_vbsl_f16(<4 x i16> %a, <4 x half> %b, <4 x half> %c) {			define dso_local <4 x half> @test_vbsl_f16(<4 x i16> %a, <4 x half> %b, <4 x half> %c) {
	; CHECKLABEL: test_vbsl_f16:			; CHECKLABEL: test_vbsl_f16:
	; CHECK: vbsl d0, d1, d2			; CHECK: vbsl d0, d1, d2
	; CHECK-NEXT: bx lr			; CHECK-NEXT: bx lr
	entry:			entry:
	%0 = bitcast <4 x i16> %a to <8 x i8>			%0 = bitcast <4 x i16> %a to <8 x i8>
	%1 = bitcast <4 x half> %b to <8 x i8>			%1 = bitcast <4 x half> %b to <8 x i8>
	▲ Show 20 Lines • Show All 221 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[ARM] FP16: vector VMUL variants · Closed · Public

Details

Diff Detail

Event Timeline

Revision Contents

Diff 159268

lib/Target/ARM/ARMInstrNEON.td

test/CodeGen/ARM/armv8.2a-fp16-vector-intrinsics.ll

[ARM] FP16: vector VMUL variants
Closed · Public