This is an archive of the discontinued LLVM Phabricator instance.

[ARM][MVE] Enable SHRN for tail predication
ClosedPublic

Authored by samparker on Mar 5 2020, 2:00 AM.

Download Raw Diff

Details

Reviewers

dmgreen
SjoerdMeijer

Commits

rG77e30758ddfc: [ARM][MVE] Enable *SHRN* for tail predication

Summary

These instructions don't swap lanes so make them valid.

Diff Detail

Unit Tests: Failed

	Time	Test
	180 ms	Clang.Driver::Unknown Unit Message ("")

Event Timeline

samparker created this revision.Mar 5 2020, 2:00 AM

Herald added a project: Restricted Project. · View Herald TranscriptMar 5 2020, 2:00 AM

Herald added subscribers: hiraditya, kristof.beyls. · View Herald Transcript

Agreed, they don't swap lanes. They can write to only the bottom or top halves of each lane, but that's fine. So this looks like a straightforward change to me.

This revision is now accepted and ready to land.Mar 5 2020, 2:31 AM

Harbormaster failed remote builds in B48161: Diff 248411!Mar 5 2020, 2:46 AM

Closed by commit rG77e30758ddfc: [ARM][MVE] Enable *SHRN* for tail predication (authored by samparker). · Explain WhyMar 5 2020, 3:19 AM

This revision was automatically updated to reflect the committed changes.

Hey. Can you explain what makes an instruction validForTailPredication? I think I've lost track. And what do you mean by "swap lanes" in this case?

We're allowing instructions which produce a vector and where each output lane depends only upon the same lane of the input register(s). So when I say 'swap', I should really say 'exchange'.

OK. I'm not sure if that is enough, if I am understanding correctly. What if we load a v8i16, extend that into two v4i32's using something like a VMULL, then narrow that back into a single v8i16. I don't think this is something that autovec will produce (yet), but could come up from intrinsics in a way that people are likely to write. Something like this:

#include <arm_mve.h>
// Review example: each iteration loads two vectors of 16-bit elements, widens
// them into two 32-bit product vectors (top and bottom), then narrows both
// products back into a single 16-bit vector before storing it. It is offered
// as a case where tail predicating the narrowing shifts may not be valid.
void test(short *x, short *y, short *z, int n) {
  while(n > 0) {
    // Predicate derived from the remaining element count; it is passed to both
    // loads and to the store below.
    int pred = vctp16q(n);
    int16x8_t a = vldrhq_z_s16(x, pred);
    int16x8_t b = vldrhq_z_s16(y, pred);
    // Widening multiplies: two int32x4_t results from the same int16x8_t inputs.
    int32x4_t top = vmulltq_int(a, b);
    int32x4_t bot = vmullbq_int(a, b);
    // Narrow back to int16x8_t with a shift of 16. Note the naming: `rtop` is
    // produced from `bot` via the "b" narrowing shift, and `rbot` then merges
    // the narrowed `top` into `rtop` via the "t" narrowing shift.
    int16x8_t rtop = vqshrnbq(vuninitializedq_s16(), bot, 16);
    int16x8_t rbot = vqshrntq(rtop, top, 16);
    vstrhq_p_s16(z, rbot, pred);

    // Advance all three pointers by 8 elements and decrease the count to match.
    x += 8;
    y += 8;
    z += 8;
    n -= 8;
  }
}

I'm pretty sure that tail predicating this would not be valid, as the top bits of one of the multiplies could be cut off.

Yes, the problem there is that the number of lanes isn't the same throughout the loop, not necessarily that we're using a narrowing operation. Do you know if there's a nice way to query operand/result types at the MI level?

Not sure. I think they are just register types at the instruction level. I thought that was why we excluded many instructions (like vmull and vshrn), because they all change the types, and that changing of the types probably means that the tail predication might not be valid.

From IR I think that would be <8 x i16> sext to <8 x i32>, so the number of lanes would be the same, but the types (and the lanes they are computed in) change.

PS. I noticed VFMA.f32 isn't marked as valid. I think that's one that should certainly be OK.

Revision Contents

Path

Size

llvm/

lib/

Target/

ARM/

ARMInstrMVE.td

3 lines

unittests/

Target/

ARM/

MachineInstrTest.cpp

32 lines

Diff 248411

llvm/lib/Target/ARM/ARMInstrMVE.td

	Show First 20 Lines • Show All 492 Lines • ▼ Show 20 Lines
	let Inst{28} = bit_28;			let Inst{28} = bit_28;
	let Inst{25-23} = 0b101;			let Inst{25-23} = 0b101;
	let Inst{21} = 0b0;			let Inst{21} = 0b0;
	let Inst{20-16} = imm{4-0};			let Inst{20-16} = imm{4-0};
	let Inst{12} = bit_12;			let Inst{12} = bit_12;
	let Inst{11-6} = 0b111111;			let Inst{11-6} = 0b111111;
	let Inst{4} = 0b0;			let Inst{4} = 0b0;
	let Inst{0} = 0b1;			let Inst{0} = 0b1;
				let validForTailPredication = 1;
	}			}

	def MVE_VRSHRNi16bh : MVE_VxSHRN<"vrshrnb", "i16", 0b0, 0b1, shr_imm8> {			def MVE_VRSHRNi16bh : MVE_VxSHRN<"vrshrnb", "i16", 0b0, 0b1, shr_imm8> {
	let Inst{20-19} = 0b01;			let Inst{20-19} = 0b01;
	}			}
	def MVE_VRSHRNi16th : MVE_VxSHRN<"vrshrnt", "i16", 0b1, 0b1, shr_imm8> {			def MVE_VRSHRNi16th : MVE_VxSHRN<"vrshrnt", "i16", 0b1, 0b1, shr_imm8> {
	let Inst{20-19} = 0b01;			let Inst{20-19} = 0b01;
	}			}
	Show All 25 Lines
	let Inst{28} = bit_28;			let Inst{28} = bit_28;
	let Inst{25-23} = 0b101;			let Inst{25-23} = 0b101;
	let Inst{21} = 0b0;			let Inst{21} = 0b0;
	let Inst{20-16} = imm{4-0};			let Inst{20-16} = imm{4-0};
	let Inst{12} = bit_12;			let Inst{12} = bit_12;
	let Inst{11-6} = 0b111111;			let Inst{11-6} = 0b111111;
	let Inst{4} = 0b0;			let Inst{4} = 0b0;
	let Inst{0} = 0b0;			let Inst{0} = 0b0;
				let validForTailPredication = 1;
	}			}

	def MVE_VQRSHRUNs16bh : MVE_VxQRSHRUN<			def MVE_VQRSHRUNs16bh : MVE_VxQRSHRUN<
	"vqrshrunb", "s16", 0b1, 0b0, shr_imm8> {			"vqrshrunb", "s16", 0b1, 0b0, shr_imm8> {
	let Inst{20-19} = 0b01;			let Inst{20-19} = 0b01;
	}			}
	def MVE_VQRSHRUNs16th : MVE_VxQRSHRUN<			def MVE_VQRSHRUNs16th : MVE_VxQRSHRUN<
	"vqrshrunt", "s16", 0b1, 0b1, shr_imm8> {			"vqrshrunt", "s16", 0b1, 0b1, shr_imm8> {
	Show All 32 Lines

	let Inst{25-23} = 0b101;			let Inst{25-23} = 0b101;
	let Inst{21} = 0b0;			let Inst{21} = 0b0;
	let Inst{20-16} = imm{4-0};			let Inst{20-16} = imm{4-0};
	let Inst{12} = bit_12;			let Inst{12} = bit_12;
	let Inst{11-6} = 0b111101;			let Inst{11-6} = 0b111101;
	let Inst{4} = 0b0;			let Inst{4} = 0b0;
	let Inst{0} = bit_0;			let Inst{0} = bit_0;
				let validForTailPredication = 1;
	}			}

	multiclass MVE_VxQRSHRN_types<string iname, bit bit_0, bit bit_12> {			multiclass MVE_VxQRSHRN_types<string iname, bit bit_0, bit bit_12> {
	def s16 : MVE_VxQRSHRN<iname, "s16", bit_0, bit_12, shr_imm8> {			def s16 : MVE_VxQRSHRN<iname, "s16", bit_0, bit_12, shr_imm8> {
	let Inst{28} = 0b0;			let Inst{28} = 0b0;
	let Inst{20-19} = 0b01;			let Inst{20-19} = 0b01;
	}			}
	def u16 : MVE_VxQRSHRN<iname, "u16", bit_0, bit_12, shr_imm8> {			def u16 : MVE_VxQRSHRN<iname, "u16", bit_0, bit_12, shr_imm8> {
	▲ Show 20 Lines • Show All 492 Lines • Show Last 20 Lines

llvm/unittests/Target/ARM/MachineInstrTest.cpp

Show First 20 Lines • Show All 348 Lines • ▼ Show 20 Lines	auto IsValidTPOpcode = [](unsigned Opcode) {
case MVE_VQSHL_by_vecu32:		case MVE_VQSHL_by_vecu32:
case MVE_VQSHL_by_vecu8:		case MVE_VQSHL_by_vecu8:
case MVE_VQSHL_qrs16:		case MVE_VQSHL_qrs16:
case MVE_VQSHL_qrs32:		case MVE_VQSHL_qrs32:
case MVE_VQSHL_qrs8:		case MVE_VQSHL_qrs8:
case MVE_VQSHL_qru16:		case MVE_VQSHL_qru16:
case MVE_VQSHL_qru32:		case MVE_VQSHL_qru32:
case MVE_VQSHL_qru8:		case MVE_VQSHL_qru8:
		case MVE_VQRSHRNbhs16:
		case MVE_VQRSHRNbhs32:
		case MVE_VQRSHRNbhu16:
		case MVE_VQRSHRNbhu32:
		case MVE_VQRSHRNths16:
		case MVE_VQRSHRNths32:
		case MVE_VQRSHRNthu16:
		case MVE_VQRSHRNthu32:
		case MVE_VQRSHRUNs16bh:
		case MVE_VQRSHRUNs16th:
		case MVE_VQRSHRUNs32bh:
		case MVE_VQRSHRUNs32th:
		case MVE_VQSHRNbhs16:
		case MVE_VQSHRNbhs32:
		case MVE_VQSHRNbhu16:
		case MVE_VQSHRNbhu32:
		case MVE_VQSHRNths16:
		case MVE_VQSHRNths32:
		case MVE_VQSHRNthu16:
		case MVE_VQSHRNthu32:
		case MVE_VQSHRUNs16bh:
		case MVE_VQSHRUNs16th:
		case MVE_VQSHRUNs32bh:
		case MVE_VQSHRUNs32th:
case MVE_VQSUB_qr_s16:		case MVE_VQSUB_qr_s16:
case MVE_VQSUB_qr_s32:		case MVE_VQSUB_qr_s32:
case MVE_VQSUB_qr_s8:		case MVE_VQSUB_qr_s8:
case MVE_VQSUB_qr_u16:		case MVE_VQSUB_qr_u16:
case MVE_VQSUB_qr_u32:		case MVE_VQSUB_qr_u32:
case MVE_VQSUB_qr_u8:		case MVE_VQSUB_qr_u8:
case MVE_VQSUBs16:		case MVE_VQSUBs16:
case MVE_VQSUBs32:		case MVE_VQSUBs32:
Show All 32 Lines	auto IsValidTPOpcode = [](unsigned Opcode) {
case MVE_VRSHL_qru32:		case MVE_VRSHL_qru32:
case MVE_VRSHL_qru8:		case MVE_VRSHL_qru8:
case MVE_VRSHR_imms16:		case MVE_VRSHR_imms16:
case MVE_VRSHR_imms32:		case MVE_VRSHR_imms32:
case MVE_VRSHR_imms8:		case MVE_VRSHR_imms8:
case MVE_VRSHR_immu16:		case MVE_VRSHR_immu16:
case MVE_VRSHR_immu32:		case MVE_VRSHR_immu32:
case MVE_VRSHR_immu8:		case MVE_VRSHR_immu8:
		case MVE_VRSHRNi16bh:
		case MVE_VRSHRNi16th:
		case MVE_VRSHRNi32bh:
		case MVE_VRSHRNi32th:
case MVE_VSHL_by_vecs16:		case MVE_VSHL_by_vecs16:
case MVE_VSHL_by_vecs32:		case MVE_VSHL_by_vecs32:
case MVE_VSHL_by_vecs8:		case MVE_VSHL_by_vecs8:
case MVE_VSHL_by_vecu16:		case MVE_VSHL_by_vecu16:
case MVE_VSHL_by_vecu32:		case MVE_VSHL_by_vecu32:
case MVE_VSHL_by_vecu8:		case MVE_VSHL_by_vecu8:
case MVE_VSHL_immi16:		case MVE_VSHL_immi16:
case MVE_VSHL_immi32:		case MVE_VSHL_immi32:
case MVE_VSHL_immi8:		case MVE_VSHL_immi8:
case MVE_VSHL_qrs16:		case MVE_VSHL_qrs16:
case MVE_VSHL_qrs32:		case MVE_VSHL_qrs32:
case MVE_VSHL_qrs8:		case MVE_VSHL_qrs8:
case MVE_VSHL_qru16:		case MVE_VSHL_qru16:
case MVE_VSHL_qru32:		case MVE_VSHL_qru32:
case MVE_VSHL_qru8:		case MVE_VSHL_qru8:
case MVE_VSHR_imms16:		case MVE_VSHR_imms16:
case MVE_VSHR_imms32:		case MVE_VSHR_imms32:
case MVE_VSHR_imms8:		case MVE_VSHR_imms8:
case MVE_VSHR_immu16:		case MVE_VSHR_immu16:
case MVE_VSHR_immu32:		case MVE_VSHR_immu32:
case MVE_VSHR_immu8:		case MVE_VSHR_immu8:
		case MVE_VSHRNi16bh:
		case MVE_VSHRNi16th:
		case MVE_VSHRNi32bh:
		case MVE_VSHRNi32th:
case MVE_VSLIimm16:		case MVE_VSLIimm16:
case MVE_VSLIimm32:		case MVE_VSLIimm32:
case MVE_VSLIimm8:		case MVE_VSLIimm8:
case MVE_VSRIimm16:		case MVE_VSRIimm16:
case MVE_VSRIimm32:		case MVE_VSRIimm32:
case MVE_VSRIimm8:		case MVE_VSRIimm8:
case MVE_VSTRB16:		case MVE_VSTRB16:
case MVE_VSTRB16_post:		case MVE_VSTRB16_post:
▲ Show 20 Lines • Show All 172 Lines • Show Last 20 Lines