Diff 315767

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 10,950 Lines • ▼ Show 20 Lines	if (areExtractShuffleVectors(Ext1, Ext2)) {
Ops.push_back(&Ext2->getOperandUse(0));		Ops.push_back(&Ext2->getOperandUse(0));
}		}

Ops.push_back(&I->getOperandUse(0));		Ops.push_back(&I->getOperandUse(0));
Ops.push_back(&I->getOperandUse(1));		Ops.push_back(&I->getOperandUse(1));

return true;		return true;
}		}
		case Instruction::Mul: {
		resistor: Why isn't this logic the same as for Add/Sub above?
		NickGuy (author): From my understanding, the Add/Sub above will only sink the operands if they are both extending/shuffling, while this case checks each operand individually. It might be beneficial to make Add/Sub aware of this case, unifying the instruction cases, but that's not what this patch is trying to achieve (the parent patch that this aids only checks for mul patterns).
		bool IsProfitable = false;
		for (auto &Op : I->operands()) {
		// Make sure we are not already sinking this operand
		if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
		continue;

		ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
		if (!Shuffle)
		dmgreen: I would expect this to check more about the Shuffle, I think. Like the fact that it is a duplicate - a pair that looks like: `%0 = insertelement <4 x i16> undef, i16 %src, i32 0` and `%x = shufflevector <4 x i16> %0, <4 x i16> undef, <4 x i32> zeroinitializer`. Otherwise you might be sinking any shuffle needlessly, or it may not have an insertelement (which might be what is going wrong below).
		dmgreen: Should this be checking for isZeroEltSplat too?
		continue;

		Value *ShuffleOperand = Shuffle->getOperand(0);
		InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
		resistor: Dynamic extraction from a constant vector.
		if (!Insert)
		dmgreen: Check for insert into element 0 too?
		continue;

		Value *InsertOperand = Insert->getOperand(1);
		Instruction *OperandInstr = dyn_cast<Instruction>(InsertOperand);
		if (!OperandInstr)
		continue;

		unsigned Opcode = OperandInstr->getOpcode();
		if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt)
		continue;

		Ops.push_back(&Shuffle->getOperandUse(0));
		Ops.push_back(&Op);
		IsProfitable = true;
		}

		return IsProfitable;
		}
default:		default:
return false;		return false;
}		}
return false;		return false;
}		}

bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,		bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
Align &RequiredAligment) const {		Align &RequiredAligment) const {
▲ Show 20 Lines • Show All 6,194 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -o - | FileCheck %s

				; Unsigned case: multiplies zero-extended <4 x i16> loads from %A by a splat
				; of zext(%val) and stores the <4 x i32> products to %C. The CHECK lines
				; expect the splat to be materialised once as a 4h `dup` before the loop
				; label and each multiply inside the loop to be selected as `umull`.
				define void @matrix_mul_unsigned(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
				; CHECK-LABEL: matrix_mul_unsigned:
				; CHECK: // %bb.0: // %vector.header
				; CHECK-NEXT: and w9, w3, #0xffff
				; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
				; CHECK-NEXT: and x8, x0, #0xfffffff8
				; CHECK-NEXT: dup v0.4h, w9
				; CHECK-NEXT: .LBB0_1: // %vector.body
				; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
				; CHECK-NEXT: add x9, x2, w0, uxtw #1
				; CHECK-NEXT: ldp d1, d2, [x9]
				; CHECK-NEXT: add x9, x1, w0, uxtw #2
				; CHECK-NEXT: subs x8, x8, #8 // =8
				; CHECK-NEXT: add w0, w0, #8 // =8
				; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
				; CHECK-NEXT: umull v2.4s, v0.4h, v2.4h
				; CHECK-NEXT: stp q1, q2, [x9]
				; CHECK-NEXT: b.ne .LBB0_1
				; CHECK-NEXT: // %bb.2: // %for.end12
				; CHECK-NEXT: ret
				vector.header:
				; The scalar multiplier is widened to i32 here, outside the loop.
				%conv4 = zext i16 %val to i32
				%wide.trip.count = zext i32 %N to i64
				; NOTE: %min.iters.check, %1, %2 and %cmp.n have no users in this function
				; (%0 is only used by %1 and %2).
				%0 = add nsw i64 %wide.trip.count, -1
				%min.iters.check = icmp ult i32 %N, 8
				%1 = trunc i64 %0 to i32
				%2 = icmp ugt i64 %0, 4294967295
				; Loop bound: the trip count with its low three bits cleared.
				%n.vec = and i64 %wide.trip.count, 4294967288
				; Two identical insertelement + shufflevector (all-zero mask) pairs
				; broadcast %conv4 to every lane; each one feeds one mul in the loop.
				%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
				%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
				%broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
				%broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
				%cmp.n = icmp eq i64 %n.vec, %wide.trip.count
				br label %vector.body

				vector.body: ; preds = %vector.header, %vector.body
				%index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
				; Element offset is %N + trunc(%index), zero-extended to i64; the same
				; offset (%5) addresses both %A and %C.
				%3 = trunc i64 %index to i32
				%4 = add i32 %N, %3
				%5 = zext i32 %4 to i64
				; Load two adjacent <4 x i16> vectors: elements 0-3 and 4-7 from %6.
				%6 = getelementptr inbounds i16, i16* %A, i64 %5
				%7 = bitcast i16* %6 to <4 x i16>*
				%wide.load = load <4 x i16>, <4 x i16>* %7, align 2
				%8 = getelementptr inbounds i16, i16* %6, i64 4
				%9 = bitcast i16* %8 to <4 x i16>*
				%wide.load30 = load <4 x i16>, <4 x i16>* %9, align 2
				; Widen both loads with zext and multiply each by one of the splats.
				%10 = zext <4 x i16> %wide.load to <4 x i32>
				%11 = zext <4 x i16> %wide.load30 to <4 x i32>
				%12 = mul nuw nsw <4 x i32> %broadcast.splat, %10
				%13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11
				; Store the two <4 x i32> products to %C at %5 and at %5 + 4 elements.
				%14 = getelementptr inbounds i32, i32* %C, i64 %5
				%15 = bitcast i32* %14 to <4 x i32>*
				store <4 x i32> %12, <4 x i32>* %15, align 4
				%16 = getelementptr inbounds i32, i32* %14, i64 4
				%17 = bitcast i32* %16 to <4 x i32>*
				store <4 x i32> %13, <4 x i32>* %17, align 4
				; Advance by 8 elements per iteration; exit once %index.next equals %n.vec.
				%index.next = add i64 %index, 8
				%18 = icmp eq i64 %index.next, %n.vec
				br i1 %18, label %for.end12, label %vector.body

				for.end12: ; preds = %vector.body
				ret void
				}

				; Signed case: multiplies sign-extended <4 x i16> loads from %A by a splat
				; of sext(%val) and stores the <4 x i32> products to %C. The CHECK lines
				; expect the splat to be materialised once as a 4h `dup` before the loop
				; label and each multiply inside the loop to be selected as `smull`.
				define void @matrix_mul_signed(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
				; CHECK-LABEL: matrix_mul_signed:
				; CHECK: // %bb.0: // %vector.header
				; CHECK-NEXT: sxth w9, w3
				; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
				; CHECK-NEXT: and x8, x0, #0xfffffff8
				; CHECK-NEXT: dup v0.4h, w9
				; CHECK-NEXT: .LBB1_1: // %vector.body
				; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
				; CHECK-NEXT: add x9, x2, w0, sxtw #1
				; CHECK-NEXT: ldp d1, d2, [x9]
				; CHECK-NEXT: add x9, x1, w0, sxtw #2
				; CHECK-NEXT: subs x8, x8, #8 // =8
				; CHECK-NEXT: add w0, w0, #8 // =8
				; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h
				; CHECK-NEXT: smull v2.4s, v0.4h, v2.4h
				; CHECK-NEXT: stp q1, q2, [x9]
				; CHECK-NEXT: b.ne .LBB1_1
				; CHECK-NEXT: // %bb.2: // %for.end12
				; CHECK-NEXT: ret
				vector.header:
				; The scalar multiplier is sign-extended to i32 here, outside the loop.
				%conv4 = sext i16 %val to i32
				%wide.trip.count = sext i32 %N to i64
				; NOTE: %min.iters.check, %1, %2 and %cmp.n have no users in this function
				; (%0 is only used by %1 and %2).
				%0 = add nsw i64 %wide.trip.count, -1
				%min.iters.check = icmp ult i32 %N, 8
				%1 = trunc i64 %0 to i32
				%2 = icmp ugt i64 %0, 4294967295
				; Loop bound: %wide.trip.count masked with 4294967288 (0xfffffff8).
				%n.vec = and i64 %wide.trip.count, 4294967288
				; Two identical insertelement + shufflevector (all-zero mask) pairs
				; broadcast %conv4 to every lane; each one feeds one mul in the loop.
				%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
				%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
				%broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
				%broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
				%cmp.n = icmp eq i64 %n.vec, %wide.trip.count
				br label %vector.body

				vector.body: ; preds = %vector.header, %vector.body
				%index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
				; Element offset is %N + trunc(%index), sign-extended to i64; the same
				; offset (%5) addresses both %A and %C.
				%3 = trunc i64 %index to i32
				%4 = add i32 %N, %3
				%5 = sext i32 %4 to i64
				; Load two adjacent <4 x i16> vectors: elements 0-3 and 4-7 from %6.
				%6 = getelementptr inbounds i16, i16* %A, i64 %5
				%7 = bitcast i16* %6 to <4 x i16>*
				%wide.load = load <4 x i16>, <4 x i16>* %7, align 2
				%8 = getelementptr inbounds i16, i16* %6, i64 4
				%9 = bitcast i16* %8 to <4 x i16>*
				%wide.load30 = load <4 x i16>, <4 x i16>* %9, align 2
				; Widen both loads with sext and multiply each by one of the splats.
				%10 = sext <4 x i16> %wide.load to <4 x i32>
				%11 = sext <4 x i16> %wide.load30 to <4 x i32>
				%12 = mul nsw <4 x i32> %broadcast.splat, %10
				%13 = mul nsw <4 x i32> %broadcast.splat32, %11
				; Store the two <4 x i32> products to %C at %5 and at %5 + 4 elements.
				%14 = getelementptr inbounds i32, i32* %C, i64 %5
				%15 = bitcast i32* %14 to <4 x i32>*
				store <4 x i32> %12, <4 x i32>* %15, align 4
				%16 = getelementptr inbounds i32, i32* %14, i64 4
				%17 = bitcast i32* %16 to <4 x i32>*
				store <4 x i32> %13, <4 x i32>* %17, align 4
				; Advance by 8 elements per iteration; exit once %index.next equals %n.vec.
				%index.next = add i64 %index, 8
				%18 = icmp eq i64 %index.next, %n.vec
				br i1 %18, label %for.end12, label %vector.body

				for.end12: ; preds = %vector.body
				ret void
				}


				; Both operands of the mul are shufflevectors: %broadcast.splat (all-zero
				; mask, built in vector.header) and %broadcast.splat32 (built in the loop
				; from a second zext of %val, taking %broadcast.splat as its second shuffle
				; operand with mask <0, 1, 0, 1>). Neither operand depends on %index. The
				; CHECK lines show a single `umull` emitted before the loop label, with
				; only the address computation and the store left inside the loop.
				define void @matrix_mul_double_shuffle(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
				; CHECK-LABEL: matrix_mul_double_shuffle:
				; CHECK: // %bb.0: // %vector.header
				; CHECK-NEXT: and w9, w3, #0xffff
				; CHECK-NEXT: dup v0.4h, w3
				; CHECK-NEXT: dup v1.4h, w9
				; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
				; CHECK-NEXT: and x8, x0, #0xfffffff8
				; CHECK-NEXT: umull v0.4s, v1.4h, v0.4h
				; CHECK-NEXT: .LBB2_1: // %vector.body
				; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
				; CHECK-NEXT: mov w9, w0
				; CHECK-NEXT: subs x8, x8, #8 // =8
				; CHECK-NEXT: lsl x9, x9, #2
				; CHECK-NEXT: add w0, w0, #8 // =8
				; CHECK-NEXT: str q0, [x1, x9]
				; CHECK-NEXT: b.ne .LBB2_1
				; CHECK-NEXT: // %bb.2: // %for.end12
				; CHECK-NEXT: ret
				vector.header:
				%conv4 = zext i16 %val to i32
				%wide.trip.count = zext i32 %N to i64
				; NOTE: %min.iters.check, %1, %2 and %cmp.n have no users in this function
				; (%0 is only used by %1 and %2).
				%0 = add nsw i64 %wide.trip.count, -1
				%min.iters.check = icmp ult i32 %N, 8
				%1 = trunc i64 %0 to i32
				%2 = icmp ugt i64 %0, 4294967295
				; Loop bound: %wide.trip.count masked with 4294967288 (0xfffffff8).
				%n.vec = and i64 %wide.trip.count, 4294967288
				; First mul operand: %conv4 broadcast to all four lanes.
				%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
				%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
				%cmp.n = icmp eq i64 %n.vec, %wide.trip.count
				br label %vector.body

				vector.body: ; preds = %vector.header, %vector.body
				%index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
				; Second mul operand: the extend, insert and shuffle all sit inside the
				; loop, but their only inputs are %val and %broadcast.splat.
				%splat.input.ext = zext i16 %val to i32
				%broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %splat.input.ext, i32 0
				%broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> %broadcast.splat, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
				; NOTE(review): the next line is reviewer text captured inline by the page
				; export; it is not part of the IR and is kept verbatim.
				dmgreenUnsubmitted Done Reply Inline Actions Try loading the value in the loop, to keep the mul loop-variant: vector.body: ; preds = %vector.header, %vector.body %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ] %g = getelementptr inbounds i16, i16* %A, i64 %index %val1 = load i16, i16* %g %splat.input.ext = zext i16 %val1 to i32 %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %splat.input.ext, i32 0 %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> %broadcast.splat, <4 x i32> <i32 0, i32 0, i32 0, i32 0> dmgreen: Try loading the value in the loop, to keep the mul loop-variant: ``` vector.body…
				; Store offset into %C is %N + trunc(%index), zero-extended to i64.
				%3 = trunc i64 %index to i32
				%4 = add i32 %N, %3
				%5 = zext i32 %4 to i64
				; The multiply of the two shuffled vectors.
				%6 = mul nuw nsw <4 x i32> %broadcast.splat, %broadcast.splat32
				; One <4 x i32> store per iteration, while the index advances by 8.
				%7 = getelementptr inbounds i32, i32* %C, i64 %5
				%8 = bitcast i32* %7 to <4 x i32>*
				store <4 x i32> %6, <4 x i32>* %8, align 4
				%index.next = add i64 %index, 8
				%9 = icmp eq i64 %index.next, %n.vec
				br i1 %9, label %for.end12, label %vector.body

				for.end12: ; preds = %vector.body
				ret void
				}

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Attempt to sink mul operands
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 315767

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Attempt to sink mul operands (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 315767

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll

[AArch64] Attempt to sink mul operands
ClosedPublic