This is an archive of the discontinued LLVM Phabricator instance.

Fix SROA for intrinsics
AbandonedPublic

Authored by apazos on Jan 28 2015, 5:46 PM.

Download Raw Diff

Details

Reviewers

mgrang
chandlerc
resistor
mcrosier

Summary

For intrinsics which do generate lifetime start and end, evaluate all conditions before deciding whether to slice vector loads or not.

Diff Detail

Event Timeline

mgrang updated this revision to Diff 18935.Jan 28 2015, 5:46 PM

mgrang retitled this revision from to Fix SROA for intrinsics.

mgrang updated this object.

mgrang edited the test plan for this revision. (Show Details)

mgrang added reviewers: chandlerc, resistor, mcrosier, apazos.

mgrang added a subscriber: Unknown Object (MLST).

Added "target datalayout" and "target triple" to the test case .ll file

mgrang updated this revision to Diff 19421.Feb 5 2015, 11:29 AM

Mandeep,
I'll commit this shortly once I have verified the test case.

Chad

mgrang updated this revision to Diff 19432.Feb 5 2015, 2:12 PM

mgrang edited edge metadata.

Had an offline discussion with Mandeep about this change. This patch needs additional investigation.

This revision now requires changes to proceed.Mar 2 2015, 11:58 AM

I took a closer look at the degradation caused by Owen's patch on AArch64.

With Owen's patch, SROA promotes more allocas to vector values and generates a lot of scattered vector insert element instructions. But the backend is not able to optimize those scattered vector insert element instructions when there are extension operations in between the load and the insert instruction. It ends up executing a lot more vector insert instructions, degrading performance.

Here is a simple example:

x = ld
y = ld
z = insert x v1, 1
k = insert y v1, 5

Generates:
ld1 { v0.b }[1], [x0]
ld1 { v0.b }[5], [x1]

But
x = ld
ex = ext x
y = ld
ey = ext y
z = insert ex v1, 1
k = insert ey v1, 5

Generates:

ldrb     w8, [x0]
ldrb     w9, [x1]
ins    v0.h[1], w8
ins    v0.h[5], w9

Better code would be:

ld1    { v0.b }[1], [x0]
ld1    { v0.b }[5], [x1]
ushll    v0.8h, v0.8b, #0

Even though it is SROA that is generating the vector insert instructions (b.t.w., same issue with vector extract instructions), I do not think we should fix it there.

Chandler, what do you think? Should we try to generate better code from SROA?

In my opinion, we should do an IR optimization (InstCombine or even the SLP vectorizer?) to allow the backend to generate better machine code. Here is what the transformation would look like:

; Problem: The difference in element size prevents optimized code from being generated
define <8 x i16> @test_ins4(i8* %arrayidx1, i8* %arrayidx2) {

%1 = load i8* %arrayidx1
%conv1 = zext i8 %1 to i16
%2 = load i8* %arrayidx2
%conv2 = zext i8 %2 to i16
%x = insertelement <8 x i16> undef, i16 %conv1, i32 1
%y = insertelement <8 x i16> undef, i16 %conv2, i32 5
%z = add <8 x i16> %x, %y
ret <8 x i16> %z

}

; Solution: Transforming the IR to eliminate the difference in element size allowing us to generate optimized code
define <8 x i16> @test_ins5(i8* %arrayidx1, i8* %arrayidx2) {

%1 = load i8* %arrayidx1
%conv1 = zext i8 %1 to i16
%2 = load i8* %arrayidx2
%conv2 = zext i8 %2 to i16
%x = insertelement <8 x i8> undef, i8 %1, i32 1
%y = insertelement <8 x i8> %x, i8 %2, i32 5
%z = zext <8 x i8> %y to <8 x i16>
ret <8 x i16> %z

}

With all of the above I think we can close this revision. We do not need to change Owen's patch (though the logic is quite confusing in that function).

apazos abandoned this revision.Mar 12 2015, 2:16 PM

Revision Contents

Path

Size

lib/

Transforms/

Scalar/

SROA.cpp

6 lines

test/

Transforms/

ScalarRepl/

sroa-lifetime-instrinsics.ll

53 lines

Diff 19017

lib/Transforms/Scalar/SROA.cpp

Context not available.
	Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);	Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);

	Use *U = S.getUse();	Use *U = S.getUse();
		IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser());

	if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {	if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
	if (MI->isVolatile())	if (MI->isVolatile())
	return false;	return false;
	if (!S.isSplittable())	if (!S.isSplittable())
	return false; // Skip any unsplittable intrinsics.	return false; // Skip any unsplittable intrinsics.
	} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {	} else if (II && II->getIntrinsicID() != Intrinsic::lifetime_start &&
	if (II->getIntrinsicID() != Intrinsic::lifetime_start &&	II->getIntrinsicID() != Intrinsic::lifetime_end) {
	II->getIntrinsicID() != Intrinsic::lifetime_end)
	return false;	return false;
	} else if (U->get()->getType()->getPointerElementType()->isStructTy()) {	} else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
	// Disable vector promotion when there are loads or stores of an FCA.	// Disable vector promotion when there are loads or stores of an FCA.
Context not available.

test/Transforms/ScalarRepl/sroa-lifetime-instrinsics.ll

This file was added.

				; RUN: opt -S -sroa < %s | FileCheck %s
				; For intrinsics which do generate lifetime start and end evaluate all
				; conditions before deciding whether to slice vector loads or not

				target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
				target triple = "aarch64--linux-gnu"

				; Function Attrs: nounwind
				declare void @llvm.lifetime.start(i64, i8* nocapture)

				; Function Attrs: nounwind
				declare void @llvm.lifetime.end(i64, i8* nocapture)

				; Function Attrs: nounwind readonly
				define i32 @foo(i8* %n) {
				entry:
				; CHECK-LABEL: @foo
				; CHECK-NOT: %a.sroa.0.0.vec.insert = insertelement <8 x i16> undef, i16 %conv, i32 0
				; CHECK: store i16 %conv, i16* %a.sroa.0.0.arrayidx1.sroa_idx3, align 16
				; CHECK-NEXT: %a.sroa.0.0.a.sroa.0.0. = load <8 x i16>* %a.sroa.0

				%n.addr = alloca i8*, align 8
				%a = alloca [32 x i16], align 2
				%c = alloca <4 x i32>, align 16
				%__ret = alloca <8 x i16>, align 16
				%tmp = alloca <8 x i16>, align 16
				%cleanup.dest.slot = alloca i32
				store i8* %n, i8** %n.addr, align 8
				%0 = bitcast [32 x i16]* %a to i8*
				call void @llvm.lifetime.start(i64 64, i8* %0) #1
				%1 = load i8** %n.addr, align 8
				%arrayidx = getelementptr inbounds i8* %1, i64 0
				%2 = load i8* %arrayidx, align 1
				%conv = sext i8 %2 to i16
				%arrayidx1 = getelementptr inbounds [32 x i16]* %a, i32 0, i64 0
				store i16 %conv, i16* %arrayidx1, align 2
				%arraydecay = getelementptr inbounds [32 x i16]* %a, i32 0, i32 0
				%3 = bitcast i16* %arraydecay to i8*
				%4 = bitcast i8* %3 to <8 x i16>*
				%5 = load <8 x i16>* %4
				store <8 x i16> %5, <8 x i16>* %__ret, align 16
				%6 = load <8 x i16>* %__ret, align 16
				store <8 x i16> %6, <8 x i16>* %tmp
				%7 = load <8 x i16>* %tmp
				%8 = bitcast <8 x i16> %7 to <4 x i32>
				store <4 x i32> %8, <4 x i32>* %c, align 16
				%9 = load <4 x i32>* %c, align 16
				%vecext = extractelement <4 x i32> %9, i32 0
				store i32 1, i32* %cleanup.dest.slot
				%10 = bitcast [32 x i16]* %a to i8*
				call void @llvm.lifetime.end(i64 64, i8* %10) #1
				ret i32 %vecext
				}