This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Transforms/Scalar/
-
Transforms/
-
Scalar/
-
AlignmentFromAssumptions.cpp
-
test/Transforms/AlignmentFromAssumptions/
-
Transforms/
-
AlignmentFromAssumptions/
-
simple.ll

Differential D66575

[AlignmentFromAssumptions] getNewAlignmentDiff(): use getURemExpr()
ClosedPublic

Authored by cjld on Aug 22 2019, 1:12 AM.

Download Raw Diff

Details

Reviewers

lebedev.ri
spatel
jdoerfert
sanjoy
hfinkel

Commits

rG3fc933af8b49: [AlignmentFromAssumptions] getNewAlignmentDiff(): use getURemExpr()
rL369723: [AlignmentFromAssumptions] getNewAlignmentDiff(): use getURemExpr()

Summary

A better way to fix the misaligned mov instruction codegen.

The alignment-from-assumptions pass doesn't generate aligned mov instructions; an example is shown below:

// b.cc
#include <cstddef>
#include <stdint.h>

typedef long long index;

extern "C" index g_tid;
extern "C" index g_num;


// Reproducer for the missed-alignment case. Over a three-level loop nest it
// stores b[i0+i2] + a[i1+i0+i2] into c[i1+i0+i2]. The outer loop's start and
// stride are derived from the external globals g_tid and g_num.
void add3(float* __restrict__ a, float* __restrict__ b, float* __restrict__ c) {
    index n = 64*1024;
    index m = 16*1024;  // m == 4*k
    index k = 4*1024;
    index tid = g_tid;
    index num = g_num;
    // Assert that each base pointer is 32-byte aligned.
    // NOTE(review): the builtin's return value is discarded here; GCC documents
    // the alignment assumption as applying to the returned pointer -- confirm
    // that this form is what the reproducer intends.
    __builtin_assume_aligned(a, 32);
    __builtin_assume_aligned(b, 32);
    __builtin_assume_aligned(c, 32);
    // i0 starts at tid*k and advances by num*k; i1 starts at 0 and advances by
    // m. Both are therefore multiples of k elements, and only the innermost i2
    // walks consecutive elements.
    for (index i0=tid*k; i0<m; i0+=num*k)
        for (index i1=0; i1<n*m; i1+=m)
            for (index i2=0; i2<k; i2++)
                c[i1+i0+i2] = b[i0+i2] + a[i1+i0+i2];
}

Compile with `clang ./b.cc -Ofast -march=native -std=c++14 -S -o b.s` (Intel i7-7500U),
which yields:

// b.s
......
	vmovaps	-224(%rdi,%rbx,4), %ymm0
	vmovups	-192(%rdi,%rbx,4), %ymm1
	vmovups	-160(%rdi,%rbx,4), %ymm2
	vmovups	-128(%rdi,%rbx,4), %ymm3
	vaddps	-224(%rsi,%rbx,4), %ymm0, %ymm0
	vaddps	-192(%rsi,%rbx,4), %ymm1, %ymm1
	vaddps	-160(%rsi,%rbx,4), %ymm2, %ymm2
	vaddps	-128(%rsi,%rbx,4), %ymm3, %ymm3
	vmovaps	%ymm0, -224(%rdx,%rbx,4)
	vmovups	%ymm1, -192(%rdx,%rbx,4)
	vmovups	%ymm2, -160(%rdx,%rbx,4)
	vmovups	%ymm3, -128(%rdx,%rbx,4)
......

Expected:

// b.s
......
	vmovaps	-224(%rdi,%rbx,4), %ymm0
	vmovaps	-192(%rdi,%rbx,4), %ymm1
	vmovaps	-160(%rdi,%rbx,4), %ymm2
	vmovaps	-128(%rdi,%rbx,4), %ymm3
	vaddps	-224(%rsi,%rbx,4), %ymm0, %ymm0
	vaddps	-192(%rsi,%rbx,4), %ymm1, %ymm1
	vaddps	-160(%rsi,%rbx,4), %ymm2, %ymm2
	vaddps	-128(%rsi,%rbx,4), %ymm3, %ymm3
	vmovaps	%ymm0, -224(%rdx,%rbx,4)
	vmovaps	%ymm1, -192(%rdx,%rbx,4)
	vmovaps	%ymm2, -160(%rdx,%rbx,4)
	vmovaps	%ymm3, -128(%rdx,%rbx,4)
......

This is because the alignment-from-assumptions pass is using the wrong function to calculate the alignment.

Diff Detail

Event Timeline

cjld created this revision. · Aug 22 2019, 1:12 AM

Herald added subscribers: llvm-commits, hiraditya. · View Herald Transcript · Aug 22 2019, 1:12 AM

cjld mentioned this in D66528: Fix misaligned mov instruction codegen by making MaxDepth in value tracking configurable. · Aug 22 2019, 1:13 AM

Test :) ?

xbolva00 added a reviewer: sanjoy. · Aug 22 2019, 2:22 AM

add test @xbolva00

Thanks, this looks like a fix, although I'm surprised by its simplicity.

LGTM.

The original code tried to do the modulo computation (as per comment and the looks of it) but the operands of DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV) were swapped.
Swapping them should yield the same result as using URem but using URem is better so this is fine.

This revision is now accepted and ready to land. · Aug 22 2019, 9:38 AM

Requested by @cjld to commit on his behalf. (I will simplify the description a bit.)

Closed by commit rL369723: [AlignmentFromAssumptions] getNewAlignmentDiff(): use getURemExpr() (authored by MaskRay). · Explain Why · Aug 22 2019, 7:19 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

Transforms/

Scalar/

AlignmentFromAssumptions.cpp

4 lines

test/

Transforms/

AlignmentFromAssumptions/

simple.ll

55 lines

Diff 216604

llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp

	Show First 20 Lines • Show All 87 Lines • ▼ Show 20 Lines
	// DiffSCEV, compute the alignment of the displaced pointer if it can be reduced			// DiffSCEV, compute the alignment of the displaced pointer if it can be reduced
	// to a constant. Using SCEV to compute alignment handles the case where			// to a constant. Using SCEV to compute alignment handles the case where
	// DiffSCEV is a recurrence with constant start such that the aligned offset			// DiffSCEV is a recurrence with constant start such that the aligned offset
	// is constant. e.g. {16,+,32} % 32 -> 16.			// is constant. e.g. {16,+,32} % 32 -> 16.
	static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV,			static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV,
	const SCEV *AlignSCEV,			const SCEV *AlignSCEV,
	ScalarEvolution *SE) {			ScalarEvolution *SE) {
	// DiffUnits = Diff % int64_t(Alignment)			// DiffUnits = Diff % int64_t(Alignment)
	const SCEV *DiffAlignDiv = SE->getUDivExpr(DiffSCEV, AlignSCEV);			const SCEV *DiffUnitsSCEV = SE->getURemExpr(DiffSCEV, AlignSCEV);
	const SCEV *DiffAlign = SE->getMulExpr(DiffAlignDiv, AlignSCEV);
	const SCEV *DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV);

	LLVM_DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is "			LLVM_DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is "
	<< DiffUnitsSCEV << " (diff: " << DiffSCEV << ")\n");			<< DiffUnitsSCEV << " (diff: " << DiffSCEV << ")\n");

	if (const SCEVConstant *ConstDUSCEV =			if (const SCEVConstant *ConstDUSCEV =
	dyn_cast<SCEVConstant>(DiffUnitsSCEV)) {			dyn_cast<SCEVConstant>(DiffUnitsSCEV)) {
	int64_t DiffUnits = ConstDUSCEV->getValue()->getSExtValue();			int64_t DiffUnits = ConstDUSCEV->getValue()->getSExtValue();

	▲ Show 20 Lines • Show All 309 Lines • Show Last 20 Lines

llvm/test/Transforms/AlignmentFromAssumptions/simple.ll

Show First 20 Lines • Show All 84 Lines • ▼ Show 20 Lines	for.end: ; preds = %for.body
%add.lcssa = phi i32 [ %add, %for.body ]		%add.lcssa = phi i32 [ %add, %for.body ]
ret i32 %add.lcssa		ret i32 %add.lcssa

; CHECK-LABEL: @hoo		; CHECK-LABEL: @hoo
; CHECK: load i32, i32* %arrayidx, align 32		; CHECK: load i32, i32* %arrayidx, align 32
; CHECK: ret i32 %add.lcssa		; CHECK: ret i32 %add.lcssa
}		}

		; test D66575
		; def hoo2(a, id, num):
		; for i0 in range(id*64, 4096, num*64):
		; for i1 in range(0, 4096, 32):
		; for i2 in range(0, 4096, 32):
		; load(a, i0+i1+i2+32)
		define void @hoo2(i32* nocapture %a, i64 %id, i64 %num) nounwind uwtable readonly {
		entry:
		%ptrint = ptrtoint i32* %a to i64
		%maskedptr = and i64 %ptrint, 31
		%maskcond = icmp eq i64 %maskedptr, 0
		tail call void @llvm.assume(i1 %maskcond)
		%id.mul = shl nsw i64 %id, 6
		%num.mul = shl nsw i64 %num, 6
		br label %for0.body

		for0.body:
		%i0 = phi i64 [ %id.mul, %entry ], [ %i0.next, %for0.end ]
		br label %for1.body

		for1.body:
		%i1 = phi i64 [ 0, %for0.body ], [ %i1.next, %for1.end ]
		br label %for2.body

		for2.body:
		%i2 = phi i64 [ 0, %for1.body ], [ %i2.next, %for2.body ]

		%t1 = add nuw nsw i64 %i0, %i1
		%t2 = add nuw nsw i64 %t1, %i2
		%t3 = add nuw nsw i64 %t2, 32
		%arrayidx = getelementptr inbounds i32, i32* %a, i64 %t3
		%x = load i32, i32* %arrayidx, align 4

		%i2.next = add nuw nsw i64 %i2, 32
		%cmp2 = icmp ult i64 %i2.next, 4096
		br i1 %cmp2, label %for2.body, label %for1.end

		for1.end:
		%i1.next = add nuw nsw i64 %i1, 32
		%cmp1 = icmp ult i64 %i1.next, 4096
		br i1 %cmp1, label %for1.body, label %for0.end

		for0.end:
		%i0.next = add nuw nsw i64 %i0, %num.mul
		%cmp0 = icmp ult i64 %i0.next, 4096
		br i1 %cmp0, label %for0.body, label %return

		return:
		ret void

		; CHECK-LABEL: @hoo2
		; CHECK: load i32, i32* %arrayidx, align 32
		; CHECK: ret void
		}

define i32 @joo(i32* nocapture %a) nounwind uwtable readonly {		define i32 @joo(i32* nocapture %a) nounwind uwtable readonly {
entry:		entry:
%ptrint = ptrtoint i32* %a to i64		%ptrint = ptrtoint i32* %a to i64
%maskedptr = and i64 %ptrint, 31		%maskedptr = and i64 %ptrint, 31
%maskcond = icmp eq i64 %maskedptr, 0		%maskcond = icmp eq i64 %maskedptr, 0
tail call void @llvm.assume(i1 %maskcond)		tail call void @llvm.assume(i1 %maskcond)
br label %for.body		br label %for.body

▲ Show 20 Lines • Show All 116 Lines • Show Last 20 Lines