The code in CC_ARM_AAPCS_Custom_Aggregate() handles homogeneous aggregates for CC_ARM_AAPCS_VFP. When an aggregate ends up entirely on the stack, the function tries to pack the items of the aggregate as tightly as possible: once the first item is laid out, the alignment used for each subsequent item is the size of one item.
This logic goes wrong for 128-bit vectors, whose natural alignment is normally only 64 bits: using the 128-bit item size as the alignment can insert unexpected padding between the first and second elements.
Example:
$ cat test.c
#include <arm_neon.h>

typedef struct { double A[4]; } S_d64_4;
typedef struct { uint32x4_t A[2]; } S_v128_2;

int foo(S_d64_4 P0, S_d64_4 P1, float P2, S_v128_2 P3) {
  // * P0 is passed in D0-D3.
  // * P1 is passed in D4-D7.
  // * P2 is passed in [SP, SP+4).
  // * P3.A[0] is passed in [SP+8, SP+24).
  // * P3.A[1] should be passed according to AAPCS in [SP+24, SP+40) but the
  //   code produced by Clang/LLVM expects it in [SP+32, SP+48).
  return vgetq_lane_u32(P3.A[0], 0) + vgetq_lane_u32(P3.A[1], 0);
}

$ clang -target arm-none-eabi -mcpu=cortex-a53 -S test.c -o -
[...]
foo:
        push    {r11, lr}
        mov     r11, sp
        sub     sp, sp, #8
        bfc     sp, #0, #4
        ldr     r0, [r11, #40] /* load from entry-SP + #32 */
        ldr     r1, [r11, #16] /* load from entry-SP + #8 */
        add     r0, r1, r0
        mov     sp, r11
        pop     {r11, pc}
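For concreteness, the offsets in the comments above follow from a few lines of alignment arithmetic. The snippet below is a self-contained illustration, not code from Clang/LLVM; alignTo is a local helper defined here, and all offsets are relative to the value of SP on entry to foo:

#include <cassert>
#include <cstdint>

// Round Offset up to the next multiple of Align (a power of two).
static uint64_t alignTo(uint64_t Offset, uint64_t Align) {
  return (Offset + Align - 1) & ~(Align - 1);
}

int main() {
  // P3.A[0] occupies [SP+8, SP+24), so the next free stack byte is at
  // offset 24.
  uint64_t NextFree = 8 + 16;

  // Buggy layout: subsequent items are aligned to the item size (16 bytes
  // for a 128-bit vector), pushing P3.A[1] up to offset 32.
  assert(alignTo(NextFree, 16) == 32);

  // AAPCS layout: a 128-bit vector only needs 8-byte alignment, so
  // P3.A[1] belongs at offset 24, with no padding.
  assert(alignTo(NextFree, 8) == 24);
  return 0;
}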
The proposed patch fixes the problem by updating the alignment to the item size only when doing so reduces it.
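In spirit, the change amounts to the following minimal sketch (the function and variable names are illustrative, not the identifiers used in the actual patch):

#include <algorithm>
#include <cstdint>

// Alignment to use for items after the first one. ItemAlign is the natural
// alignment of one item and ItemSize its size in bytes (hypothetical names).
uint64_t restAlignment(uint64_t ItemAlign, uint64_t ItemSize) {
  // Old behaviour (buggy): always pack to the item size. For a 128-bit
  // vector (ItemSize == 16, ItemAlign == 8) this raises the alignment
  // above the natural one and can insert padding:
  //   return ItemSize;

  // New behaviour: take the item size only when it reduces the alignment,
  // so an item is never over-aligned relative to its natural alignment.
  return std::min(ItemAlign, ItemSize);
}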