diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -134,8 +134,20 @@
     for (const Instruction &I : BB) {
       if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
         Type *Ty = AI->getAllocatedType();
+        Align TyPrefAlign = MF->getDataLayout().getPrefTypeAlign(Ty);
+        // The "specified" alignment is the alignment written on the alloca,
+        // or the preferred alignment of the type if none is specified.
+        //
+        // (Unspecified alignment on allocas will be going away soon.)
+        Align SpecifiedAlign = AI->getAlign() ? *AI->getAlign() : TyPrefAlign;
+
+        // If the preferred alignment of the type is higher than the specified
+        // alignment of the alloca, promote the alignment, as long as it doesn't
+        // require realigning the stack.
+        //
+        // FIXME: Do we really want to second-guess the IR in isel?
         Align Alignment =
-            max(MF->getDataLayout().getPrefTypeAlign(Ty), AI->getAlign());
+            std::max(std::min(TyPrefAlign, StackAlign), SpecifiedAlign);

         // Static allocas can be folded into the initial stack frame
         // adjustment. For targets that don't realign the stack, don't
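A minimal standalone sketch of the rule this hunk introduces, with plain
integer byte counts standing in for LLVM's Align type; the helper name
promotedAllocaAlign is illustrative and not part of the patch:

    #include <algorithm>
    #include <cstdint>

    // Promote an alloca toward the type's preferred alignment, but clamp
    // the promotion at the stack alignment so it can never force dynamic
    // stack realignment; an explicitly requested alignment still wins.
    uint64_t promotedAllocaAlign(uint64_t TyPrefAlign, uint64_t StackAlign,
                                 uint64_t SpecifiedAlign) {
      return std::max(std::min(TyPrefAlign, StackAlign), SpecifiedAlign);
    }

    // E.g. the <16 x float> alloca in avx512-intel-ocl.ll below
    // (preferred alignment 64, stack alignment 16):
    //   written "align 16":  max(min(64, 16), 16) = 16, no realignment;
    //   no alignment written: SpecifiedAlign defaults to TyPrefAlign,
    //                         so max(min(64, 16), 64) = 64, as before.

This is also why the test updates below raise the explicit align on their
allocas: the old code promoted an explicit alignment to the type's preferred
alignment unconditionally, so tests that relied on that over-alignment now
have to request it in the IR.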
diff --git a/llvm/test/CodeGen/AArch64/sve-alloca-stackid.ll b/llvm/test/CodeGen/AArch64/sve-alloca-stackid.ll
--- a/llvm/test/CodeGen/AArch64/sve-alloca-stackid.ll
+++ b/llvm/test/CodeGen/AArch64/sve-alloca-stackid.ll
@@ -15,3 +15,18 @@
   ret i32 %res
 }
 declare i32 @bar(<vscale x 16 x i8>* %ptr);
+
+; CHECKCG-LABEL: foo2:
+; CHECKCG: addvl sp, sp, #-2
+
+; CHECKISEL-LABEL: name: foo2
+; CHECKISEL: stack:
+; CHECKISEL: id: 0, name: ptr, type: default, offset: 0, size: 32, alignment: 16,
+; CHECKISEL-NEXT: stack-id: sve-vec
+
+define i32 @foo2(<vscale x 32 x i8> %val) {
+  %ptr = alloca <vscale x 32 x i8>, align 16
+  %res = call i32 @bar2(<vscale x 32 x i8>* %ptr)
+  ret i32 %res
+}
+declare i32 @bar2(<vscale x 32 x i8>* %ptr);
diff --git a/llvm/test/CodeGen/AMDGPU/private-element-size.ll b/llvm/test/CodeGen/AMDGPU/private-element-size.ll
--- a/llvm/test/CodeGen/AMDGPU/private-element-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-element-size.ll
@@ -113,7 +113,7 @@
   %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
   %index.load = load i32, i32 addrspace(1)* %gep.index
   %index = and i32 %index.load, 2
-  %alloca = alloca [2 x <8 x i32>], align 16, addrspace(5)
+  %alloca = alloca [2 x <8 x i32>], align 32, addrspace(5)
   %gep0 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>] addrspace(5)* %alloca, i32 0, i32 0
   %gep1 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>] addrspace(5)* %alloca, i32 0, i32 1
   store <8 x i32> zeroinitializer, <8 x i32> addrspace(5)* %gep0
diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
--- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
@@ -51,8 +51,8 @@
 ; GCN-NEXT: BB0_2: ; %shader_eval_surface.exit
 ; GCN-NEXT:   s_endpgm
 entry:
-  %sd = alloca <1339 x i32>, align 16, addrspace(5)
-  %state = alloca <4 x i32>, align 4, addrspace(5)
+  %sd = alloca <1339 x i32>, align 8192, addrspace(5)
+  %state = alloca <4 x i32>, align 16, addrspace(5)
   %rslt = call i32 @svm_eval_nodes(float addrspace(5)* %kg, <1339 x i32> addrspace(5)* %sd, <4 x i32> addrspace(5)* %state, i32 0, i32 4194304)
   %cmp = icmp eq i32 %rslt, 0
   br i1 %cmp, label %shader_eval_surface.exit, label %if.then4.i
diff --git a/llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll b/llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll
--- a/llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll
+++ b/llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll
@@ -34,7 +34,7 @@
 ; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
 ; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
 ; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
- %retval = alloca <16 x float>, align 16
+ %retval = alloca <16 x float>, align 64
 %0 = load <16 x float>, <16 x float>* @T3_retval, align 16
 store <16 x float> %0, <16 x float>* %retval
 %1 = load <16 x float>, <16 x float>* %retval
@@ -73,7 +73,7 @@
 ; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]

-%retval = alloca <16 x float>, align 16
+%retval = alloca <16 x float>, align 64
 %0 = load <16 x float>, <16 x float>* @T3_retval, align 16
 store <16 x float> %0, <16 x float>* %retval
 %1 = load <16 x float>, <16 x float>* %retval
diff --git a/llvm/test/CodeGen/Thumb2/mve-basic.ll b/llvm/test/CodeGen/Thumb2/mve-basic.ll
--- a/llvm/test/CodeGen/Thumb2/mve-basic.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-basic.ll
@@ -44,18 +44,12 @@
 define arm_aapcs_vfpcc <16 x i8> @stack_slot_handling(<16 x i8> %a) #0 {
 ; CHECK-LABEL: stack_slot_handling:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    push {r4, r6, r7, lr}
-; CHECK-NEXT:    add r7, sp, #8
 ; CHECK-NEXT:    sub sp, #16
-; CHECK-NEXT:    mov r4, sp
-; CHECK-NEXT:    bfc r4, #0, #4
-; CHECK-NEXT:    mov sp, r4
 ; CHECK-NEXT:    mov r0, sp
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    sub.w r4, r7, #8
-; CHECK-NEXT:    mov sp, r4
-; CHECK-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    bx lr
 entry:
   %a.addr = alloca <16 x i8>, align 8
   store <16 x i8> %a, <16 x i8>* %a.addr, align 8
diff --git a/llvm/test/CodeGen/X86/arg-copy-elide.ll b/llvm/test/CodeGen/X86/arg-copy-elide.ll
--- a/llvm/test/CodeGen/X86/arg-copy-elide.ll
+++ b/llvm/test/CodeGen/X86/arg-copy-elide.ll
@@ -53,22 +53,18 @@
 }

 ; CHECK-LABEL: _split_i64:
-; CHECK: pushl %ebp
-; CHECK: movl %esp, %ebp
 ; CHECK: pushl %[[csr2:[^ ]*]]
 ; CHECK: pushl %[[csr1:[^ ]*]]
-; CHECK: andl $-8, %esp
-; CHECK-DAG: movl 8(%ebp), %[[csr1]]
-; CHECK-DAG: movl 12(%ebp), %[[csr2]]
-; CHECK-DAG: leal 8(%ebp), %[[reg:[^ ]*]]
+; CHECK-DAG: movl 12(%esp), %[[csr1]]
+; CHECK-DAG: movl 16(%esp), %[[csr2]]
+; CHECK-DAG: leal 12(%esp), %[[reg:[^ ]*]]
 ; CHECK: pushl %[[reg]]
 ; CHECK: calll _addrof_i64
+; CHECK: addl $4, %esp
 ; CHECK-DAG: movl %[[csr1]], %eax
 ; CHECK-DAG: movl %[[csr2]], %edx
-; CHECK: leal -8(%ebp), %esp
 ; CHECK: popl %[[csr1]]
 ; CHECK: popl %[[csr2]]
-; CHECK: popl %ebp
 ; CHECK: retl

 define i1 @i1_arg(i1 %x) {
@@ -101,16 +97,13 @@
 }

 ; CHECK-LABEL: _fastcc_split_i64:
-; CHECK: pushl %ebp
-; CHECK: movl %esp, %ebp
 ; CHECK-DAG: movl %edx, %[[r1:[^ ]*]]
-; CHECK-DAG: movl 8(%ebp), %[[r2:[^ ]*]]
+; CHECK-DAG: movl 20(%esp), %[[r2:[^ ]*]]
 ; CHECK-DAG: movl %[[r2]], 4(%esp)
 ; CHECK-DAG: movl %edx, (%esp)
 ; CHECK: movl %esp, %[[reg:[^ ]*]]
 ; CHECK: pushl %[[reg]]
 ; CHECK: calll _addrof_i64
-; CHECK: popl %ebp
 ; CHECK: retl
diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
--- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -1164,9 +1164,9 @@
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 eintry:
-  %__a.addr.i = alloca <4 x i64>, align 16
-  %__b.addr.i = alloca <4 x i64>, align 16
-  %vCr = alloca <4 x i64>, align 16
+  %__a.addr.i = alloca <4 x i64>, align 32
+  %__b.addr.i = alloca <4 x i64>, align 32
+  %vCr = alloca <4 x i64>, align 32
   store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
   %tmp = load <4 x i64>, <4 x i64>* %vCr, align 16
   %tmp2 = load i8, i8* %cV_R.addr, align 4
@@ -1255,9 +1255,9 @@
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 eintry:
-  %__a.addr.i = alloca <4 x i64>, align 16
-  %__b.addr.i = alloca <4 x i64>, align 16
-  %vCr = alloca <4 x i64>, align 16
+  %__a.addr.i = alloca <4 x i64>, align 32
+  %__b.addr.i = alloca <4 x i64>, align 32
+  %vCr = alloca <4 x i64>, align 32
   store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
   %tmp = load <4 x i64>, <4 x i64>* %vCr, align 16
   %tmp2 = load i16, i16* %cV_R.addr, align 4
@@ -1346,9 +1346,9 @@
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 eintry:
-  %__a.addr.i = alloca <4 x i64>, align 16
-  %__b.addr.i = alloca <4 x i64>, align 16
-  %vCr = alloca <4 x i64>, align 16
+  %__a.addr.i = alloca <4 x i64>, align 32
+  %__b.addr.i = alloca <4 x i64>, align 32
+  %vCr = alloca <4 x i64>, align 32
   store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
   %tmp = load <4 x i64>, <4 x i64>* %vCr, align 16
   %tmp2 = load i32, i32* %cV_R.addr, align 4
@@ -1436,9 +1436,9 @@
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 eintry:
-  %__a.addr.i = alloca <4 x i64>, align 16
-  %__b.addr.i = alloca <4 x i64>, align 16
-  %vCr = alloca <4 x i64>, align 16
+  %__a.addr.i = alloca <4 x i64>, align 32
+  %__b.addr.i = alloca <4 x i64>, align 32
+  %vCr = alloca <4 x i64>, align 32
   store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
   %tmp = load <4 x i64>, <4 x i64>* %vCr, align 16
   %tmp2 = load i64, i64* %cV_R.addr, align 4
diff --git a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
--- a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
+++ b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
@@ -77,7 +77,7 @@
 ; X64-NEXT:    popq %r13
 ; X64-NEXT:    popq %rbp
 ; X64-NEXT:    retq
-  %y = alloca <16 x float>, align 16
+  %y = alloca <16 x float>, align 64
   %x = fadd <16 x float> %a, %b
   %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
   %2 = load <16 x float>, <16 x float>* %y, align 16
@@ -158,7 +158,7 @@
 ; X64-NEXT:    popq %r13
 ; X64-NEXT:    popq %rbp
 ; X64-NEXT:    retq
-  %y = alloca <16 x float>, align 16
+  %y = alloca <16 x float>, align 64
   %x = fadd <16 x float> %a, %b
   %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
   %2 = load <16 x float>, <16 x float>* %y, align 16
diff --git a/llvm/test/CodeGen/X86/load-local-v3i129.ll b/llvm/test/CodeGen/X86/load-local-v3i129.ll
--- a/llvm/test/CodeGen/X86/load-local-v3i129.ll
+++ b/llvm/test/CodeGen/X86/load-local-v3i129.ll
@@ -4,26 +4,20 @@
 define void @_start() {
 ; CHECK-LABEL: _start:
 ; CHECK:       # %bb.0: # %Entry
-; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset %rbp, -16
-; CHECK-NEXT:    movq %rsp, %rbp
-; CHECK-NEXT:    .cfi_def_cfa_register %rbp
-; CHECK-NEXT:    andq $-128, %rsp
-; CHECK-NEXT:    subq $256, %rsp # imm = 0x100
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; CHECK-NEXT:    shrdq $2, %rcx, %rax
 ; CHECK-NEXT:    shrq $2, %rcx
 ; CHECK-NEXT:    leaq 1(,%rax,4), %rdx
-; CHECK-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    shrdq $62, %rcx, %rax
-; CHECK-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    orq $-2, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq $-1, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %rbp, %rsp
-; CHECK-NEXT:    popq %rbp
-; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
+; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    orq $-2, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $-1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
 Entry:
   %y = alloca <3 x i129>, align 4
diff --git a/llvm/test/CodeGen/X86/movtopush.ll b/llvm/test/CodeGen/X86/movtopush.ll
--- a/llvm/test/CodeGen/X86/movtopush.ll
+++ b/llvm/test/CodeGen/X86/movtopush.ll
@@ -246,7 +246,7 @@
 entry:
   %p = alloca i32, align 4
   %q = alloca i32, align 4
-  %s = alloca %struct.s, align 4
+  %s = alloca %struct.s, align 8
   call void @good(i32 1, i32 2, i32 3, i32 4)
   %pv = ptrtoint i32* %p to i32
   %qv = ptrtoint i32* %q to i32
@@ -407,7 +407,7 @@
 define void @test14(%struct.A* %a) {
 entry:
   %ref.tmp = alloca %struct.B, align 1
-  %agg.tmp = alloca i64, align 4
+  %agg.tmp = alloca i64, align 8
   %tmpcast = bitcast i64* %agg.tmp to %struct.A*
   %tmp = alloca %struct.B, align 1
   %0 = bitcast %struct.A* %a to i64*