Index: llvm/include/llvm/CodeGen/SelectionDAG.h =================================================================== --- llvm/include/llvm/CodeGen/SelectionDAG.h +++ llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1608,6 +1608,12 @@ void dump() const; + /// In most cases this function returns the ABI alignment for a given type, + /// except for illegal vector types where the alignment exceeds that of the + /// stack. In such cases we attempt to break the vector down to a legal type + /// and return the ABI alignment for that instead. + Align getReducedAlign(EVT VT, bool UseABI); + /// Create a stack temporary based on the size in bytes and the alignment SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment); Index: llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -883,12 +883,19 @@ SDLoc dl(Op); // Create the stack frame object. Make sure it is aligned for both // the source and destination types. - SDValue StackPtr = DAG.CreateStackTemporary(Op.getValueType(), DestVT); + + // In cases where the vector is illegal it will be broken down into parts + // and stored in parts - we should use the alignment for the smallest part. + Align DestAlign = DAG.getReducedAlign(DestVT, /*UseABI=*/false); + Align OpAlign = DAG.getReducedAlign(Op.getValueType(), /*UseABI=*/false); + Align Align = std::max(DestAlign, OpAlign); + SDValue StackPtr = + DAG.CreateStackTemporary(Op.getValueType().getStoreSize(), Align); // Emit a store to the stack slot. - SDValue Store = - DAG.getStore(DAG.getEntryNode(), dl, Op, StackPtr, MachinePointerInfo()); + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op, StackPtr, + MachinePointerInfo(), Align); // Result is a load from the stack slot. - return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo()); + return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo(), Align); } /// Replace the node's results with custom code provided by the target and Index: llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -154,9 +154,13 @@ // Create the stack frame object. Make sure it is aligned for both // the source and expanded destination types. - Align Alignment = DAG.getDataLayout().getPrefTypeAlign( - NOutVT.getTypeForEVT(*DAG.getContext())); - SDValue StackPtr = DAG.CreateStackTemporary(InVT, Alignment.value()); + + // In cases where the vector is illegal it will be broken down into parts + // and stored in parts - we should use the alignment for the smallest part. + Align InAlign = DAG.getReducedAlign(InVT, /*UseABI=*/false); + Align NOutAlign = DAG.getReducedAlign(NOutVT, /*UseABI=*/false); + Align Align = std::max(InAlign, NOutAlign); + SDValue StackPtr = DAG.CreateStackTemporary(InVT.getStoreSize(), Align); int SPFI = cast(StackPtr.getNode())->getIndex(); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); @@ -165,7 +169,7 @@ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, PtrInfo); // Load the first half from the stack slot. - Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo, Alignment); + Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo, NOutAlign); // Increment the pointer to the other half. unsigned IncrementSize = NOutVT.getSizeInBits() / 8; @@ -173,7 +177,7 @@ // Load the second half from the stack slot. Hi = DAG.getLoad(NOutVT, dl, Store, StackPtr, - PtrInfo.getWithOffset(IncrementSize), Alignment); + PtrInfo.getWithOffset(IncrementSize), NOutAlign); // Handle endianness of the load. if (TLI.hasBigEndianPartOrdering(OutVT, DAG.getDataLayout())) Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1156,15 +1156,17 @@ } // Spill the vector to the stack. - SDValue StackPtr = DAG.CreateStackTemporary(VecVT); + // In cases where the vector is illegal it will be broken down into parts + // and stored in parts - we should use the alignment for the smallest part. + Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false); + SDValue StackPtr = + DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign); auto &MF = DAG.getMachineFunction(); auto FrameIndex = cast(StackPtr.getNode())->getIndex(); auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex); - Type *VecType = VecVT.getTypeForEVT(*DAG.getContext()); - Align Alignment = DAG.getDataLayout().getPrefTypeAlign(VecType); - SDValue Store = - DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo, Alignment); + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo, + SmallestAlign); // Store the new subvector into the specified index. SDValue SubVecPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); @@ -1172,7 +1174,8 @@ MachinePointerInfo::getUnknownStack(MF)); // Load the Lo part from the stack slot. - Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, PtrInfo, Alignment); + Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, PtrInfo, + SmallestAlign); // Increment the pointer to the other part. unsigned IncrementSize = Lo.getValueSizeInBits() / 8; @@ -1180,7 +1183,7 @@ // Load the Hi part from the stack slot. Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, - PtrInfo.getWithOffset(IncrementSize), Alignment); + PtrInfo.getWithOffset(IncrementSize), SmallestAlign); } void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, @@ -1452,15 +1455,17 @@ } // Spill the vector to the stack. - SDValue StackPtr = DAG.CreateStackTemporary(VecVT); + // In cases where the vector is illegal it will be broken down into parts + // and stored in parts - we should use the alignment for the smallest part. + Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false); + SDValue StackPtr = + DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign); auto &MF = DAG.getMachineFunction(); auto FrameIndex = cast(StackPtr.getNode())->getIndex(); auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex); - Type *VecType = VecVT.getTypeForEVT(*DAG.getContext()); - Align Alignment = DAG.getDataLayout().getPrefTypeAlign(VecType); - SDValue Store = - DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo, Alignment); + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo, + SmallestAlign); // Store the new element. This may be larger than the vector element type, // so use a truncating store. @@ -1472,7 +1477,7 @@ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT); // Load the Lo part from the stack slot. - Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo, Alignment); + Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo, SmallestAlign); // Increment the pointer to the other part. unsigned IncrementSize = LoVT.getSizeInBits() / 8; @@ -1480,7 +1485,7 @@ // Load the Hi part from the stack slot. Hi = DAG.getLoad(HiVT, dl, Store, StackPtr, - PtrInfo.getWithOffset(IncrementSize), Alignment); + PtrInfo.getWithOffset(IncrementSize), SmallestAlign); // If we adjusted the original type, we need to truncate the results. std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); @@ -2221,11 +2226,16 @@ } // Store the vector to the stack. - SDValue StackPtr = DAG.CreateStackTemporary(VecVT); + // In cases where the vector is illegal it will be broken down into parts + // and stored in parts - we should use the alignment for the smallest part. + Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false); + SDValue StackPtr = + DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign); auto &MF = DAG.getMachineFunction(); auto FrameIndex = cast(StackPtr.getNode())->getIndex(); auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex); - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo); + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo, + SmallestAlign); // Load back the required element. StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1993,6 +1993,34 @@ MachinePointerInfo(VD)); } +Align SelectionDAG::getReducedAlign(EVT VT, bool UseABI) { + const DataLayout &DL = getDataLayout(); + Type *Ty = VT.getTypeForEVT(*getContext()); + Align RedAlign = UseABI ? DL.getABITypeAlign(Ty) : DL.getPrefTypeAlign(Ty); + + if (TLI->isTypeLegal(VT) || !VT.isVector()) + return RedAlign; + + const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); + const Align StackAlign = TFI->getStackAlign(); + + // See if we can choose a smaller ABI alignment in cases where it's an + // illegal vector type that will get broken down. + if (RedAlign > StackAlign) { + EVT IntermediateVT; + MVT RegisterVT; + unsigned NumIntermediates; + unsigned NumRegs = TLI->getVectorTypeBreakdown( + *getContext(), VT, IntermediateVT, NumIntermediates, RegisterVT); + Ty = IntermediateVT.getTypeForEVT(*getContext()); + Align RedAlign2 = UseABI ? DL.getABITypeAlign(Ty) : DL.getPrefTypeAlign(Ty); + if (RedAlign2 < RedAlign) + RedAlign = RedAlign2; + } + + return RedAlign; +} + SDValue SelectionDAG::CreateStackTemporary(TypeSize Bytes, Align Alignment) { MachineFrameInfo &MFI = MF->getFrameInfo(); int FrameIdx = MFI.CreateStackObject(Bytes, Alignment, false); Index: llvm/test/CodeGen/AArch64/build-one-lane.ll =================================================================== --- llvm/test/CodeGen/AArch64/build-one-lane.ll +++ llvm/test/CodeGen/AArch64/build-one-lane.ll @@ -270,3 +270,15 @@ ; CHECK: mov v[[R]].d[1], v{{[0-9]+}}.d[0] ; CHECK: str q[[R]], [x{{[0-9]+}}] } + +; In this test the illegal type has a preferred alignment greater than the +; stack alignment, that gets reduced to the alignment of a broken down +; legal type. +define <32 x i8> @test_lanex_32xi8(<32 x i8> %a, i32 %x) { +; CHECK-LABEL: test_lanex_32xi8 +; CHECK: stp q0, q1, [sp, #-32]! +; CHECK: ldp q0, q1, [sp], #32 + %b = insertelement <32 x i8> %a, i8 30, i32 %x + ret <32 x i8> %b +} + Index: llvm/test/CodeGen/AMDGPU/scratch-simple.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -26,8 +26,8 @@ ; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] ; GCN-NOT: s_mov_b32 s0 -; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]] -; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]] +; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x280, [[CLAMP_IDX]] +; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]] ; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen Index: llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -25,7 +25,7 @@ ; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill ; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload ; GCN: NumVgprs: 256 -; GCN: ScratchSize: 1536 +; GCN: ScratchSize: 768 define amdgpu_vs void @main([9 x <4 x i32>] addrspace(4)* inreg %arg, [17 x <4 x i32>] addrspace(4)* inreg %arg1, [17 x <4 x i32>] addrspace(4)* inreg %arg2, [34 x <8 x i32>] addrspace(4)* inreg %arg3, [16 x <4 x i32>] addrspace(4)* inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { bb: Index: llvm/test/CodeGen/X86/avx512-insert-extract.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -1768,8 +1768,8 @@ ; KNL-NEXT: .cfi_offset %rbp, -16 ; KNL-NEXT: movq %rsp, %rbp ; KNL-NEXT: .cfi_def_cfa_register %rbp -; KNL-NEXT: andq $-128, %rsp -; KNL-NEXT: subq $256, %rsp ## imm = 0x100 +; KNL-NEXT: andq $-64, %rsp +; KNL-NEXT: subq $192, %rsp ; KNL-NEXT: movl 744(%rbp), %eax ; KNL-NEXT: andl $127, %eax ; KNL-NEXT: vmovd %edi, %xmm0 @@ -1939,8 +1939,8 @@ ; SKX-NEXT: .cfi_offset %rbp, -16 ; SKX-NEXT: movq %rsp, %rbp ; SKX-NEXT: .cfi_def_cfa_register %rbp -; SKX-NEXT: andq $-128, %rsp -; SKX-NEXT: subq $256, %rsp ## imm = 0x100 +; SKX-NEXT: andq $-64, %rsp +; SKX-NEXT: subq $192, %rsp ; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SKX-NEXT: vpinsrb $1, 232(%rbp), %xmm0, %xmm0 ; SKX-NEXT: vpinsrb $2, 240(%rbp), %xmm0, %xmm0 @@ -2076,8 +2076,8 @@ ; KNL-NEXT: .cfi_offset %rbp, -16 ; KNL-NEXT: movq %rsp, %rbp ; KNL-NEXT: .cfi_def_cfa_register %rbp -; KNL-NEXT: andq $-128, %rsp -; KNL-NEXT: subq $256, %rsp ## imm = 0x100 +; KNL-NEXT: andq $-64, %rsp +; KNL-NEXT: subq $192, %rsp ; KNL-NEXT: ## kill: def $esi killed $esi def $rsi ; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3 @@ -2153,8 +2153,8 @@ ; SKX-NEXT: .cfi_offset %rbp, -16 ; SKX-NEXT: movq %rsp, %rbp ; SKX-NEXT: .cfi_def_cfa_register %rbp -; SKX-NEXT: andq $-128, %rsp -; SKX-NEXT: subq $256, %rsp ## imm = 0x100 +; SKX-NEXT: andq $-64, %rsp +; SKX-NEXT: subq $192, %rsp ; SKX-NEXT: ## kill: def $esi killed $esi def $rsi ; SKX-NEXT: vptestmb %zmm0, %zmm0, %k0 ; SKX-NEXT: vptestmb %zmm1, %zmm1, %k1 Index: llvm/test/CodeGen/X86/extractelement-index.ll =================================================================== --- llvm/test/CodeGen/X86/extractelement-index.ll +++ llvm/test/CodeGen/X86/extractelement-index.ll @@ -443,16 +443,10 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind { ; SSE-LABEL: extractelement_v32i8_var: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: movq %rsp, %rbp -; SSE-NEXT: andq $-32, %rsp -; SSE-NEXT: subq $64, %rsp ; SSE-NEXT: andl $31, %edi -; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, (%rsp) -; SSE-NEXT: movb (%rsp,%rdi), %al -; SSE-NEXT: movq %rbp, %rsp -; SSE-NEXT: popq %rbp +; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movb -40(%rsp,%rdi), %al ; SSE-NEXT: retq ; ; AVX-LABEL: extractelement_v32i8_var: @@ -493,16 +487,10 @@ define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind { ; SSE-LABEL: extractelement_v16i16_var: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: movq %rsp, %rbp -; SSE-NEXT: andq $-32, %rsp -; SSE-NEXT: subq $64, %rsp ; SSE-NEXT: andl $15, %edi -; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, (%rsp) -; SSE-NEXT: movzwl (%rsp,%rdi,2), %eax -; SSE-NEXT: movq %rbp, %rsp -; SSE-NEXT: popq %rbp +; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movzwl -40(%rsp,%rdi,2), %eax ; SSE-NEXT: retq ; ; AVX-LABEL: extractelement_v16i16_var: @@ -543,16 +531,10 @@ define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind { ; SSE-LABEL: extractelement_v8i32_var: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: movq %rsp, %rbp -; SSE-NEXT: andq $-32, %rsp -; SSE-NEXT: subq $64, %rsp ; SSE-NEXT: andl $7, %edi -; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, (%rsp) -; SSE-NEXT: movl (%rsp,%rdi,4), %eax -; SSE-NEXT: movq %rbp, %rsp -; SSE-NEXT: popq %rbp +; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl -40(%rsp,%rdi,4), %eax ; SSE-NEXT: retq ; ; AVX-LABEL: extractelement_v8i32_var: @@ -593,16 +575,10 @@ define i64 @extractelement_v4i64_var(<4 x i64> %a, i256 %i) nounwind { ; SSE-LABEL: extractelement_v4i64_var: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: movq %rsp, %rbp -; SSE-NEXT: andq $-32, %rsp -; SSE-NEXT: subq $64, %rsp ; SSE-NEXT: andl $3, %edi -; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, (%rsp) -; SSE-NEXT: movq (%rsp,%rdi,8), %rax -; SSE-NEXT: movq %rbp, %rsp -; SSE-NEXT: popq %rbp +; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq -40(%rsp,%rdi,8), %rax ; SSE-NEXT: retq ; ; AVX-LABEL: extractelement_v4i64_var: Index: llvm/test/CodeGen/X86/half.ll =================================================================== --- llvm/test/CodeGen/X86/half.ll +++ llvm/test/CodeGen/X86/half.ll @@ -382,16 +382,16 @@ ; CHECK-LIBCALL-NEXT: subq $88, %rsp ; CHECK-LIBCALL-NEXT: movl (%rdi), %eax ; CHECK-LIBCALL-NEXT: movl 4(%rdi), %ecx -; CHECK-LIBCALL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; CHECK-LIBCALL-NEXT: movl %eax, (%rsp) ; CHECK-LIBCALL-NEXT: movl %ecx, {{[0-9]+}}(%rsp) -; CHECK-LIBCALL-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0 ; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-LIBCALL-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 -; CHECK-LIBCALL-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-LIBCALL-NEXT: pextrw $1, %xmm0, %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee ; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-LIBCALL-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee ; CHECK-LIBCALL-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -400,11 +400,11 @@ ; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-LIBCALL-NEXT: pextrw $1, %xmm0, %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee -; CHECK-LIBCALL-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee -; CHECK-LIBCALL-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-LIBCALL-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-LIBCALL-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0] Index: llvm/test/CodeGen/X86/i64-mem-copy.ll =================================================================== --- llvm/test/CodeGen/X86/i64-mem-copy.ll +++ llvm/test/CodeGen/X86/i64-mem-copy.ll @@ -109,34 +109,28 @@ define void @PR23476(<5 x i64> %in, i64* %out, i32 %index) nounwind { ; X64-LABEL: PR23476: ; X64: # %bb.0: -; X64-NEXT: pushq %rbp -; X64-NEXT: movq %rsp, %rbp -; X64-NEXT: andq $-64, %rsp -; X64-NEXT: subq $128, %rsp ; X64-NEXT: movq %rsi, %xmm0 ; X64-NEXT: movq %rdi, %xmm1 ; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; X64-NEXT: movq %rcx, %xmm0 ; X64-NEXT: movq %rdx, %xmm2 ; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; X64-NEXT: movl 16(%rbp), %eax +; X64-NEXT: movl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: andl $7, %eax ; X64-NEXT: movq %r8, %xmm0 -; X64-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) -; X64-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) -; X64-NEXT: movdqa %xmm1, (%rsp) -; X64-NEXT: movq (%rsp,%rax,8), %rax +; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) +; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: movq -72(%rsp,%rax,8), %rax ; X64-NEXT: movq %rax, (%r9) -; X64-NEXT: movq %rbp, %rsp -; X64-NEXT: popq %rbp ; X64-NEXT: retq ; ; X32-LABEL: PR23476: ; X32: # %bb.0: ; X32-NEXT: pushl %ebp ; X32-NEXT: movl %esp, %ebp -; X32-NEXT: andl $-64, %esp -; X32-NEXT: subl $128, %esp +; X32-NEXT: andl $-16, %esp +; X32-NEXT: subl $80, %esp ; X32-NEXT: movl 52(%ebp), %eax ; X32-NEXT: andl $7, %eax ; X32-NEXT: movl 48(%ebp), %ecx @@ -156,8 +150,8 @@ ; X32AVX: # %bb.0: ; X32AVX-NEXT: pushl %ebp ; X32AVX-NEXT: movl %esp, %ebp -; X32AVX-NEXT: andl $-64, %esp -; X32AVX-NEXT: subl $128, %esp +; X32AVX-NEXT: andl $-32, %esp +; X32AVX-NEXT: subl $96, %esp ; X32AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X32AVX-NEXT: movl 52(%ebp), %eax ; X32AVX-NEXT: andl $7, %eax Index: llvm/test/CodeGen/X86/insertelement-var-index.ll =================================================================== --- llvm/test/CodeGen/X86/insertelement-var-index.ll +++ llvm/test/CodeGen/X86/insertelement-var-index.ll @@ -262,17 +262,11 @@ define <32 x i8> @arg_i8_v32i8(i8 %x, i32 %y) nounwind { ; SSE-LABEL: arg_i8_v32i8: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: movq %rsp, %rbp -; SSE-NEXT: andq $-32, %rsp -; SSE-NEXT: subq $64, %rsp ; SSE-NEXT: # kill: def $esi killed $esi def $rsi ; SSE-NEXT: andl $31, %esi -; SSE-NEXT: movb %dil, (%rsp,%rsi) -; SSE-NEXT: movaps (%rsp), %xmm0 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movq %rbp, %rsp -; SSE-NEXT: popq %rbp +; SSE-NEXT: movb %dil, -40(%rsp,%rsi) +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: arg_i8_v32i8: @@ -295,17 +289,11 @@ define <16 x i16> @arg_i16_v16i16(i16 %x, i32 %y) nounwind { ; SSE-LABEL: arg_i16_v16i16: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: movq %rsp, %rbp -; SSE-NEXT: andq $-32, %rsp -; SSE-NEXT: subq $64, %rsp ; SSE-NEXT: # kill: def $esi killed $esi def $rsi ; SSE-NEXT: andl $15, %esi -; SSE-NEXT: movw %di, (%rsp,%rsi,2) -; SSE-NEXT: movaps (%rsp), %xmm0 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movq %rbp, %rsp -; SSE-NEXT: popq %rbp +; SSE-NEXT: movw %di, -40(%rsp,%rsi,2) +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: arg_i16_v16i16: @@ -328,17 +316,11 @@ define <8 x i32> @arg_i32_v8i32(i32 %x, i32 %y) nounwind { ; SSE-LABEL: arg_i32_v8i32: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: movq %rsp, %rbp -; SSE-NEXT: andq $-32, %rsp -; SSE-NEXT: subq $64, %rsp ; SSE-NEXT: # kill: def $esi killed $esi def $rsi ; SSE-NEXT: andl $7, %esi -; SSE-NEXT: movl %edi, (%rsp,%rsi,4) -; SSE-NEXT: movaps (%rsp), %xmm0 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movq %rbp, %rsp -; SSE-NEXT: popq %rbp +; SSE-NEXT: movl %edi, -40(%rsp,%rsi,4) +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: arg_i32_v8i32: @@ -360,17 +342,11 @@ define <4 x i64> @arg_i64_v4i64(i64 %x, i32 %y) nounwind { ; SSE-LABEL: arg_i64_v4i64: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: movq %rsp, %rbp -; SSE-NEXT: andq $-32, %rsp -; SSE-NEXT: subq $64, %rsp ; SSE-NEXT: # kill: def $esi killed $esi def $rsi ; SSE-NEXT: andl $3, %esi -; SSE-NEXT: movq %rdi, (%rsp,%rsi,8) -; SSE-NEXT: movaps (%rsp), %xmm0 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movq %rbp, %rsp -; SSE-NEXT: popq %rbp +; SSE-NEXT: movq %rdi, -40(%rsp,%rsi,8) +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: arg_i64_v4i64: @@ -392,17 +368,11 @@ define <8 x float> @arg_f32_v8f32(float %x, i32 %y) nounwind { ; SSE-LABEL: arg_f32_v8f32: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: movq %rsp, %rbp -; SSE-NEXT: andq $-32, %rsp -; SSE-NEXT: subq $64, %rsp ; SSE-NEXT: # kill: def $edi killed $edi def $rdi ; SSE-NEXT: andl $7, %edi -; SSE-NEXT: movss %xmm0, (%rsp,%rdi,4) -; SSE-NEXT: movaps (%rsp), %xmm0 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movq %rbp, %rsp -; SSE-NEXT: popq %rbp +; SSE-NEXT: movss %xmm0, -40(%rsp,%rdi,4) +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: arg_f32_v8f32: @@ -422,17 +392,11 @@ define <4 x double> @arg_f64_v4f64(double %x, i32 %y) nounwind { ; SSE-LABEL: arg_f64_v4f64: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: movq %rsp, %rbp -; SSE-NEXT: andq $-32, %rsp -; SSE-NEXT: subq $64, %rsp ; SSE-NEXT: # kill: def $edi killed $edi def $rdi ; SSE-NEXT: andl $3, %edi -; SSE-NEXT: movsd %xmm0, (%rsp,%rdi,8) -; SSE-NEXT: movaps (%rsp), %xmm0 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movq %rbp, %rsp -; SSE-NEXT: popq %rbp +; SSE-NEXT: movsd %xmm0, -40(%rsp,%rdi,8) +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: arg_f64_v4f64: @@ -452,18 +416,12 @@ define <32 x i8> @load_i8_v32i8(i8* %p, i32 %y) nounwind { ; SSE-LABEL: load_i8_v32i8: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: movq %rsp, %rbp -; SSE-NEXT: andq $-32, %rsp -; SSE-NEXT: subq $64, %rsp ; SSE-NEXT: # kill: def $esi killed $esi def $rsi ; SSE-NEXT: movb (%rdi), %al ; SSE-NEXT: andl $31, %esi -; SSE-NEXT: movb %al, (%rsp,%rsi) -; SSE-NEXT: movaps (%rsp), %xmm0 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movq %rbp, %rsp -; SSE-NEXT: popq %rbp +; SSE-NEXT: movb %al, -40(%rsp,%rsi) +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i8_v32i8: @@ -487,18 +445,12 @@ define <16 x i16> @load_i16_v16i16(i16* %p, i32 %y) nounwind { ; SSE-LABEL: load_i16_v16i16: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: movq %rsp, %rbp -; SSE-NEXT: andq $-32, %rsp -; SSE-NEXT: subq $64, %rsp ; SSE-NEXT: # kill: def $esi killed $esi def $rsi ; SSE-NEXT: movzwl (%rdi), %eax ; SSE-NEXT: andl $15, %esi -; SSE-NEXT: movw %ax, (%rsp,%rsi,2) -; SSE-NEXT: movaps (%rsp), %xmm0 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movq %rbp, %rsp -; SSE-NEXT: popq %rbp +; SSE-NEXT: movw %ax, -40(%rsp,%rsi,2) +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i16_v16i16: @@ -522,18 +474,12 @@ define <8 x i32> @load_i32_v8i32(i32* %p, i32 %y) nounwind { ; SSE-LABEL: load_i32_v8i32: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: movq %rsp, %rbp -; SSE-NEXT: andq $-32, %rsp -; SSE-NEXT: subq $64, %rsp ; SSE-NEXT: # kill: def $esi killed $esi def $rsi ; SSE-NEXT: movl (%rdi), %eax ; SSE-NEXT: andl $7, %esi -; SSE-NEXT: movl %eax, (%rsp,%rsi,4) -; SSE-NEXT: movaps (%rsp), %xmm0 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movq %rbp, %rsp -; SSE-NEXT: popq %rbp +; SSE-NEXT: movl %eax, -40(%rsp,%rsi,4) +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: load_i32_v8i32: @@ -548,18 +494,12 @@ define <4 x i64> @load_i64_v4i64(i64* %p, i32 %y) nounwind { ; SSE-LABEL: load_i64_v4i64: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: movq %rsp, %rbp -; SSE-NEXT: andq $-32, %rsp -; SSE-NEXT: subq $64, %rsp ; SSE-NEXT: # kill: def $esi killed $esi def $rsi ; SSE-NEXT: movq (%rdi), %rax ; SSE-NEXT: andl $3, %esi -; SSE-NEXT: movq %rax, (%rsp,%rsi,8) -; SSE-NEXT: movaps (%rsp), %xmm0 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movq %rbp, %rsp -; SSE-NEXT: popq %rbp +; SSE-NEXT: movq %rax, -40(%rsp,%rsi,8) +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: load_i64_v4i64: @@ -574,18 +514,12 @@ define <8 x float> @load_f32_v8f32(float* %p, i32 %y) nounwind { ; SSE-LABEL: load_f32_v8f32: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: movq %rsp, %rbp -; SSE-NEXT: andq $-32, %rsp -; SSE-NEXT: subq $64, %rsp ; SSE-NEXT: # kill: def $esi killed $esi def $rsi ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: andl $7, %esi -; SSE-NEXT: movss %xmm0, (%rsp,%rsi,4) -; SSE-NEXT: movaps (%rsp), %xmm0 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movq %rbp, %rsp -; SSE-NEXT: popq %rbp +; SSE-NEXT: movss %xmm0, -40(%rsp,%rsi,4) +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: load_f32_v8f32: @@ -600,18 +534,12 @@ define <4 x double> @load_f64_v4f64(double* %p, i32 %y) nounwind { ; SSE-LABEL: load_f64_v4f64: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: movq %rsp, %rbp -; SSE-NEXT: andq $-32, %rsp -; SSE-NEXT: subq $64, %rsp ; SSE-NEXT: # kill: def $esi killed $esi def $rsi ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: andl $3, %esi -; SSE-NEXT: movsd %xmm0, (%rsp,%rsi,8) -; SSE-NEXT: movaps (%rsp), %xmm0 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movq %rbp, %rsp -; SSE-NEXT: popq %rbp +; SSE-NEXT: movsd %xmm0, -40(%rsp,%rsi,8) +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: load_f64_v4f64: Index: llvm/test/CodeGen/X86/pr31088.ll =================================================================== --- llvm/test/CodeGen/X86/pr31088.ll +++ llvm/test/CodeGen/X86/pr31088.ll @@ -66,7 +66,7 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp +; X86-NEXT: subl $64, %esp ; X86-NEXT: movzwl 8(%ebp), %esi ; X86-NEXT: movzwl 12(%ebp), %edi ; X86-NEXT: movzwl 20(%ebp), %ebx Index: llvm/test/CodeGen/X86/var-permute-128.ll =================================================================== --- llvm/test/CodeGen/X86/var-permute-128.ll +++ llvm/test/CodeGen/X86/var-permute-128.ll @@ -643,116 +643,112 @@ ; SSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8: ; SSE3: # %bb.0: ; SSE3-NEXT: pushq %rbp -; SSE3-NEXT: movq %rsp, %rbp ; SSE3-NEXT: pushq %r15 ; SSE3-NEXT: pushq %r14 ; SSE3-NEXT: pushq %r13 ; SSE3-NEXT: pushq %r12 ; SSE3-NEXT: pushq %rbx -; SSE3-NEXT: andq $-32, %rsp -; SSE3-NEXT: subq $608, %rsp # imm = 0x260 -; SSE3-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) +; SSE3-NEXT: subq $424, %rsp # imm = 0x1A8 +; SSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %edx -; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d -; SSE3-NEXT: andl $31, %r9d -; SSE3-NEXT: movzbl 64(%rsp,%r9), %ebx -; SSE3-NEXT: movd %ebx, %xmm8 -; SSE3-NEXT: andl $31, %eax -; SSE3-NEXT: movzbl 96(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm15 +; SSE3-NEXT: movaps %xmm0, (%rsp) +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d +; SSE3-NEXT: andl $31, %r8d +; SSE3-NEXT: movzbl -96(%rsp,%r8), %esi +; SSE3-NEXT: movd %esi, %xmm8 +; SSE3-NEXT: andl $31, %ebp +; SSE3-NEXT: movzbl -64(%rsp,%rbp), %esi +; SSE3-NEXT: movd %esi, %xmm15 ; SSE3-NEXT: andl $31, %edx -; SSE3-NEXT: movzbl 128(%rsp,%rdx), %eax -; SSE3-NEXT: movd %eax, %xmm9 +; SSE3-NEXT: movzbl -32(%rsp,%rdx), %edx +; SSE3-NEXT: movd %edx, %xmm9 ; SSE3-NEXT: andl $31, %ecx -; SSE3-NEXT: movzbl 160(%rsp,%rcx), %eax -; SSE3-NEXT: movd %eax, %xmm3 -; SSE3-NEXT: andl $31, %esi -; SSE3-NEXT: movzbl 192(%rsp,%rsi), %eax +; SSE3-NEXT: movzbl (%rsp,%rcx), %ecx +; SSE3-NEXT: movd %ecx, %xmm3 +; SSE3-NEXT: andl $31, %eax +; SSE3-NEXT: movzbl 32(%rsp,%rax), %eax ; SSE3-NEXT: movd %eax, %xmm10 ; SSE3-NEXT: andl $31, %edi -; SSE3-NEXT: movzbl 224(%rsp,%rdi), %eax +; SSE3-NEXT: movzbl 64(%rsp,%rdi), %eax ; SSE3-NEXT: movd %eax, %xmm7 -; SSE3-NEXT: andl $31, %r8d -; SSE3-NEXT: movzbl 256(%rsp,%r8), %eax +; SSE3-NEXT: andl $31, %ebx +; SSE3-NEXT: movzbl 96(%rsp,%rbx), %eax ; SSE3-NEXT: movd %eax, %xmm11 -; SSE3-NEXT: andl $31, %r10d -; SSE3-NEXT: movzbl 288(%rsp,%r10), %eax +; SSE3-NEXT: andl $31, %r9d +; SSE3-NEXT: movzbl 128(%rsp,%r9), %eax ; SSE3-NEXT: movd %eax, %xmm6 ; SSE3-NEXT: andl $31, %r13d -; SSE3-NEXT: movzbl 320(%rsp,%r13), %eax +; SSE3-NEXT: movzbl 160(%rsp,%r13), %eax ; SSE3-NEXT: movd %eax, %xmm12 ; SSE3-NEXT: andl $31, %r12d -; SSE3-NEXT: movzbl 352(%rsp,%r12), %eax +; SSE3-NEXT: movzbl 192(%rsp,%r12), %eax ; SSE3-NEXT: movd %eax, %xmm5 ; SSE3-NEXT: andl $31, %r15d -; SSE3-NEXT: movzbl 384(%rsp,%r15), %eax +; SSE3-NEXT: movzbl 224(%rsp,%r15), %eax ; SSE3-NEXT: movd %eax, %xmm13 ; SSE3-NEXT: andl $31, %r14d -; SSE3-NEXT: movzbl 416(%rsp,%r14), %eax +; SSE3-NEXT: movzbl 256(%rsp,%r14), %eax ; SSE3-NEXT: movd %eax, %xmm4 ; SSE3-NEXT: andl $31, %r11d -; SSE3-NEXT: movzbl 448(%rsp,%r11), %eax +; SSE3-NEXT: movzbl 288(%rsp,%r11), %eax ; SSE3-NEXT: movd %eax, %xmm14 -; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE3-NEXT: andl $31, %eax -; SSE3-NEXT: movzbl 480(%rsp,%rax), %eax +; SSE3-NEXT: andl $31, %r10d +; SSE3-NEXT: movzbl 320(%rsp,%r10), %eax ; SSE3-NEXT: movd %eax, %xmm1 ; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE3-NEXT: andl $31, %eax -; SSE3-NEXT: movzbl 512(%rsp,%rax), %eax +; SSE3-NEXT: movzbl 352(%rsp,%rax), %eax ; SSE3-NEXT: movd %eax, %xmm2 ; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE3-NEXT: andl $31, %eax -; SSE3-NEXT: movzbl 544(%rsp,%rax), %eax +; SSE3-NEXT: movzbl 384(%rsp,%rax), %eax ; SSE3-NEXT: movd %eax, %xmm0 ; SSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] @@ -769,7 +765,7 @@ ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; SSE3-NEXT: leaq -40(%rbp), %rsp +; SSE3-NEXT: addq $424, %rsp # imm = 0x1A8 ; SSE3-NEXT: popq %rbx ; SSE3-NEXT: popq %r12 ; SSE3-NEXT: popq %r13 @@ -781,116 +777,112 @@ ; SSSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8: ; SSSE3: # %bb.0: ; SSSE3-NEXT: pushq %rbp -; SSSE3-NEXT: movq %rsp, %rbp ; SSSE3-NEXT: pushq %r15 ; SSSE3-NEXT: pushq %r14 ; SSSE3-NEXT: pushq %r13 ; SSSE3-NEXT: pushq %r12 ; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: andq $-32, %rsp -; SSSE3-NEXT: subq $608, %rsp # imm = 0x260 -; SSSE3-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp) +; SSSE3-NEXT: subq $424, %rsp # imm = 0x1A8 +; SSSE3-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d -; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d -; SSSE3-NEXT: andl $31, %r9d -; SSSE3-NEXT: movzbl 64(%rsp,%r9), %ebx -; SSSE3-NEXT: movd %ebx, %xmm8 -; SSSE3-NEXT: andl $31, %eax -; SSSE3-NEXT: movzbl 96(%rsp,%rax), %eax -; SSSE3-NEXT: movd %eax, %xmm15 +; SSSE3-NEXT: movaps %xmm0, (%rsp) +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp +; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d +; SSSE3-NEXT: andl $31, %r8d +; SSSE3-NEXT: movzbl -96(%rsp,%r8), %esi +; SSSE3-NEXT: movd %esi, %xmm8 +; SSSE3-NEXT: andl $31, %ebp +; SSSE3-NEXT: movzbl -64(%rsp,%rbp), %esi +; SSSE3-NEXT: movd %esi, %xmm15 ; SSSE3-NEXT: andl $31, %edx -; SSSE3-NEXT: movzbl 128(%rsp,%rdx), %eax -; SSSE3-NEXT: movd %eax, %xmm9 +; SSSE3-NEXT: movzbl -32(%rsp,%rdx), %edx +; SSSE3-NEXT: movd %edx, %xmm9 ; SSSE3-NEXT: andl $31, %ecx -; SSSE3-NEXT: movzbl 160(%rsp,%rcx), %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: andl $31, %esi -; SSSE3-NEXT: movzbl 192(%rsp,%rsi), %eax +; SSSE3-NEXT: movzbl (%rsp,%rcx), %ecx +; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: andl $31, %eax +; SSSE3-NEXT: movzbl 32(%rsp,%rax), %eax ; SSSE3-NEXT: movd %eax, %xmm10 ; SSSE3-NEXT: andl $31, %edi -; SSSE3-NEXT: movzbl 224(%rsp,%rdi), %eax +; SSSE3-NEXT: movzbl 64(%rsp,%rdi), %eax ; SSSE3-NEXT: movd %eax, %xmm7 -; SSSE3-NEXT: andl $31, %r8d -; SSSE3-NEXT: movzbl 256(%rsp,%r8), %eax +; SSSE3-NEXT: andl $31, %ebx +; SSSE3-NEXT: movzbl 96(%rsp,%rbx), %eax ; SSSE3-NEXT: movd %eax, %xmm11 -; SSSE3-NEXT: andl $31, %r10d -; SSSE3-NEXT: movzbl 288(%rsp,%r10), %eax +; SSSE3-NEXT: andl $31, %r9d +; SSSE3-NEXT: movzbl 128(%rsp,%r9), %eax ; SSSE3-NEXT: movd %eax, %xmm6 ; SSSE3-NEXT: andl $31, %r13d -; SSSE3-NEXT: movzbl 320(%rsp,%r13), %eax +; SSSE3-NEXT: movzbl 160(%rsp,%r13), %eax ; SSSE3-NEXT: movd %eax, %xmm12 ; SSSE3-NEXT: andl $31, %r12d -; SSSE3-NEXT: movzbl 352(%rsp,%r12), %eax +; SSSE3-NEXT: movzbl 192(%rsp,%r12), %eax ; SSSE3-NEXT: movd %eax, %xmm5 ; SSSE3-NEXT: andl $31, %r15d -; SSSE3-NEXT: movzbl 384(%rsp,%r15), %eax +; SSSE3-NEXT: movzbl 224(%rsp,%r15), %eax ; SSSE3-NEXT: movd %eax, %xmm13 ; SSSE3-NEXT: andl $31, %r14d -; SSSE3-NEXT: movzbl 416(%rsp,%r14), %eax +; SSSE3-NEXT: movzbl 256(%rsp,%r14), %eax ; SSSE3-NEXT: movd %eax, %xmm4 ; SSSE3-NEXT: andl $31, %r11d -; SSSE3-NEXT: movzbl 448(%rsp,%r11), %eax +; SSSE3-NEXT: movzbl 288(%rsp,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm14 -; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSSE3-NEXT: andl $31, %eax -; SSSE3-NEXT: movzbl 480(%rsp,%rax), %eax +; SSSE3-NEXT: andl $31, %r10d +; SSSE3-NEXT: movzbl 320(%rsp,%r10), %eax ; SSSE3-NEXT: movd %eax, %xmm1 ; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSSE3-NEXT: andl $31, %eax -; SSSE3-NEXT: movzbl 512(%rsp,%rax), %eax +; SSSE3-NEXT: movzbl 352(%rsp,%rax), %eax ; SSSE3-NEXT: movd %eax, %xmm2 ; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSSE3-NEXT: andl $31, %eax -; SSSE3-NEXT: movzbl 544(%rsp,%rax), %eax +; SSSE3-NEXT: movzbl 384(%rsp,%rax), %eax ; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] @@ -907,7 +899,7 @@ ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; SSSE3-NEXT: leaq -40(%rbp), %rsp +; SSSE3-NEXT: addq $424, %rsp # imm = 0x1A8 ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 ; SSSE3-NEXT: popq %r13 @@ -918,10 +910,7 @@ ; ; SSE41-LABEL: var_shuffle_v16i8_from_v32i8_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pushq %rbp -; SSE41-NEXT: movq %rsp, %rbp -; SSE41-NEXT: andq $-32, %rsp -; SSE41-NEXT: subq $544, %rsp # imm = 0x220 +; SSE41-NEXT: subq $392, %rsp # imm = 0x188 ; SSE41-NEXT: movd %xmm2, %eax ; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) @@ -947,64 +936,63 @@ ; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; SSE41-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE41-NEXT: movaps %xmm0, (%rsp) -; SSE41-NEXT: movzbl 480(%rsp,%rax), %eax +; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE41-NEXT: movzbl 352(%rsp,%rax), %eax ; SSE41-NEXT: movd %eax, %xmm0 ; SSE41-NEXT: pextrb $1, %xmm2, %eax ; SSE41-NEXT: andl $31, %eax -; SSE41-NEXT: pinsrb $1, 448(%rsp,%rax), %xmm0 +; SSE41-NEXT: pinsrb $1, 320(%rsp,%rax), %xmm0 ; SSE41-NEXT: pextrb $2, %xmm2, %eax ; SSE41-NEXT: andl $31, %eax -; SSE41-NEXT: pinsrb $2, 416(%rsp,%rax), %xmm0 +; SSE41-NEXT: pinsrb $2, 288(%rsp,%rax), %xmm0 ; SSE41-NEXT: pextrb $3, %xmm2, %eax ; SSE41-NEXT: andl $31, %eax -; SSE41-NEXT: pinsrb $3, 384(%rsp,%rax), %xmm0 +; SSE41-NEXT: pinsrb $3, 256(%rsp,%rax), %xmm0 ; SSE41-NEXT: pextrb $4, %xmm2, %eax ; SSE41-NEXT: andl $31, %eax -; SSE41-NEXT: pinsrb $4, 352(%rsp,%rax), %xmm0 +; SSE41-NEXT: pinsrb $4, 224(%rsp,%rax), %xmm0 ; SSE41-NEXT: pextrb $5, %xmm2, %eax ; SSE41-NEXT: andl $31, %eax -; SSE41-NEXT: pinsrb $5, 320(%rsp,%rax), %xmm0 +; SSE41-NEXT: pinsrb $5, 192(%rsp,%rax), %xmm0 ; SSE41-NEXT: pextrb $6, %xmm2, %eax ; SSE41-NEXT: andl $31, %eax -; SSE41-NEXT: pinsrb $6, 288(%rsp,%rax), %xmm0 +; SSE41-NEXT: pinsrb $6, 160(%rsp,%rax), %xmm0 ; SSE41-NEXT: pextrb $7, %xmm2, %eax ; SSE41-NEXT: andl $31, %eax -; SSE41-NEXT: pinsrb $7, 256(%rsp,%rax), %xmm0 +; SSE41-NEXT: pinsrb $7, 128(%rsp,%rax), %xmm0 ; SSE41-NEXT: pextrb $8, %xmm2, %eax ; SSE41-NEXT: andl $31, %eax -; SSE41-NEXT: pinsrb $8, 224(%rsp,%rax), %xmm0 +; SSE41-NEXT: pinsrb $8, 96(%rsp,%rax), %xmm0 ; SSE41-NEXT: pextrb $9, %xmm2, %eax ; SSE41-NEXT: andl $31, %eax -; SSE41-NEXT: pinsrb $9, 192(%rsp,%rax), %xmm0 +; SSE41-NEXT: pinsrb $9, 64(%rsp,%rax), %xmm0 ; SSE41-NEXT: pextrb $10, %xmm2, %eax ; SSE41-NEXT: andl $31, %eax -; SSE41-NEXT: pinsrb $10, 160(%rsp,%rax), %xmm0 +; SSE41-NEXT: pinsrb $10, 32(%rsp,%rax), %xmm0 ; SSE41-NEXT: pextrb $11, %xmm2, %eax ; SSE41-NEXT: andl $31, %eax -; SSE41-NEXT: pinsrb $11, 128(%rsp,%rax), %xmm0 +; SSE41-NEXT: pinsrb $11, (%rsp,%rax), %xmm0 ; SSE41-NEXT: pextrb $12, %xmm2, %eax ; SSE41-NEXT: andl $31, %eax -; SSE41-NEXT: pinsrb $12, 96(%rsp,%rax), %xmm0 +; SSE41-NEXT: pinsrb $12, -32(%rsp,%rax), %xmm0 ; SSE41-NEXT: pextrb $13, %xmm2, %eax ; SSE41-NEXT: andl $31, %eax -; SSE41-NEXT: pinsrb $13, 64(%rsp,%rax), %xmm0 +; SSE41-NEXT: pinsrb $13, -64(%rsp,%rax), %xmm0 ; SSE41-NEXT: pextrb $14, %xmm2, %eax ; SSE41-NEXT: andl $31, %eax -; SSE41-NEXT: pinsrb $14, 32(%rsp,%rax), %xmm0 +; SSE41-NEXT: pinsrb $14, -96(%rsp,%rax), %xmm0 ; SSE41-NEXT: pextrb $15, %xmm2, %eax ; SSE41-NEXT: andl $31, %eax -; SSE41-NEXT: pinsrb $15, (%rsp,%rax), %xmm0 -; SSE41-NEXT: movq %rbp, %rsp -; SSE41-NEXT: popq %rbp +; SSE41-NEXT: pinsrb $15, -128(%rsp,%rax), %xmm0 +; SSE41-NEXT: addq $392, %rsp # imm = 0x188 ; SSE41-NEXT: retq ; ; XOP-LABEL: var_shuffle_v16i8_from_v32i8_v16i8: Index: llvm/test/CodeGen/X86/vec_fneg.ll =================================================================== --- llvm/test/CodeGen/X86/vec_fneg.ll +++ llvm/test/CodeGen/X86/vec_fneg.ll @@ -121,7 +121,7 @@ ; X32-SSE1-NEXT: pushl %ebp ; X32-SSE1-NEXT: movl %esp, %ebp ; X32-SSE1-NEXT: andl $-16, %esp -; X32-SSE1-NEXT: subl $32, %esp +; X32-SSE1-NEXT: subl $16, %esp ; X32-SSE1-NEXT: movl $-2147483648, %eax # imm = 0x80000000 ; X32-SSE1-NEXT: movl 12(%ebp), %ecx ; X32-SSE1-NEXT: xorl %eax, %ecx Index: llvm/test/CodeGen/X86/vec_insert-4.ll =================================================================== --- llvm/test/CodeGen/X86/vec_insert-4.ll +++ llvm/test/CodeGen/X86/vec_insert-4.ll @@ -5,36 +5,26 @@ define <8 x float> @f(<8 x float> %a, i32 %b) nounwind { ; X32-LABEL: f: ; X32: ## %bb.0: ## %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: andl $-32, %esp -; X32-NEXT: subl $64, %esp -; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: subl $44, %esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: andl $7, %eax ; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X32-NEXT: movaps %xmm0, (%esp) ; X32-NEXT: movl $1084227584, (%esp,%eax,4) ## imm = 0x40A00000 ; X32-NEXT: movaps (%esp), %xmm0 ; X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp +; X32-NEXT: addl $44, %esp ; X32-NEXT: retl ; ; X64-LABEL: f: ; X64: ## %bb.0: ## %entry -; X64-NEXT: pushq %rbp -; X64-NEXT: movq %rsp, %rbp -; X64-NEXT: andq $-32, %rsp -; X64-NEXT: subq $64, %rsp ; X64-NEXT: ## kill: def $edi killed $edi def $rdi -; X64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; X64-NEXT: movaps %xmm0, (%rsp) +; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: andl $7, %edi -; X64-NEXT: movl $1084227584, (%rsp,%rdi,4) ## imm = 0x40A00000 -; X64-NEXT: movaps (%rsp), %xmm0 -; X64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; X64-NEXT: movq %rbp, %rsp -; X64-NEXT: popq %rbp +; X64-NEXT: movl $1084227584, -40(%rsp,%rdi,4) ## imm = 0x40A00000 +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 +; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 ; X64-NEXT: retq entry: %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b Index: llvm/test/CodeGen/X86/vector-extend-inreg.ll =================================================================== --- llvm/test/CodeGen/X86/vector-extend-inreg.ll +++ llvm/test/CodeGen/X86/vector-extend-inreg.ll @@ -9,8 +9,8 @@ ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: pushl %ebp ; X32-SSE-NEXT: movl %esp, %ebp -; X32-SSE-NEXT: andl $-128, %esp -; X32-SSE-NEXT: subl $384, %esp # imm = 0x180 +; X32-SSE-NEXT: andl $-16, %esp +; X32-SSE-NEXT: subl $272, %esp # imm = 0x110 ; X32-SSE-NEXT: movl 88(%ebp), %ecx ; X32-SSE-NEXT: movdqa 72(%ebp), %xmm0 ; X32-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero @@ -43,33 +43,29 @@ ; ; X64-SSE-LABEL: extract_any_extend_vector_inreg_v16i64: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: pushq %rbp -; X64-SSE-NEXT: movq %rsp, %rbp -; X64-SSE-NEXT: andq $-128, %rsp -; X64-SSE-NEXT: subq $256, %rsp # imm = 0x100 +; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: # kill: def $edi killed $edi def $rdi ; X64-SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; X64-SSE-NEXT: xorps %xmm0, %xmm0 -; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movaps %xmm0, (%rsp) -; X64-SSE-NEXT: movdqa %xmm7, {{[0-9]+}}(%rsp) +; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) ; X64-SSE-NEXT: andl $15, %edi -; X64-SSE-NEXT: movq (%rsp,%rdi,8), %rax -; X64-SSE-NEXT: movq %rbp, %rsp -; X64-SSE-NEXT: popq %rbp +; X64-SSE-NEXT: movq -128(%rsp,%rdi,8), %rax +; X64-SSE-NEXT: popq %rcx ; X64-SSE-NEXT: retq ; ; X32-AVX-LABEL: extract_any_extend_vector_inreg_v16i64: ; X32-AVX: # %bb.0: ; X32-AVX-NEXT: pushl %ebp ; X32-AVX-NEXT: movl %esp, %ebp -; X32-AVX-NEXT: andl $-128, %esp -; X32-AVX-NEXT: subl $384, %esp # imm = 0x180 +; X32-AVX-NEXT: andl $-32, %esp +; X32-AVX-NEXT: subl $288, %esp # imm = 0x120 ; X32-AVX-NEXT: movl 40(%ebp), %ecx ; X32-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 @@ -96,8 +92,8 @@ ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: pushq %rbp ; X64-AVX-NEXT: movq %rsp, %rbp -; X64-AVX-NEXT: andq $-128, %rsp -; X64-AVX-NEXT: subq $256, %rsp # imm = 0x100 +; X64-AVX-NEXT: andq $-32, %rsp +; X64-AVX-NEXT: subq $160, %rsp ; X64-AVX-NEXT: # kill: def $edi killed $edi def $rdi ; X64-AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm3[3,1,2,3] ; X64-AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero