diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp --- a/llvm/lib/CodeGen/VirtRegMap.cpp +++ b/llvm/lib/CodeGen/VirtRegMap.cpp @@ -30,6 +30,7 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -94,6 +95,12 @@ unsigned VirtRegMap::createSpillSlot(const TargetRegisterClass *RC) { unsigned Size = TRI->getSpillSize(*RC); Align Alignment = TRI->getSpillAlign(*RC); + // Use the register class's preferred spill alignment only if the stack can still be realigned to it; otherwise clamp to the current stack alignment. + auto &ST = MF->getSubtarget(); + Align CurrentAlign = ST.getFrameLowering()->getStackAlign(); + if (Alignment > CurrentAlign && !ST.getRegisterInfo()->canRealignStack(*MF)) { + Alignment = CurrentAlign; + } int SS = MF->getFrameInfo().CreateSpillStackObject(Size, Alignment); ++NumSpillSlots; return SS; diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1649,20 +1649,20 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #88 -; CHECK-NEXT: sub sp, #88 +; CHECK-NEXT: .pad #64 +; CHECK-NEXT: sub sp, #64 ; CHECK-NEXT: ldrd r12, r10, [r0] ; CHECK-NEXT: @ implicit-def: $s2 ; CHECK-NEXT: and r7, r3, #3 ; CHECK-NEXT: ldr.w r9, [r0, #8] ; CHECK-NEXT: lsrs r0, r3, #2 -; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: str r2, [sp, #60] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp, #44] @ 4-byte Spill ; CHECK-NEXT: b .LBB19_3 ; CHECK-NEXT: .LBB19_1: @ in Loop: Header=BB19_3 Depth=1 ; CHECK-NEXT: vmov.f32 s14, s7 -; CHECK-NEXT: ldr r2, [sp, #60] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload ; CHECK-NEXT: vmov.f32 s0, s10 ; CHECK-NEXT: vmov.f32 s7, s6 ; CHECK-NEXT: .LBB19_2: @ %if.end69 @@ -1681,14 +1681,14 @@ ; CHECK-NEXT: @ Child Loop BB19_5 Depth 2 ; CHECK-NEXT: vldr s7, [r10, #8] ; CHECK-NEXT: mov r5, r2 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload ; CHECK-NEXT: vldr s8, [r10] ; CHECK-NEXT: vldr s10, [r10, #4] ; CHECK-NEXT: vldr s6, [r10, #12] ; CHECK-NEXT: wls lr, r0, .LBB19_6 ; CHECK-NEXT: @ %bb.4: @ %while.body.lr.ph ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: ldr r5, [sp, #60] @ 4-byte Reload +; CHECK-NEXT: ldr r5, [sp, #44] @ 4-byte Reload ; CHECK-NEXT: .LBB19_5: @ %while.body ; CHECK-NEXT: @ Parent Loop BB19_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -1700,7 +1700,7 @@ ; CHECK-NEXT: vmov r7, s7 ; CHECK-NEXT: vmov r11, s6 ; CHECK-NEXT: vldrw.u32 q1, [r9] -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vmov r8, s8 ; CHECK-NEXT: vldrw.u32 q0, [r9, #16] ; CHECK-NEXT: ldr r6, [r1, #4] @@ -1717,7 +1717,7 @@ ; CHECK-NEXT: vfma.f32 q1, q6, r4 ; CHECK-NEXT: vldrw.u32 q4, [r9, #96] ; CHECK-NEXT: vfma.f32 q1, q5, r3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vfma.f32 q1, q4, r7 ; CHECK-NEXT: vfma.f32 q1, q0, r11 ; 
CHECK-NEXT: vmov.f32 s2, s8 @@ -1725,7 +1725,7 @@ ; CHECK-NEXT: le lr, .LBB19_5 ; CHECK-NEXT: .LBB19_6: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: beq .LBB19_1 ; CHECK-NEXT: @ %bb.7: @ %if.then @@ -1742,28 +1742,28 @@ ; CHECK-NEXT: vmov r6, s3 ; CHECK-NEXT: vmul.f32 q3, q3, r6 ; CHECK-NEXT: vmov r6, s1 -; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q2, [sp, #24] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q2, [r9, #112] ; CHECK-NEXT: vldrw.u32 q5, [r9, #48] ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q2, [r9, #80] ; CHECK-NEXT: vldrw.u32 q7, [r9, #64] ; CHECK-NEXT: vmov r3, s24 -; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q2, [sp, #8] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q2, [r9, #16] ; CHECK-NEXT: vmov r2, s7 ; CHECK-NEXT: cmp r7, #1 ; CHECK-NEXT: vfma.f32 q3, q2, r6 -; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp, #8] @ 16-byte Reload ; CHECK-NEXT: vfma.f32 q3, q4, r4 ; CHECK-NEXT: vmov lr, s6 ; CHECK-NEXT: vfma.f32 q3, q5, r3 ; CHECK-NEXT: vfma.f32 q3, q7, r0 ; CHECK-NEXT: vfma.f32 q3, q2, r1 -; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp, #24] @ 16-byte Reload ; CHECK-NEXT: vfma.f32 q3, q2, r2 -; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vfma.f32 q3, q2, lr ; CHECK-NEXT: bne .LBB19_9 ; CHECK-NEXT: @ %bb.8: @ %if.then58 @@ -1786,18 +1786,18 @@ ; CHECK-NEXT: .LBB19_11: @ %if.end69 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 ; CHECK-NEXT: vmov.f32 s2, s3 -; CHECK-NEXT: ldr r2, [sp, #60] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload ; CHECK-NEXT: b .LBB19_2 ; CHECK-NEXT: .LBB19_12: @ %if.else64 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 ; CHECK-NEXT: vmov.f32 s7, s13 -; CHECK-NEXT: ldr r2, [sp, #60] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload ; CHECK-NEXT: vmov.f32 s2, s3 ; CHECK-NEXT: vstr s14, [r5, #8] ; CHECK-NEXT: vmov.f32 s8, s1 ; CHECK-NEXT: b .LBB19_2 ; CHECK-NEXT: .LBB19_13: @ %do.end -; CHECK-NEXT: add sp, #88 +; CHECK-NEXT: add sp, #64 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -676,20 +676,20 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #72 -; CHECK-NEXT: sub sp, #72 +; CHECK-NEXT: .pad #64 +; CHECK-NEXT: sub sp, #64 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: str r1, [sp, #64] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #56] @ 4-byte Spill ; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: str r2, [sp, #68] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp, #60] @ 4-byte Spill ; CHECK-NEXT: blt.w .LBB12_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: ldr r1, [sp, #68] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload ; CHECK-NEXT: adr r6, .LCPI12_2 ; CHECK-NEXT: vldrw.u32 q1, [r6] ; CHECK-NEXT: movs r7, 
#1 ; CHECK-NEXT: bic r1, r1, #7 -; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #52] @ 4-byte Spill ; CHECK-NEXT: sub.w r3, r1, #8 ; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vmov.i16 q0, #0x18 @@ -697,16 +697,16 @@ ; CHECK-NEXT: adr r3, .LCPI12_0 ; CHECK-NEXT: vldrw.u32 q1, [r3] ; CHECK-NEXT: adr r7, .LCPI12_1 -; CHECK-NEXT: str r1, [sp, #56] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #48] @ 4-byte Spill ; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q1, [r7] ; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB12_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB12_3 Depth 2 -; CHECK-NEXT: ldr r1, [sp, #56] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #48] @ 4-byte Reload ; CHECK-NEXT: dls lr, r1 -; CHECK-NEXT: ldr.w r12, [sp, #64] @ 4-byte Reload +; CHECK-NEXT: ldr.w r12, [sp, #56] @ 4-byte Reload ; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload @@ -836,12 +836,12 @@ ; CHECK-NEXT: le lr, .LBB12_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB12_2 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #68] @ 4-byte Reload -; CHECK-NEXT: ldr r2, [sp, #60] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #52] @ 4-byte Reload ; CHECK-NEXT: cmp r2, r1 ; CHECK-NEXT: bne.w .LBB12_2 ; CHECK-NEXT: .LBB12_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #72 +; CHECK-NEXT: add sp, #64 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -924,10 +924,10 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #328 -; CHECK-NEXT: sub sp, #328 +; CHECK-NEXT: .pad #312 +; CHECK-NEXT: sub sp, #312 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: str r1, [sp, #124] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #116] @ 4-byte Spill ; CHECK-NEXT: blt.w .LBB13_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader ; CHECK-NEXT: adr r1, .LCPI13_0 @@ -964,44 +964,44 @@ ; CHECK-NEXT: adr r1, .LCPI13_2 ; CHECK-NEXT: vldrw.u32 q3, [r1] ; CHECK-NEXT: adr r1, .LCPI13_10 -; CHECK-NEXT: vstrw.32 q6, [sp, #288] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q6, [sp, #280] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #304] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q3, [r1] ; CHECK-NEXT: adr r1, .LCPI13_11 -; CHECK-NEXT: ldr.w r8, [sp, #124] @ 4-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #256] @ 16-byte Spill +; CHECK-NEXT: ldr.w r8, [sp, #116] @ 4-byte Reload +; CHECK-NEXT: vstrw.32 q3, [sp, #248] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q6, [sp, #272] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q6, [sp, #264] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #224] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q7, [r1] ; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #208] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q3, [sp, #200] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload ; 
CHECK-NEXT: mov r11, r10 -; CHECK-NEXT: vstrw.32 q6, [sp, #240] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q3, [sp, #192] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q3, [sp, #184] @ 16-byte Spill ; CHECK-NEXT: .LBB13_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB13_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vadd.i32 q4, q1, r0 -; CHECK-NEXT: vstrw.32 q7, [sp, #144] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q7, [sp, #136] @ 16-byte Spill ; CHECK-NEXT: vmov r1, lr, d8 ; CHECK-NEXT: vadd.i32 q7, q7, r0 ; CHECK-NEXT: vmov r5, r4, d15 ; CHECK-NEXT: vadd.i32 q6, q0, r0 ; CHECK-NEXT: vmov r6, r7, d13 -; CHECK-NEXT: vstrw.32 q1, [sp, #160] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #304] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [sp, #176] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q1, [sp, #152] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [sp, #168] @ 16-byte Spill ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: vmov q3, q5 -; CHECK-NEXT: vstrw.32 q1, [sp, #304] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q1, [sp, #296] @ 16-byte Spill ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vldrw.u32 q3, [sp, #224] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q5, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #216] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q5, [sp, #120] @ 16-byte Spill ; CHECK-NEXT: subs.w r11, r11, #16 ; CHECK-NEXT: ldrb.w r9, [r1] ; CHECK-NEXT: vmov r1, r3, d14 @@ -1028,14 +1028,14 @@ ; CHECK-NEXT: vmov.8 q6[5], r7 ; CHECK-NEXT: ldrb r4, [r1] ; CHECK-NEXT: vmov r1, r5, d3 -; CHECK-NEXT: vldrw.u32 q1, [sp, #256] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q1, [sp, #248] @ 16-byte Reload ; CHECK-NEXT: vadd.i32 q0, q1, r0 -; CHECK-NEXT: vstrw.32 q1, [sp, #256] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #240] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q1, [sp, #248] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [sp, #232] @ 16-byte Reload ; CHECK-NEXT: ldrb.w r12, [r1] ; CHECK-NEXT: vmov r1, r3, d9 ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vldrw.u32 q4, [sp, #192] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp, #184] @ 16-byte Reload ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r3, [r3] ; CHECK-NEXT: vmov.8 q6[6], r1 @@ -1045,9 +1045,9 @@ ; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: vmov.8 q7[4], r1 ; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #272] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #264] @ 16-byte Reload ; CHECK-NEXT: vmov.8 q7[5], r7 -; CHECK-NEXT: vstrw.32 q0, [sp, #272] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #264] @ 16-byte Spill ; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r3, [r3] @@ -1056,9 +1056,9 @@ ; CHECK-NEXT: vmov r7, r6, d0 ; CHECK-NEXT: vmov.8 q7[7], r3 ; CHECK-NEXT: vmov r3, lr, d1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #288] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #280] @ 16-byte Reload ; CHECK-NEXT: vmov.8 q7[8], r1 -; CHECK-NEXT: vstrw.32 q0, [sp, #288] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #280] @ 16-byte Spill ; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vmov.8 q7[9], r4 ; CHECK-NEXT: vmov r4, r1, d0 @@ -1073,7 +1073,7 @@ ; CHECK-NEXT: vmov r5, r4, d1 ; CHECK-NEXT: vmov.8 q6[9], r1 ; CHECK-NEXT: vadd.i32 q0, q5, r0 -; CHECK-NEXT: vldrw.u32 q5, [sp, #208] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [sp, #200] @ 16-byte Reload ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: vmov.8 q6[10], r5 @@ 
-1089,11 +1089,11 @@ ; CHECK-NEXT: vmov r1, r3, d1 ; CHECK-NEXT: vadd.i32 q0, q1, r0 ; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vstrw.32 q1, [sp, #240] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #256] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q1, [sp, #232] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [sp, #248] @ 16-byte Reload ; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vstrw.32 q1, [sp, #256] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #160] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q1, [sp, #248] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [sp, #152] @ 16-byte Reload ; CHECK-NEXT: vadd.i32 q1, q1, q2 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[14], r1 @@ -1110,16 +1110,16 @@ ; CHECK-NEXT: vmov r1, r3, d1 ; CHECK-NEXT: vadd.i32 q0, q3, r0 ; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #224] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #304] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #296] @ 16-byte Reload ; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #304] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #288] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #280] @ 16-byte Reload ; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #288] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #272] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q3, [sp, #280] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #264] @ 16-byte Reload ; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #272] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q3, [sp, #264] @ 16-byte Spill ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[2], r1 ; CHECK-NEXT: ldrb r1, [r3] @@ -1132,8 +1132,8 @@ ; CHECK-NEXT: vmov r1, r3, d1 ; CHECK-NEXT: vadd.i32 q0, q5, r0 ; CHECK-NEXT: vadd.i32 q5, q5, q2 -; CHECK-NEXT: vstrw.32 q5, [sp, #208] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q5, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q5, [sp, #200] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q5, [sp, #120] @ 16-byte Reload ; CHECK-NEXT: vadd.i32 q5, q5, q2 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[6], r1 @@ -1147,7 +1147,7 @@ ; CHECK-NEXT: vmov r1, r3, d1 ; CHECK-NEXT: vadd.i32 q0, q4, r0 ; CHECK-NEXT: vadd.i32 q4, q4, q2 -; CHECK-NEXT: vstrw.32 q4, [sp, #192] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q4, [sp, #184] @ 16-byte Spill ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[10], r1 ; CHECK-NEXT: ldrb r1, [r3] @@ -1163,9 +1163,9 @@ ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[15], r1 ; CHECK-NEXT: vadd.i8 q0, q6, q7 -; CHECK-NEXT: vldrw.u32 q7, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q7, [sp, #136] @ 16-byte Reload ; CHECK-NEXT: vstrb.8 q0, [r8], #16 -; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #168] @ 16-byte Reload ; CHECK-NEXT: vadd.i32 q7, q7, q2 ; CHECK-NEXT: vadd.i32 q0, q0, q2 ; CHECK-NEXT: bne.w .LBB13_3 @@ -1174,7 +1174,7 @@ ; CHECK-NEXT: cmp r10, r2 ; CHECK-NEXT: bne.w .LBB13_2 ; CHECK-NEXT: .LBB13_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #328 +; CHECK-NEXT: add sp, #312 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ 
b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -444,9 +444,9 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: ldrd r9, r12, [sp, #128] +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: ldrd r9, r12, [sp, #120] ; CHECK-NEXT: sub.w r7, r12, #1 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: mov.w r8, #0 @@ -506,7 +506,7 @@ ; CHECK-NEXT: cmp r8, r3 ; CHECK-NEXT: bne .LBB9_1 ; CHECK-NEXT: @ %bb.6: @ %for.end25 -; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-multivec-spill.ll b/llvm/test/CodeGen/Thumb2/mve-multivec-spill.ll --- a/llvm/test/CodeGen/Thumb2/mve-multivec-spill.ll +++ b/llvm/test/CodeGen/Thumb2/mve-multivec-spill.ll @@ -10,8 +10,8 @@ ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #112 -; CHECK-NEXT: sub sp, #112 +; CHECK-NEXT: .pad #96 +; CHECK-NEXT: sub sp, #96 ; CHECK-NEXT: vld20.32 {q0, q1}, [r0] ; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: add.w lr, sp, #64 @@ -49,7 +49,7 @@ ; CHECK-NEXT: vldmia lr, {d2, d3, d4, d5} @ 32-byte Reload ; CHECK-NEXT: vstrw.32 q2, [r4, #16] ; CHECK-NEXT: vstrw.32 q1, [r4] -; CHECK-NEXT: add sp, #112 +; CHECK-NEXT: add sp, #96 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll --- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll +++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll @@ -12,8 +12,8 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: adr.w r8, .LCPI0_0 ; CHECK-NEXT: adr.w r9, .LCPI0_1 ; CHECK-NEXT: vldrw.u32 q6, [r8] @@ -149,8 +149,8 @@ ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #416 -; CHECK-NEXT: sub sp, #416 +; CHECK-NEXT: .pad #408 +; CHECK-NEXT: sub sp, #408 ; CHECK-NEXT: movw r7, :lower16:.L_MergedGlobals ; CHECK-NEXT: vldr s12, .LCPI1_0 ; CHECK-NEXT: movt r7, :upper16:.L_MergedGlobals @@ -173,7 +173,7 @@ ; CHECK-NEXT: vdup.32 q7, r3 ; CHECK-NEXT: vmov q6[2], q6[0], r3, r5 ; CHECK-NEXT: vmov.f32 s22, s13 -; CHECK-NEXT: vstrw.32 q0, [sp, #100] +; CHECK-NEXT: vstrw.32 q0, [sp, #92] ; CHECK-NEXT: vmov q0, q7 ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 ; CHECK-NEXT: vmov q4, q7 @@ -181,7 +181,7 @@ ; CHECK-NEXT: vmov.32 q7[1], r2 ; CHECK-NEXT: vmov.f32 s23, s15 ; CHECK-NEXT: movs r1, #64 -; CHECK-NEXT: str r0, [sp, #48] +; CHECK-NEXT: str r0, [sp, #40] ; CHECK-NEXT: vstrw.32 q5, [r0] ; CHECK-NEXT: str r6, [r0] ; CHECK-NEXT: vstrw.32 q7, [r0] @@ -196,9 +196,9 @@ ; CHECK-NEXT: vmov q2[3], q2[1], r4, r5 ; CHECK-NEXT: vmov.32 q4[0], r8 ; CHECK-NEXT: @ implicit-def: $r2 -; CHECK-NEXT: str.w r8, [sp, #52] -; CHECK-NEXT: vstrw.32 q3, [sp, #68] -; CHECK-NEXT: strh.w r12, [sp, #414] +; CHECK-NEXT: str.w r8, [sp, #44] +; CHECK-NEXT: vstrw.32 q3, [sp, #60] +; CHECK-NEXT: strh.w r12, [sp, 
#406] ; CHECK-NEXT: wlstp.8 lr, r1, .LBB1_2 ; CHECK-NEXT: .LBB1_1: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload @@ -209,7 +209,7 @@ ; CHECK-NEXT: str.w r8, [r7] ; CHECK-NEXT: vstrw.32 q4, [r0] ; CHECK-NEXT: vstrw.32 q2, [r0] -; CHECK-NEXT: str.w r12, [sp, #332] +; CHECK-NEXT: str.w r12, [sp, #324] ; CHECK-NEXT: .LBB1_3: @ %for.cond ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: b .LBB1_3 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll --- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll @@ -1055,45 +1055,45 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #88 -; CHECK-NEXT: sub sp, #88 -; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: .pad #72 +; CHECK-NEXT: sub sp, #72 +; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] ; CHECK-NEXT: subs r1, #7 -; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo.w .LBB6_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: ldr r3, [r0, #8] ; CHECK-NEXT: ldr r1, [r0] ; CHECK-NEXT: adds r0, r3, #3 -; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: bic r0, r0, #3 ; CHECK-NEXT: add.w r9, r1, r3, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: lsls r5, r3, #2 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2 -; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: rsb r1, r3, r3, lsl #3 ; CHECK-NEXT: lsls r1, r1, #2 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: .LBB6_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB6_3 Depth 2 ; CHECK-NEXT: adds r1, r0, #6 -; CHECK-NEXT: str r1, [sp, #44] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #5 -; CHECK-NEXT: str r1, [sp, #40] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #3 -; CHECK-NEXT: ldr r7, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill +; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: vmov.i32 q2, #0x0 -; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #2 -; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: add.w r8, r0, #1 ; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: vmov q4, q2 @@ -1102,7 +1102,7 @@ ; CHECK-NEXT: vmov q6, q2 ; CHECK-NEXT: vmov q1, q2 ; CHECK-NEXT: mov r12, r7 -; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill ; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: .LBB6_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1 @@ -1117,7 +1117,7 @@ ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q0, [r10] ; CHECK-NEXT: vfmat.f32 q6, q0, q7 -; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q0, [r11] ; 
CHECK-NEXT: vfmat.f32 q1, q0, q7 @@ -1129,17 +1129,17 @@ ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q0, [r6] ; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q1, [sp, #56] @ 16-byte Reload ; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f32 q1, q0, q7 -; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q1, [sp, #56] @ 16-byte Spill ; CHECK-NEXT: vmov q1, q3 ; CHECK-NEXT: vmov q3, q2 ; CHECK-NEXT: vmov q2, q4 ; CHECK-NEXT: vmov q4, q5 ; CHECK-NEXT: vmov q5, q6 -; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload ; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vpstt @@ -1168,7 +1168,7 @@ ; CHECK-NEXT: vadd.f32 s0, s2, s0 ; CHECK-NEXT: vadd.f32 s10, s18, s19 ; CHECK-NEXT: vadd.f32 s9, s16, s17 -; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload ; CHECK-NEXT: vadd.f32 s2, s3, s1 ; CHECK-NEXT: vadd.f32 s6, s18, s19 ; CHECK-NEXT: vadd.f32 s5, s16, s17 @@ -1183,25 +1183,25 @@ ; CHECK-NEXT: vadd.f32 s8, s8, s20 ; CHECK-NEXT: vadd.f32 s6, s5, s6 ; CHECK-NEXT: vstr s4, [r1] -; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s6, [r1] -; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s12, [r1] -; CHECK-NEXT: ldr r1, [sp, #40] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s10, [r1] -; CHECK-NEXT: ldr r1, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s8, [r1] -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: add r9, r1 -; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB6_2 ; CHECK-NEXT: .LBB6_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #88 +; CHECK-NEXT: add sp, #72 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -1345,44 +1345,44 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #104 -; CHECK-NEXT: sub sp, #104 -; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: .pad #88 +; CHECK-NEXT: sub sp, #88 +; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] ; CHECK-NEXT: subs r1, #8 -; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo.w .LBB7_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: ldr r3, [r0, #8] ; CHECK-NEXT: ldr r1, [r0] ; CHECK-NEXT: adds r0, r3, #3 -; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: bic r0, r0, #3 ; CHECK-NEXT: add.w r12, r1, r3, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: lsls r5, r3, #2 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2 -; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: lsls r1, r3, #5 -; CHECK-NEXT: str r1, 
[sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: .LBB7_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB7_3 Depth 2 ; CHECK-NEXT: adds r1, r0, #7 -; CHECK-NEXT: str r1, [sp, #44] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #6 -; CHECK-NEXT: str r1, [sp, #40] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #5 -; CHECK-NEXT: ldr r7, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: ldr.w r9, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q3, #0x0 -; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #3 -; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: add.w r8, r0, #2 ; CHECK-NEXT: adds r1, r0, #1 ; CHECK-NEXT: mov r3, r12 @@ -1392,8 +1392,8 @@ ; CHECK-NEXT: vmov q7, q3 ; CHECK-NEXT: vmov q2, q3 ; CHECK-NEXT: mov r10, r7 -; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q3, [sp, #56] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q3, [sp, #72] @ 16-byte Spill ; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: .LBB7_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1 @@ -1404,7 +1404,7 @@ ; CHECK-NEXT: vldrwt.u32 q0, [r9], #16 ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 ; CHECK-NEXT: vfmat.f32 q6, q1, q0 -; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r11] ; CHECK-NEXT: vfmat.f32 q7, q1, q0 @@ -1415,23 +1415,23 @@ ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q1, [r6] ; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp, #56] @ 16-byte Reload ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f32 q2, q1, q0 -; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill ; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q1, [r7] -; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload ; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f32 q2, q1, q0 -; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill ; CHECK-NEXT: vmov q2, q4 ; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vmov q3, q5 ; CHECK-NEXT: vmov q5, q6 -; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload ; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r6] @@ -1459,12 +1459,12 @@ ; CHECK-NEXT: vadd.f32 s6, s24, s25 ; CHECK-NEXT: vadd.f32 s14, s18, s19 ; CHECK-NEXT: vadd.f32 s7, s16, s17 -; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload ; CHECK-NEXT: vadd.f32 s8, s8, s9 ; CHECK-NEXT: vadd.f32 s13, s10, s11 ; CHECK-NEXT: vadd.f32 s10, s18, s19 ; CHECK-NEXT: vadd.f32 s9, s16, s17 -; CHECK-NEXT: vldrw.u32 q4, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp, #72] @ 16-byte Reload ; CHECK-NEXT: vadd.f32 s0, s2, s0 ; CHECK-NEXT: vadd.f32 s11, s18, s19 ; CHECK-NEXT: vadd.f32 
s15, s16, s17 @@ -1484,26 +1484,26 @@ ; CHECK-NEXT: vstr s10, [r1] ; CHECK-NEXT: add.w r1, r2, r4, lsl #2 ; CHECK-NEXT: vstr s14, [r1] -; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: vadd.f32 s4, s3, s1 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s8, [r1] -; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s12, [r1] -; CHECK-NEXT: ldr r1, [sp, #40] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s4, [r1] -; CHECK-NEXT: ldr r1, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s6, [r1] -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: add r12, r1 -; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB7_2 ; CHECK-NEXT: .LBB7_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #104 +; CHECK-NEXT: add sp, #88 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll --- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -1061,23 +1061,23 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #56 -; CHECK-NEXT: sub sp, #56 +; CHECK-NEXT: .pad #40 +; CHECK-NEXT: sub sp, #40 ; CHECK-NEXT: cmp r2, #8 -; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: vstr s0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: vstr s0, [sp] @ 4-byte Spill ; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: blo .LBB7_9 ; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: mov.w r11, #0 ; CHECK-NEXT: mov.w r12, #1 -; CHECK-NEXT: str r2, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: lsrs r1, r2, #2 ; CHECK-NEXT: b .LBB7_3 ; CHECK-NEXT: .LBB7_2: @ in Loop: Header=BB7_3 Depth=1 -; CHECK-NEXT: ldr r2, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: add.w r11, r11, #1 ; CHECK-NEXT: lsl.w r12, r12, #2 ; CHECK-NEXT: cmp r2, #7 @@ -1086,27 +1086,27 @@ ; CHECK-NEXT: .LBB7_3: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB7_6 Depth 2 ; CHECK-NEXT: @ Child Loop BB7_7 Depth 3 -; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: cmp.w r12, #1 -; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: lsr.w r2, r1, #2 -; CHECK-NEXT: str r2, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: blt .LBB7_2 ; CHECK-NEXT: @ %bb.4: @ in Loop: Header=BB7_3 Depth=1 ; CHECK-NEXT: lsrs r2, r1, #3 -; CHECK-NEXT: str r2, [sp, #32] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: beq .LBB7_2 ; CHECK-NEXT: @ %bb.5: @ %.preheader ; CHECK-NEXT: @ in Loop: 
Header=BB7_3 Depth=1 -; CHECK-NEXT: ldr r2, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: lsls r1, r1, #1 ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: lsl.w r10, r2, #1 ; CHECK-NEXT: .LBB7_6: @ Parent Loop BB7_3 Depth=1 ; CHECK-NEXT: @ => This Loop Header: Depth=2 ; CHECK-NEXT: @ Child Loop BB7_7 Depth 3 -; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: ldrd lr, r2, [r0, #16] ; CHECK-NEXT: ldrd r3, r8, [r0, #24] ; CHECK-NEXT: muls r1, r4, r1 @@ -1115,11 +1115,11 @@ ; CHECK-NEXT: ldr.w r3, [r3, r11, lsl #2] ; CHECK-NEXT: ldr.w r6, [lr, r11, lsl #2] ; CHECK-NEXT: add.w r7, r7, r2, lsl #2 -; CHECK-NEXT: ldr r2, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: add.w r5, r5, r3, lsl #2 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: add.w r3, r8, r6, lsl #2 -; CHECK-NEXT: ldr r6, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: ldr r6, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: add.w r2, r1, r10, lsl #2 ; CHECK-NEXT: add.w r8, r2, r10, lsl #2 ; CHECK-NEXT: add.w r9, r8, r10, lsl #2 @@ -1159,14 +1159,14 @@ ; CHECK-NEXT: bne .LBB7_6 ; CHECK-NEXT: b .LBB7_2 ; CHECK-NEXT: .LBB7_9: -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: lsrs r0, r0, #3 ; CHECK-NEXT: wls lr, r0, .LBB7_12 ; CHECK-NEXT: @ %bb.10: ; CHECK-NEXT: adr r0, .LCPI7_0 -; CHECK-NEXT: vldr s0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: vldr s0, [sp] @ 4-byte Reload ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vldrw.u32 q2, [q1, #64]! @@ -1182,7 +1182,7 @@ ; CHECK-NEXT: vsub.f32 q2, q0, q3 ; CHECK-NEXT: vmul.f32 q7, q7, r0 ; CHECK-NEXT: vadd.f32 q3, q0, q3 -; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q7, [sp, #24] @ 16-byte Spill ; CHECK-NEXT: vcadd.f32 q7, q6, q5, #90 ; CHECK-NEXT: vmul.f32 q4, q2, r0 ; CHECK-NEXT: vldrw.u32 q2, [q1, #64]! 
@@ -1192,11 +1192,11 @@ ; CHECK-NEXT: vstrw.32 q3, [q1, #-64] ; CHECK-NEXT: vstrw.32 q4, [q1, #-56] ; CHECK-NEXT: vstrw.32 q5, [q1, #-48] -; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q3, [sp, #24] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q3, [q1, #-40] ; CHECK-NEXT: le lr, .LBB7_11 ; CHECK-NEXT: .LBB7_12: -; CHECK-NEXT: add sp, #56 +; CHECK-NEXT: add sp, #40 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll --- a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll @@ -133,8 +133,8 @@ ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: cmp r1, #1 ; CHECK-NEXT: blt .LBB3_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader @@ -171,7 +171,7 @@ ; CHECK-NEXT: cmp r2, r1 ; CHECK-NEXT: bne .LBB3_2 ; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll @@ -768,8 +768,8 @@ ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #40 -; CHECK-NEXT: sub sp, #40 +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr ; CHECK-NEXT: vmov.i8 q2, #0xff @@ -1080,7 +1080,7 @@ ; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: add sp, #40 +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -115,8 +115,8 @@ ; CHECK-NEXT: push {r4, r5} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #136 -; CHECK-NEXT: sub sp, #136 +; CHECK-NEXT: .pad #112 +; CHECK-NEXT: sub sp, #112 ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: add.w r3, r0, #192 @@ -127,23 +127,23 @@ ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 ; CHECK-NEXT: vadd.i32 q4, q2, q3 ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vstrw.32 q4, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q6, [sp, #96] @ 16-byte Reload ; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vldrw.u32 q5, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r3] ; CHECK-NEXT: vadd.i32 q6, q5, q6 -; 
CHECK-NEXT: vstrw.32 q6, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q6, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q4, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vldrw.u32 q5, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [sp, #64] @ 16-byte Reload ; CHECK-NEXT: vadd.i32 q0, q0, q2 ; CHECK-NEXT: vadd.i32 q1, q3, q5 ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r2] ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r2] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r2] @@ -154,9 +154,9 @@ ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload ; CHECK-NEXT: vadd.i32 q0, q0, q5 ; CHECK-NEXT: vldmia sp, {d6, d7, d8, d9, d10, d11, d12, d13} @ 64-byte Reload ; CHECK-NEXT: vadd.i32 q1, q2, q1 @@ -165,13 +165,13 @@ ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: vadd.i32 q1, q5, q6 ; CHECK-NEXT: vadd.i32 q1, q2, q1 -; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #136 +; CHECK-NEXT: add sp, #112 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop {r4, r5} ; CHECK-NEXT: bx lr @@ -914,8 +914,8 @@ ; CHECK-NEXT: push {r4, r5} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #136 -; CHECK-NEXT: sub sp, #136 +; CHECK-NEXT: .pad #112 +; CHECK-NEXT: sub sp, #112 ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: add.w r3, r0, #192 @@ -926,23 +926,23 @@ ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 ; CHECK-NEXT: vadd.f32 q4, q2, q3 ; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: vstrw.32 q4, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q6, [sp, #96] @ 16-byte Reload ; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vldrw.u32 q5, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r3] ; CHECK-NEXT: vadd.f32 q6, q5, q6 -; CHECK-NEXT: vstrw.32 q6, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q6, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q4, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vldrw.u32 q5, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [sp, #64] @ 16-byte Reload ; CHECK-NEXT: vadd.f32 q0, q0, q2 ; CHECK-NEXT: vadd.f32 q1, q3, q5 ; 
CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r2] ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r2] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r2] @@ -953,9 +953,9 @@ ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload ; CHECK-NEXT: vadd.f32 q0, q0, q5 ; CHECK-NEXT: vldmia sp, {d6, d7, d8, d9, d10, d11, d12, d13} @ 64-byte Reload ; CHECK-NEXT: vadd.f32 q1, q2, q1 @@ -964,13 +964,13 @@ ; CHECK-NEXT: vadd.f32 q0, q0, q1 ; CHECK-NEXT: vadd.f32 q1, q5, q6 ; CHECK-NEXT: vadd.f32 q1, q2, q1 -; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #136 +; CHECK-NEXT: add sp, #112 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop {r4, r5} ; CHECK-NEXT: bx lr @@ -1147,8 +1147,8 @@ ; CHECK-NEXT: push {r4, r5} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #88 -; CHECK-NEXT: sub sp, #88 +; CHECK-NEXT: .pad #80 +; CHECK-NEXT: sub sp, #80 ; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] @@ -1171,7 +1171,7 @@ ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vadd.f16 q0, q0, q4 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #88 +; CHECK-NEXT: add sp, #80 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5} ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll @@ -8,14 +8,14 @@ ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: mul r12, r3, r2 ; CHECK-NEXT: lsrs.w r2, r12, #2 ; CHECK-NEXT: beq.w .LBB0_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: mvn r3, #7 -; CHECK-NEXT: ldr r2, [sp, #96] +; CHECK-NEXT: ldr r2, [sp, #88] ; CHECK-NEXT: and.w r3, r3, r12, lsr #2 ; CHECK-NEXT: sub.w r12, r3, #8 ; CHECK-NEXT: movs r3, #1 @@ -123,7 +123,7 @@ ; CHECK-NEXT: vstrh.16 q4, [r1, #-48] ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: @ %while.end -; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -119,8 +119,8 @@ ; CHECK-NEXT: push {r4, r5} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 
-; CHECK-NEXT: .pad #216 -; CHECK-NEXT: sub sp, #216 +; CHECK-NEXT: .pad #192 +; CHECK-NEXT: sub sp, #192 ; CHECK-NEXT: vldrw.u32 q2, [r0, #144] ; CHECK-NEXT: add r2, sp, #64 ; CHECK-NEXT: vldrw.u32 q1, [r0, #80] @@ -180,7 +180,7 @@ ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: add sp, #216 +; CHECK-NEXT: add sp, #192 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5} ; CHECK-NEXT: bx lr @@ -889,8 +889,8 @@ ; CHECK-NEXT: push {r4, r5} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #216 -; CHECK-NEXT: sub sp, #216 +; CHECK-NEXT: .pad #192 +; CHECK-NEXT: sub sp, #192 ; CHECK-NEXT: vldrw.u32 q2, [r0, #144] ; CHECK-NEXT: add r2, sp, #64 ; CHECK-NEXT: vldrw.u32 q1, [r0, #80] @@ -950,7 +950,7 @@ ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst43.32 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: add sp, #216 +; CHECK-NEXT: add sp, #192 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5} ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll --- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll @@ -23,41 +23,41 @@ ; CHECK-NEXT: pushq %r15 ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $4056, %rsp # imm = 0xFD8 +; CHECK-NEXT: subq $2120, %rsp # imm = 0x848 ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %ebp ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 -; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqu64 %zmm0, (%rsp) +; CHECK-NEXT: movb $1, (%rsp) ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg (%rsp) ; CHECK-NEXT: movl $buf, %eax ; CHECK-NEXT: movl $32, %r14d ; CHECK-NEXT: movw $8, %r15w ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm1 ; CHECK-NEXT: movabsq $64, %rax -; CHECK-NEXT: tilestored %tmm1, 2048(%rsp,%rax) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm1, 1088(%rsp,%rax) # 1024-byte Folded Spill ; CHECK-NEXT: movl $buf+1024, %eax ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2 ; CHECK-NEXT: movabsq $64, %rax -; CHECK-NEXT: tilestored %tmm2, 1024(%rsp,%rax) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm2, 64(%rsp,%rax) # 1024-byte Folded Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg (%rsp) ; CHECK-NEXT: movl $buf+2048, %eax ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0 ; CHECK-NEXT: movabsq $64, %rcx -; CHECK-NEXT: tileloadd 2048(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload +; CHECK-NEXT: tileloadd 1088(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload ; CHECK-NEXT: movabsq $64, %rcx -; CHECK-NEXT: tileloadd 1024(%rsp,%rcx), %tmm2 # 1024-byte Folded Reload +; CHECK-NEXT: tileloadd 64(%rsp,%rcx), %tmm2 # 1024-byte Folded Reload ; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 ; CHECK-NEXT: tilestored %tmm0, (%rax,%r14) -; CHECK-NEXT: addq $4056, %rsp # imm = 0xFD8 +; CHECK-NEXT: addq $2120, %rsp # imm = 0x848 ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq 
%r14 ; CHECK-NEXT: popq %r15 @@ -111,16 +111,16 @@ ; CHECK-NEXT: pushq %r13 ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $3016, %rsp # imm = 0xBC8 +; CHECK-NEXT: subq $1096, %rsp # imm = 0x448 ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 -; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqu64 %zmm0, (%rsp) +; CHECK-NEXT: movb $1, (%rsp) ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg (%rsp) ; CHECK-NEXT: testl %r14d, %r14d ; CHECK-NEXT: jg .LBB2_4 ; CHECK-NEXT: # %bb.1: # %.preheader @@ -133,14 +133,14 @@ ; CHECK-NEXT: .LBB2_2: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: tileloadd (%r15,%r12), %tmm0 ; CHECK-NEXT: movabsq $64, %rax -; CHECK-NEXT: tilestored %tmm0, 1024(%rsp,%rax) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm0, 64(%rsp,%rax) # 1024-byte Folded Spill ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg (%rsp) ; CHECK-NEXT: movabsq $64, %rax -; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm0 # 1024-byte Folded Reload +; CHECK-NEXT: tileloadd 64(%rsp,%rax), %tmm0 # 1024-byte Folded Reload ; CHECK-NEXT: tilestored %tmm0, (%r13,%r12) ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg (%rsp) ; CHECK-NEXT: decl %ebp ; CHECK-NEXT: cmpl $7, %ebp ; CHECK-NEXT: jne .LBB2_2 @@ -155,7 +155,7 @@ ; CHECK-NEXT: jmp .LBB2_8 ; CHECK-NEXT: .LBB2_4: ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg (%rsp) ; CHECK-NEXT: movl $32, %eax ; CHECK-NEXT: movl $buf+1024, %ecx ; CHECK-NEXT: movw $8, %dx @@ -165,7 +165,7 @@ ; CHECK-NEXT: decl %r14d ; CHECK-NEXT: .LBB2_8: ; CHECK-NEXT: movl %r14d, %eax -; CHECK-NEXT: addq $3016, %rsp # imm = 0xBC8 +; CHECK-NEXT: addq $1096, %rsp # imm = 0x448 ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 ; CHECK-NEXT: popq %r13 @@ -270,11 +270,11 @@ ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $3024, %rsp # imm = 0xBD0 +; CHECK-NEXT: subq $1088, %rsp # imm = 0x440 ; CHECK-NEXT: movl %edi, %ebx ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 -; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqu64 %zmm0, (%rsp) +; CHECK-NEXT: movb $1, (%rsp) ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $buf, %r14d @@ -285,22 +285,22 @@ ; CHECK-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg (%rsp) ; CHECK-NEXT: testl %ebx, %ebx ; CHECK-NEXT: jle .LBB3_3 ; CHECK-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1 ; CHECK-NEXT: tileloadd (%r14,%r15), %tmm0 ; CHECK-NEXT: movabsq $64, %rax -; CHECK-NEXT: tilestored %tmm0, 1024(%rsp,%rax) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm0, 64(%rsp,%rax) # 1024-byte Folded Spill ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg (%rsp) ; CHECK-NEXT: movabsq $64, %rax -; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm0 # 1024-byte Folded Reload +; CHECK-NEXT: tileloadd 64(%rsp,%rax), %tmm0 # 1024-byte Folded Reload ; CHECK-NEXT: tilestored %tmm0, (%r12,%r15) ; CHECK-NEXT: callq foo ; CHECK-NEXT: jmp .LBB3_1 ; CHECK-NEXT: .LBB3_3: -; CHECK-NEXT: addq $3024, 
%rsp # imm = 0xBD0 +; CHECK-NEXT: addq $1088, %rsp # imm = 0x440 ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 ; CHECK-NEXT: popq %r14 diff --git a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll --- a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll @@ -35,12 +35,12 @@ ; CHECK-NEXT: .LBB0_2: # %loop.header ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: movabsq $64, %rax -; CHECK-NEXT: tilestored %tmm3, 2048(%rsp,%rax) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm3, 3024(%rsp,%rax) # 1024-byte Folded Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: movabsq $64, %rax -; CHECK-NEXT: tileloadd 2048(%rsp,%rax), %tmm3 # 1024-byte Folded Reload +; CHECK-NEXT: tileloadd 3024(%rsp,%rax), %tmm3 # 1024-byte Folded Reload ; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm0 ; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm1 ; CHECK-NEXT: # implicit-def: $rax diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll --- a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll @@ -10,12 +10,12 @@ ; CHECK-NEXT: pushq %r15 ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $4056, %rsp # imm = 0xFD8 +; CHECK-NEXT: subq $2120, %rsp # imm = 0x848 ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %ebp ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 -; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqu64 %zmm0, (%rsp) +; CHECK-NEXT: movb $1, (%rsp) ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) @@ -30,7 +30,7 @@ ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg (%rsp) ; CHECK-NEXT: movl $32, %r14d ; CHECK-NEXT: movl $buf+2048, %r15d ; CHECK-NEXT: tileloadd (%r15,%r14), %tmm5 @@ -44,16 +44,16 @@ ; CHECK-NEXT: movl $buf+1024, %eax ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm1 ; CHECK-NEXT: movabsq $64, %rax -; CHECK-NEXT: tilestored %tmm5, 2048(%rsp,%rax) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm5, 1088(%rsp,%rax) # 1024-byte Folded Spill ; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm5 ; CHECK-NEXT: movabsq $64, %rax -; CHECK-NEXT: tilestored %tmm5, 1024(%rsp,%rax) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm5, 64(%rsp,%rax) # 1024-byte Folded Spill ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg (%rsp) ; CHECK-NEXT: movabsq $64, %rax -; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload +; CHECK-NEXT: tileloadd 64(%rsp,%rax), %tmm6 # 1024-byte Folded Reload ; CHECK-NEXT: jmp .LBB0_3 ; CHECK-NEXT: .LBB0_2: # %if.false ; CHECK-NEXT: movl $buf, %eax @@ -62,16 +62,16 @@ ; CHECK-NEXT: movl $buf+1024, %eax ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm3 ; CHECK-NEXT: movabsq $64, %rax -; CHECK-NEXT: tilestored %tmm5, 2048(%rsp,%rax) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm5, 1088(%rsp,%rax) # 1024-byte Folded Spill ; CHECK-NEXT: tdpbssd %tmm3, %tmm2, %tmm5 ; CHECK-NEXT: movabsq $64, %rax -; CHECK-NEXT: tilestored %tmm5, 1024(%rsp,%rax) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm5, 64(%rsp,%rax) # 1024-byte Folded 
Spill ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg (%rsp) ; CHECK-NEXT: movabsq $64, %rax -; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload +; CHECK-NEXT: tileloadd 64(%rsp,%rax), %tmm6 # 1024-byte Folded Reload ; CHECK-NEXT: tilestored %tmm6, (%r15,%r14) ; CHECK-NEXT: .LBB0_3: # %exit ; CHECK-NEXT: movl $buf, %eax @@ -79,11 +79,11 @@ ; CHECK-NEXT: movw $8, %dx ; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm4 ; CHECK-NEXT: movabsq $64, %rax -; CHECK-NEXT: tileloadd 2048(%rsp,%rax), %tmm5 # 1024-byte Folded Reload +; CHECK-NEXT: tileloadd 1088(%rsp,%rax), %tmm5 # 1024-byte Folded Reload ; CHECK-NEXT: tdpbssd %tmm4, %tmm6, %tmm5 ; CHECK-NEXT: movl $buf+2048, %eax ; CHECK-NEXT: tilestored %tmm5, (%rax,%rcx) -; CHECK-NEXT: addq $4056, %rsp # imm = 0xFD8 +; CHECK-NEXT: addq $2120, %rsp # imm = 0x848 ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: popq %r15 diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill.ll b/llvm/test/CodeGen/X86/AMX/amx-spill.ll --- a/llvm/test/CodeGen/X86/AMX/amx-spill.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-spill.ll @@ -7,33 +7,33 @@ define dso_local void @test_api(i32 %0, i16 signext %1, i16 signext %2) nounwind { ; CHECK-LABEL: test_api: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $2936, %rsp # imm = 0xB78 +; CHECK-NEXT: subq $968, %rsp # imm = 0x3C8 ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 -; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %si, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %sil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %sil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %dl, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %dl, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %sil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %dl, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %sil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %dl, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $buf, %r8d ; CHECK-NEXT: movl $32, %eax ; CHECK-NEXT: tileloadd (%r8,%rax), %tmm1 ; CHECK-NEXT: tileloadd (%r8,%rax), %tmm1 ; CHECK-NEXT: movabsq $64, %rcx -; CHECK-NEXT: tilestored %tmm1, 896(%rsp,%rcx) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm1, -64(%rsp,%rcx) # 1024-byte Folded Spill ; CHECK-NEXT: tileloadd (%r8,%rax), %tmm3 ; CHECK-NEXT: tileloadd (%r8,%rax), %tmm4 ; CHECK-NEXT: tileloadd (%r8,%rax), %tmm2 @@ -54,14 +54,14 
@@ ; CHECK-NEXT: .LBB0_3: ; CHECK-NEXT: tdpbssd %tmm7, %tmm6, %tmm1 ; CHECK-NEXT: movabsq $64, %rax -; CHECK-NEXT: tileloadd 896(%rsp,%rax), %tmm7 # 1024-byte Folded Reload +; CHECK-NEXT: tileloadd -64(%rsp,%rax), %tmm7 # 1024-byte Folded Reload ; CHECK-NEXT: tdpbssd %tmm7, %tmm1, %tmm3 ; CHECK-NEXT: tdpbssd %tmm4, %tmm3, %tmm2 ; CHECK-NEXT: tdpbssd %tmm5, %tmm2, %tmm0 ; CHECK-NEXT: movl $buf, %eax ; CHECK-NEXT: movl $32, %ecx ; CHECK-NEXT: tilestored %tmm0, (%rax,%rcx) -; CHECK-NEXT: addq $2936, %rsp # imm = 0xB78 +; CHECK-NEXT: addq $968, %rsp # imm = 0x3C8 ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/addsub-constant-folding.ll b/llvm/test/CodeGen/X86/addsub-constant-folding.ll --- a/llvm/test/CodeGen/X86/addsub-constant-folding.ll +++ b/llvm/test/CodeGen/X86/addsub-constant-folding.ll @@ -64,12 +64,12 @@ define <4 x i32> @vec_add_const_add_const(<4 x i32> %arg) { ; X86-LABEL: vec_add_const_add_const: ; X86: # %bb.0: -; X86-NEXT: paddd {{\.LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: vec_add_const_add_const: ; X64: # %bb.0: -; X64-NEXT: paddd {{.*}}(%rip), %xmm0 +; X64-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: retq %t0 = add <4 x i32> %arg, %t1 = add <4 x i32> %t0, @@ -79,16 +79,16 @@ define <4 x i32> @vec_add_const_add_const_extrause(<4 x i32> %arg) { ; X86-LABEL: vec_add_const_add_const_extrause: ; X86: # %bb.0: -; X86-NEXT: subl $28, %esp -; X86-NEXT: .cfi_def_cfa_offset 32 +; X86-NEXT: subl $16, %esp +; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: movdqu %xmm0, (%esp) # 16-byte Spill ; X86-NEXT: movdqa {{.*#+}} xmm0 = [8,8,8,8] ; X86-NEXT: paddd %xmm1, %xmm0 ; X86-NEXT: calll vec_use@PLT ; X86-NEXT: movdqu (%esp), %xmm0 # 16-byte Reload -; X86-NEXT: paddd {{\.LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-NEXT: addl $28, %esp +; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: addl $16, %esp ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; @@ -102,7 +102,7 @@ ; X64-NEXT: paddd %xmm1, %xmm0 ; X64-NEXT: callq vec_use@PLT ; X64-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; X64-NEXT: paddd {{.*}}(%rip), %xmm0 +; X64-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: addq $24, %rsp ; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq @@ -115,12 +115,12 @@ define <4 x i32> @vec_add_const_add_const_nonsplat(<4 x i32> %arg) { ; X86-LABEL: vec_add_const_add_const_nonsplat: ; X86: # %bb.0: -; X86-NEXT: paddd {{\.LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: vec_add_const_add_const_nonsplat: ; X64: # %bb.0: -; X64-NEXT: paddd {{.*}}(%rip), %xmm0 +; X64-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: retq %t0 = add <4 x i32> %arg, %t1 = add <4 x i32> %t0, @@ -186,12 +186,12 @@ define <4 x i32> @vec_add_const_sub_const(<4 x i32> %arg) { ; X86-LABEL: vec_add_const_sub_const: ; X86: # %bb.0: -; X86-NEXT: paddd {{\.LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: vec_add_const_sub_const: ; X64: # %bb.0: -; X64-NEXT: paddd {{.*}}(%rip), %xmm0 +; X64-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: retq %t0 = add <4 x i32> %arg, %t1 = sub <4 x i32> %t0, @@ -201,16 +201,16 @@ define <4 x i32> @vec_add_const_sub_const_extrause(<4 x i32> %arg) { ; X86-LABEL: vec_add_const_sub_const_extrause: ; X86: # %bb.0: -; X86-NEXT: subl $28, %esp -; X86-NEXT: .cfi_def_cfa_offset 
32 +; X86-NEXT: subl $16, %esp +; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: movdqu %xmm0, (%esp) # 16-byte Spill ; X86-NEXT: movdqa {{.*#+}} xmm0 = [8,8,8,8] ; X86-NEXT: paddd %xmm1, %xmm0 ; X86-NEXT: calll vec_use@PLT ; X86-NEXT: movdqu (%esp), %xmm0 # 16-byte Reload -; X86-NEXT: paddd {{\.LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-NEXT: addl $28, %esp +; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: addl $16, %esp ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; @@ -224,7 +224,7 @@ ; X64-NEXT: paddd %xmm1, %xmm0 ; X64-NEXT: callq vec_use@PLT ; X64-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; X64-NEXT: paddd {{.*}}(%rip), %xmm0 +; X64-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: addq $24, %rsp ; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq @@ -237,12 +237,12 @@ define <4 x i32> @vec_add_const_sub_const_nonsplat(<4 x i32> %arg) { ; X86-LABEL: vec_add_const_sub_const_nonsplat: ; X86: # %bb.0: -; X86-NEXT: paddd {{\.LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: vec_add_const_sub_const_nonsplat: ; X64: # %bb.0: -; X64-NEXT: paddd {{.*}}(%rip), %xmm0 +; X64-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: retq %t0 = add <4 x i32> %arg, %t1 = sub <4 x i32> %t0, @@ -328,8 +328,8 @@ define <4 x i32> @vec_add_const_const_sub_extrause(<4 x i32> %arg) { ; X86-LABEL: vec_add_const_const_sub_extrause: ; X86: # %bb.0: -; X86-NEXT: subl $28, %esp -; X86-NEXT: .cfi_def_cfa_offset 32 +; X86-NEXT: subl $16, %esp +; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: movdqu %xmm0, (%esp) # 16-byte Spill ; X86-NEXT: movdqa {{.*#+}} xmm0 = [8,8,8,8] @@ -338,7 +338,7 @@ ; X86-NEXT: movdqa {{.*#+}} xmm0 = [4294967290,4294967290,4294967290,4294967290] ; X86-NEXT: movdqu (%esp), %xmm1 # 16-byte Reload ; X86-NEXT: psubd %xmm1, %xmm0 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $16, %esp ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; @@ -440,12 +440,12 @@ define <4 x i32> @vec_sub_const_add_const(<4 x i32> %arg) { ; X86-LABEL: vec_sub_const_add_const: ; X86: # %bb.0: -; X86-NEXT: paddd {{\.LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: vec_sub_const_add_const: ; X64: # %bb.0: -; X64-NEXT: paddd {{.*}}(%rip), %xmm0 +; X64-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: retq %t0 = sub <4 x i32> %arg, %t1 = add <4 x i32> %t0, @@ -455,14 +455,14 @@ define <4 x i32> @vec_sub_const_add_const_extrause(<4 x i32> %arg) { ; X86-LABEL: vec_sub_const_add_const_extrause: ; X86: # %bb.0: -; X86-NEXT: subl $28, %esp -; X86-NEXT: .cfi_def_cfa_offset 32 +; X86-NEXT: subl $16, %esp +; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: movdqu %xmm0, (%esp) # 16-byte Spill -; X86-NEXT: psubd {{\.LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: calll vec_use@PLT ; X86-NEXT: movdqu (%esp), %xmm0 # 16-byte Reload -; X86-NEXT: paddd {{\.LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-NEXT: addl $28, %esp +; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: addl $16, %esp ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; @@ -471,10 +471,10 @@ ; X64-NEXT: subq $24, %rsp ; X64-NEXT: .cfi_def_cfa_offset 32 ; X64-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; X64-NEXT: psubd {{.*}}(%rip), %xmm0 +; X64-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: callq vec_use@PLT ; X64-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; X64-NEXT: paddd {{.*}}(%rip), 
%xmm0 +; X64-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: addq $24, %rsp ; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq @@ -487,12 +487,12 @@ define <4 x i32> @vec_sub_const_add_const_nonsplat(<4 x i32> %arg) { ; X86-LABEL: vec_sub_const_add_const_nonsplat: ; X86: # %bb.0: -; X86-NEXT: paddd {{\.LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: vec_sub_const_add_const_nonsplat: ; X64: # %bb.0: -; X64-NEXT: paddd {{.*}}(%rip), %xmm0 +; X64-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: retq %t0 = sub <4 x i32> %arg, %t1 = add <4 x i32> %t0, @@ -558,12 +558,12 @@ define <4 x i32> @vec_sub_const_sub_const(<4 x i32> %arg) { ; X86-LABEL: vec_sub_const_sub_const: ; X86: # %bb.0: -; X86-NEXT: psubd {{\.LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: vec_sub_const_sub_const: ; X64: # %bb.0: -; X64-NEXT: psubd {{.*}}(%rip), %xmm0 +; X64-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: retq %t0 = sub <4 x i32> %arg, %t1 = sub <4 x i32> %t0, @@ -573,14 +573,14 @@ define <4 x i32> @vec_sub_const_sub_const_extrause(<4 x i32> %arg) { ; X86-LABEL: vec_sub_const_sub_const_extrause: ; X86: # %bb.0: -; X86-NEXT: subl $28, %esp -; X86-NEXT: .cfi_def_cfa_offset 32 +; X86-NEXT: subl $16, %esp +; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: movdqu %xmm0, (%esp) # 16-byte Spill -; X86-NEXT: psubd {{\.LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: calll vec_use@PLT ; X86-NEXT: movdqu (%esp), %xmm0 # 16-byte Reload -; X86-NEXT: psubd {{\.LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-NEXT: addl $28, %esp +; X86-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: addl $16, %esp ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; @@ -589,10 +589,10 @@ ; X64-NEXT: subq $24, %rsp ; X64-NEXT: .cfi_def_cfa_offset 32 ; X64-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; X64-NEXT: psubd {{.*}}(%rip), %xmm0 +; X64-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: callq vec_use@PLT ; X64-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; X64-NEXT: psubd {{.*}}(%rip), %xmm0 +; X64-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: addq $24, %rsp ; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq @@ -605,12 +605,12 @@ define <4 x i32> @vec_sub_const_sub_const_nonsplat(<4 x i32> %arg) { ; X86-LABEL: vec_sub_const_sub_const_nonsplat: ; X86: # %bb.0: -; X86-NEXT: psubd {{\.LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: vec_sub_const_sub_const_nonsplat: ; X64: # %bb.0: -; X64-NEXT: psubd {{.*}}(%rip), %xmm0 +; X64-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: retq %t0 = sub <4 x i32> %arg, %t1 = sub <4 x i32> %t0, @@ -696,15 +696,15 @@ define <4 x i32> @vec_sub_const_const_sub_extrause(<4 x i32> %arg) { ; X86-LABEL: vec_sub_const_const_sub_extrause: ; X86: # %bb.0: -; X86-NEXT: subl $28, %esp -; X86-NEXT: .cfi_def_cfa_offset 32 -; X86-NEXT: psubd {{\.LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: subl $16, %esp +; X86-NEXT: .cfi_def_cfa_offset 20 +; X86-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: movdqu %xmm0, (%esp) # 16-byte Spill ; X86-NEXT: calll vec_use@PLT ; X86-NEXT: movdqa {{.*#+}} xmm0 = [2,2,2,2] ; X86-NEXT: movdqu (%esp), %xmm1 # 16-byte Reload ; X86-NEXT: psubd %xmm1, %xmm0 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $16, %esp ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; @@ -712,7 +712,7 @@ ; X64: # %bb.0: 
; X64-NEXT: subq $24, %rsp ; X64-NEXT: .cfi_def_cfa_offset 32 -; X64-NEXT: psubd {{.*}}(%rip), %xmm0 +; X64-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; X64-NEXT: callq vec_use@PLT ; X64-NEXT: movdqa {{.*#+}} xmm0 = [2,2,2,2] @@ -826,8 +826,8 @@ define <4 x i32> @vec_const_sub_add_const_extrause(<4 x i32> %arg) { ; X86-LABEL: vec_const_sub_add_const_extrause: ; X86: # %bb.0: -; X86-NEXT: subl $28, %esp -; X86-NEXT: .cfi_def_cfa_offset 32 +; X86-NEXT: subl $16, %esp +; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: movdqu %xmm0, (%esp) # 16-byte Spill ; X86-NEXT: movdqa {{.*#+}} xmm0 = [8,8,8,8] @@ -836,7 +836,7 @@ ; X86-NEXT: movdqa {{.*#+}} xmm0 = [10,10,10,10] ; X86-NEXT: movdqu (%esp), %xmm1 # 16-byte Reload ; X86-NEXT: psubd %xmm1, %xmm0 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $16, %esp ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; @@ -960,8 +960,8 @@ define <4 x i32> @vec_const_sub_sub_const_extrause(<4 x i32> %arg) { ; X86-LABEL: vec_const_sub_sub_const_extrause: ; X86: # %bb.0: -; X86-NEXT: subl $28, %esp -; X86-NEXT: .cfi_def_cfa_offset 32 +; X86-NEXT: subl $16, %esp +; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: movdqu %xmm0, (%esp) # 16-byte Spill ; X86-NEXT: movdqa {{.*#+}} xmm0 = [8,8,8,8] @@ -970,7 +970,7 @@ ; X86-NEXT: movdqa {{.*#+}} xmm0 = [6,6,6,6] ; X86-NEXT: movdqu (%esp), %xmm1 # 16-byte Reload ; X86-NEXT: psubd %xmm1, %xmm0 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $16, %esp ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; @@ -1074,12 +1074,12 @@ define <4 x i32> @vec_const_sub_const_sub(<4 x i32> %arg) { ; X86-LABEL: vec_const_sub_const_sub: ; X86: # %bb.0: -; X86-NEXT: paddd {{\.LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: vec_const_sub_const_sub: ; X64: # %bb.0: -; X64-NEXT: paddd {{.*}}(%rip), %xmm0 +; X64-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: retq %t0 = sub <4 x i32> , %arg %t1 = sub <4 x i32> , %t0 @@ -1089,8 +1089,8 @@ define <4 x i32> @vec_const_sub_const_sub_extrause(<4 x i32> %arg) { ; X86-LABEL: vec_const_sub_const_sub_extrause: ; X86: # %bb.0: -; X86-NEXT: subl $28, %esp -; X86-NEXT: .cfi_def_cfa_offset 32 +; X86-NEXT: subl $16, %esp +; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8] ; X86-NEXT: psubd %xmm0, %xmm1 ; X86-NEXT: movdqu %xmm1, (%esp) # 16-byte Spill @@ -1099,7 +1099,7 @@ ; X86-NEXT: movdqa {{.*#+}} xmm0 = [2,2,2,2] ; X86-NEXT: movdqu (%esp), %xmm1 # 16-byte Reload ; X86-NEXT: psubd %xmm1, %xmm0 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $16, %esp ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; @@ -1126,12 +1126,12 @@ define <4 x i32> @vec_const_sub_const_sub_nonsplat(<4 x i32> %arg) { ; X86-LABEL: vec_const_sub_const_sub_nonsplat: ; X86: # %bb.0: -; X86-NEXT: paddd {{\.LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: vec_const_sub_const_sub_nonsplat: ; X64: # %bb.0: -; X64-NEXT: paddd {{.*}}(%rip), %xmm0 +; X64-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: retq %t0 = sub <4 x i32> , %arg %t1 = sub <4 x i32> , %t0 diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86_64.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86_64.ll --- a/llvm/test/CodeGen/X86/avx-intrinsics-x86_64.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86_64.ll @@ -7,10 +7,10 @@ ; AVX: # %bb.0: ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # encoding: 
[0xc5,0xfd,0x58,0xc1] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: # encoding: [0xc5,0xfd,0x11,0x44,0x24,0xc8] +; AVX-NEXT: # encoding: [0xc5,0xfd,0x11,0x44,0x24,0xd8] ; AVX-NEXT: vzeroall # encoding: [0xc5,0xfc,0x77] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: # encoding: [0xc5,0xfc,0x10,0x44,0x24,0xc8] +; AVX-NEXT: # encoding: [0xc5,0xfc,0x10,0x44,0x24,0xd8] ; AVX-NEXT: retq # encoding: [0xc3] ; ; AVX512VL-LABEL: test_x86_avx_vzeroall: @@ -30,10 +30,10 @@ ; AVX: # %bb.0: ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x58,0xc1] ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: # encoding: [0xc5,0xfd,0x11,0x44,0x24,0xc8] +; AVX-NEXT: # encoding: [0xc5,0xfd,0x11,0x44,0x24,0xd8] ; AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: # encoding: [0xc5,0xfc,0x10,0x44,0x24,0xc8] +; AVX-NEXT: # encoding: [0xc5,0xfc,0x10,0x44,0x24,0xd8] ; AVX-NEXT: retq # encoding: [0xc3] ; ; AVX512VL-LABEL: test_x86_avx_vzeroupper: diff --git a/llvm/test/CodeGen/X86/avx-vzeroupper.ll b/llvm/test/CodeGen/X86/avx-vzeroupper.ll --- a/llvm/test/CodeGen/X86/avx-vzeroupper.ll +++ b/llvm/test/CodeGen/X86/avx-vzeroupper.ll @@ -32,56 +32,56 @@ define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind { ; VZ-LABEL: test01: ; VZ: # %bb.0: -; VZ-NEXT: subq $56, %rsp +; VZ-NEXT: subq $40, %rsp ; VZ-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; VZ-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; VZ-NEXT: vmovaps x(%rip), %xmm0 ; VZ-NEXT: vzeroupper ; VZ-NEXT: callq do_sse -; VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; VZ-NEXT: vmovaps %xmm0, x(%rip) ; VZ-NEXT: callq do_sse -; VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; VZ-NEXT: vmovaps %xmm0, x(%rip) ; VZ-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; VZ-NEXT: addq $56, %rsp +; VZ-NEXT: addq $40, %rsp ; VZ-NEXT: retq ; ; DISABLE-VZ-LABEL: test01: ; DISABLE-VZ: # %bb.0: -; DISABLE-VZ-NEXT: subq $56, %rsp +; DISABLE-VZ-NEXT: subq $40, %rsp ; DISABLE-VZ-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; DISABLE-VZ-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; DISABLE-VZ-NEXT: vmovaps x(%rip), %xmm0 ; DISABLE-VZ-NEXT: callq do_sse -; DISABLE-VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; DISABLE-VZ-NEXT: vmovaps %xmm0, x(%rip) ; DISABLE-VZ-NEXT: callq do_sse -; DISABLE-VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; DISABLE-VZ-NEXT: vmovaps %xmm0, x(%rip) ; DISABLE-VZ-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; DISABLE-VZ-NEXT: addq $56, %rsp +; DISABLE-VZ-NEXT: addq $40, %rsp ; DISABLE-VZ-NEXT: retq ; ; BDVER2-LABEL: test01: ; BDVER2: # %bb.0: -; BDVER2-NEXT: subq $56, %rsp -; BDVER2-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; BDVER2-NEXT: subq $40, %rsp +; BDVER2-NEXT: vmovaps x(%rip), %xmm0 ; BDVER2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; BDVER2-NEXT: vzeroupper ; BDVER2-NEXT: callq do_sse -; BDVER2-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; BDVER2-NEXT: vmovaps %xmm0, x(%rip) ; BDVER2-NEXT: callq do_sse -; BDVER2-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; BDVER2-NEXT: vmovaps %xmm0, x(%rip) ; BDVER2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; BDVER2-NEXT: addq $56, %rsp +; BDVER2-NEXT: addq $40, %rsp ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: test01: ; BTVER2: # %bb.0: -; BTVER2-NEXT: subq $56, %rsp -; BTVER2-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; BTVER2-NEXT: subq $40, %rsp +; BTVER2-NEXT: vmovaps x(%rip), %xmm0 ; BTVER2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; BTVER2-NEXT: callq do_sse -; BTVER2-NEXT: 
vmovaps %xmm0, {{.*}}(%rip) +; BTVER2-NEXT: vmovaps %xmm0, x(%rip) ; BTVER2-NEXT: callq do_sse -; BTVER2-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; BTVER2-NEXT: vmovaps %xmm0, x(%rip) ; BTVER2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; BTVER2-NEXT: addq $56, %rsp +; BTVER2-NEXT: addq $40, %rsp ; BTVER2-NEXT: retq %tmp = load <4 x float>, <4 x float>* @x, align 16 %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind @@ -145,7 +145,7 @@ ; VZ-NEXT: # =>This Inner Loop Header: Depth=1 ; VZ-NEXT: callq do_sse ; VZ-NEXT: callq do_sse -; VZ-NEXT: vmovaps g+{{.*}}(%rip), %xmm0 +; VZ-NEXT: vmovaps g+16(%rip), %xmm0 ; VZ-NEXT: callq do_sse ; VZ-NEXT: decl %ebx ; VZ-NEXT: jne .LBB3_3 @@ -174,7 +174,7 @@ ; DISABLE-VZ-NEXT: # =>This Inner Loop Header: Depth=1 ; DISABLE-VZ-NEXT: callq do_sse ; DISABLE-VZ-NEXT: callq do_sse -; DISABLE-VZ-NEXT: vmovaps g+{{.*}}(%rip), %xmm0 +; DISABLE-VZ-NEXT: vmovaps g+16(%rip), %xmm0 ; DISABLE-VZ-NEXT: callq do_sse ; DISABLE-VZ-NEXT: decl %ebx ; DISABLE-VZ-NEXT: jne .LBB3_3 @@ -203,7 +203,7 @@ ; BDVER2-NEXT: # =>This Inner Loop Header: Depth=1 ; BDVER2-NEXT: callq do_sse ; BDVER2-NEXT: callq do_sse -; BDVER2-NEXT: vmovaps g+{{.*}}(%rip), %xmm0 +; BDVER2-NEXT: vmovaps g+16(%rip), %xmm0 ; BDVER2-NEXT: callq do_sse ; BDVER2-NEXT: decl %ebx ; BDVER2-NEXT: jne .LBB3_3 @@ -232,7 +232,7 @@ ; BTVER2-NEXT: # =>This Inner Loop Header: Depth=1 ; BTVER2-NEXT: callq do_sse ; BTVER2-NEXT: callq do_sse -; BTVER2-NEXT: vmovaps g+{{.*}}(%rip), %xmm0 +; BTVER2-NEXT: vmovaps g+16(%rip), %xmm0 ; BTVER2-NEXT: callq do_sse ; BTVER2-NEXT: decl %ebx ; BTVER2-NEXT: jne .LBB3_3 diff --git a/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll b/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll --- a/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll +++ b/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll @@ -7,7 +7,7 @@ ; CHECK-LABEL: bar__512: ; CHECK: ## %bb.0: ## %allocas ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $112, %rsp +; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: vmovups (%rdi), %zmm0 ; CHECK-NEXT: vmovups %zmm0, (%rsp) ## 64-byte Spill @@ -18,7 +18,7 @@ ; CHECK-NEXT: callq _Print__512 ; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; CHECK-NEXT: vmovaps %zmm0, (%rbx) -; CHECK-NEXT: addq $112, %rsp +; CHECK-NEXT: addq $64, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: retq allocas: diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll --- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -323,12 +323,12 @@ ; ; WIN64-LABEL: test_argRetFloat: ; WIN64: # %bb.0: -; WIN64-NEXT: vaddss __real@{{.*}}(%rip), %xmm0, %xmm0 +; WIN64-NEXT: vaddss __real@3f800000(%rip), %xmm0, %xmm0 ; WIN64-NEXT: retq ; ; LINUXOSX64-LABEL: test_argRetFloat: ; LINUXOSX64: # %bb.0: -; LINUXOSX64-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0 +; LINUXOSX64-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; LINUXOSX64-NEXT: retq %add = fadd float 1.0, %a ret float %add @@ -393,12 +393,12 @@ ; ; WIN64-LABEL: test_argRetDouble: ; WIN64: # %bb.0: -; WIN64-NEXT: vaddsd __real@{{.*}}(%rip), %xmm0, %xmm0 +; WIN64-NEXT: vaddsd __real@3ff0000000000000(%rip), %xmm0, %xmm0 ; WIN64-NEXT: retq ; ; LINUXOSX64-LABEL: test_argRetDouble: ; LINUXOSX64: # %bb.0: -; LINUXOSX64-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0 +; LINUXOSX64-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; LINUXOSX64-NEXT: retq %add = fadd double %a, 1.0 ret double %add @@ -758,7 +758,7 @@ define 
dso_local x86_regcallcc <8 x i32> @test_CallargRet256Vector(<8 x i1> %x, <8 x i32> %a) { ; X32-LABEL: test_CallargRet256Vector: ; X32: # %bb.0: -; X32-NEXT: subl $92, %esp +; X32-NEXT: subl $36, %esp ; X32-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill ; X32-NEXT: kmovd %eax, %k1 ; X32-NEXT: kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill @@ -767,13 +767,13 @@ ; X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload ; X32-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %ymm1 # 32-byte Reload ; X32-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} -; X32-NEXT: addl $92, %esp +; X32-NEXT: addl $36, %esp ; X32-NEXT: retl ; ; WIN64-LABEL: test_CallargRet256Vector: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $88, %rsp -; WIN64-NEXT: .seh_stackalloc 88 +; WIN64-NEXT: subq $56, %rsp +; WIN64-NEXT: .seh_stackalloc 56 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; WIN64-NEXT: kmovd %eax, %k1 @@ -783,14 +783,14 @@ ; WIN64-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; WIN64-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; WIN64-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} -; WIN64-NEXT: addq $88, %rsp +; WIN64-NEXT: addq $56, %rsp ; WIN64-NEXT: retq ; WIN64-NEXT: .seh_endproc ; ; LINUXOSX64-LABEL: test_CallargRet256Vector: ; LINUXOSX64: # %bb.0: -; LINUXOSX64-NEXT: subq $88, %rsp -; LINUXOSX64-NEXT: .cfi_def_cfa_offset 96 +; LINUXOSX64-NEXT: subq $56, %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 64 ; LINUXOSX64-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; LINUXOSX64-NEXT: kmovd %eax, %k1 ; LINUXOSX64-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill @@ -799,7 +799,7 @@ ; LINUXOSX64-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; LINUXOSX64-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; LINUXOSX64-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} -; LINUXOSX64-NEXT: addq $88, %rsp +; LINUXOSX64-NEXT: addq $56, %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq %b = call x86_regcallcc <8 x i32> @test_argRet256Vector(<8 x i1> %x, <8 x i32> %a, <8 x i32> %a) @@ -834,7 +834,7 @@ define dso_local x86_regcallcc <16 x i32> @test_CallargRet512Vector(<16 x i1> %x, <16 x i32> %a) { ; X32-LABEL: test_CallargRet512Vector: ; X32: # %bb.0: -; X32-NEXT: subl $188, %esp +; X32-NEXT: subl $68, %esp ; X32-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 64-byte Spill ; X32-NEXT: kmovd %eax, %k1 ; X32-NEXT: kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill @@ -843,13 +843,13 @@ ; X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload ; X32-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm1 # 64-byte Reload ; X32-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; X32-NEXT: addl $188, %esp +; X32-NEXT: addl $68, %esp ; X32-NEXT: retl ; ; WIN64-LABEL: test_CallargRet512Vector: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $184, %rsp -; WIN64-NEXT: .seh_stackalloc 184 +; WIN64-NEXT: subq $88, %rsp +; WIN64-NEXT: .seh_stackalloc 88 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; WIN64-NEXT: kmovd %eax, %k1 @@ -859,14 +859,14 @@ ; WIN64-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; WIN64-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; WIN64-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; WIN64-NEXT: addq $184, %rsp +; WIN64-NEXT: addq $88, %rsp ; WIN64-NEXT: retq ; WIN64-NEXT: .seh_endproc ; ; LINUXOSX64-LABEL: test_CallargRet512Vector: ; LINUXOSX64: # %bb.0: -; LINUXOSX64-NEXT: subq $184, %rsp -; LINUXOSX64-NEXT: 
.cfi_def_cfa_offset 192 +; LINUXOSX64-NEXT: subq $88, %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 96 ; LINUXOSX64-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; LINUXOSX64-NEXT: kmovd %eax, %k1 ; LINUXOSX64-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill @@ -875,7 +875,7 @@ ; LINUXOSX64-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; LINUXOSX64-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; LINUXOSX64-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; LINUXOSX64-NEXT: addq $184, %rsp +; LINUXOSX64-NEXT: addq $88, %rsp ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq %b = call x86_regcallcc <16 x i32> @test_argRet512Vector(<16 x i1> %x, <16 x i32> %a, <16 x i32> %a) @@ -1267,7 +1267,7 @@ ; ; WIN64-LABEL: test_argMultiRet: ; WIN64: # %bb.0: -; WIN64-NEXT: vaddsd __real@{{.*}}(%rip), %xmm1, %xmm1 +; WIN64-NEXT: vaddsd __real@4014000000000000(%rip), %xmm1, %xmm1 ; WIN64-NEXT: movl $999, %edx # imm = 0x3E7 ; WIN64-NEXT: movl $4, %eax ; WIN64-NEXT: movb $7, %cl @@ -1275,7 +1275,7 @@ ; ; LINUXOSX64-LABEL: test_argMultiRet: ; LINUXOSX64: # %bb.0: -; LINUXOSX64-NEXT: vaddsd {{.*}}(%rip), %xmm1, %xmm1 +; LINUXOSX64-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; LINUXOSX64-NEXT: movl $999, %edx # imm = 0x3E7 ; LINUXOSX64-NEXT: movl $4, %eax ; LINUXOSX64-NEXT: movb $7, %cl diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -651,7 +651,7 @@ ; X86-LABEL: vector_i128_i64: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: subl $72, %esp +; X86-NEXT: subl $64, %esp ; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -696,7 +696,7 @@ ; X86-NEXT: paddq %xmm2, %xmm1 ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; X86-NEXT: psubq %xmm1, %xmm0 -; X86-NEXT: addl $72, %esp +; X86-NEXT: addl $64, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -651,7 +651,7 @@ ; X86-LABEL: vector_i128_i64: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: subl $72, %esp +; X86-NEXT: subl $64, %esp ; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -696,7 +696,7 @@ ; X86-NEXT: paddq %xmm2, %xmm1 ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; X86-NEXT: psubq %xmm1, %xmm0 -; X86-NEXT: addl $72, %esp +; X86-NEXT: addl $64, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll --- a/llvm/test/CodeGen/X86/fma.ll +++ b/llvm/test/CodeGen/X86/fma.ll @@ -251,41 +251,41 @@ define float @test_f32_cst() #0 { ; FMA32-LABEL: test_f32_cst: ; FMA32: ## %bb.0: ## %entry -; FMA32-NEXT: flds LCPI4_0 ## encoding: [0xd9,0x05,A,A,A,A] -; FMA32-NEXT: ## fixup A - offset: 2, value: LCPI4_0, kind: FK_Data_4 +; FMA32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} ## encoding: [0xd9,0x05,A,A,A,A] +; FMA32-NEXT: ## fixup A - offset: 2, value: 
{{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; FMA32-NEXT: retl ## encoding: [0xc3] ; ; FMACALL32-LABEL: test_f32_cst: ; FMACALL32: ## %bb.0: ## %entry -; FMACALL32-NEXT: flds LCPI4_0 ## encoding: [0xd9,0x05,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 2, value: LCPI4_0, kind: FK_Data_4 +; FMACALL32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} ## encoding: [0xd9,0x05,A,A,A,A] +; FMACALL32-NEXT: ## fixup A - offset: 2, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; FMACALL32-NEXT: retl ## encoding: [0xc3] ; ; FMA64-LABEL: test_f32_cst: ; FMA64: ## %bb.0: ## %entry -; FMA64-NEXT: vmovss {{.*}}(%rip), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] -; FMA64-NEXT: ## fixup A - offset: 4, value: LCPI4_0-4, kind: reloc_riprel_4byte +; FMA64-NEXT: vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] +; FMA64-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; FMA64-NEXT: ## xmm0 = mem[0],zero,zero,zero ; FMA64-NEXT: retq ## encoding: [0xc3] ; ; FMACALL64-LABEL: test_f32_cst: ; FMACALL64: ## %bb.0: ## %entry -; FMACALL64-NEXT: movss {{.*}}(%rip), %xmm0 ## encoding: [0xf3,0x0f,0x10,0x05,A,A,A,A] -; FMACALL64-NEXT: ## fixup A - offset: 4, value: LCPI4_0-4, kind: reloc_riprel_4byte +; FMACALL64-NEXT: movss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ## encoding: [0xf3,0x0f,0x10,0x05,A,A,A,A] +; FMACALL64-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; FMACALL64-NEXT: ## xmm0 = mem[0],zero,zero,zero ; FMACALL64-NEXT: retq ## encoding: [0xc3] ; ; AVX512-LABEL: test_f32_cst: ; AVX512: ## %bb.0: ## %entry -; AVX512-NEXT: vmovss {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] -; AVX512-NEXT: ## fixup A - offset: 4, value: LCPI4_0-4, kind: reloc_riprel_4byte +; AVX512-NEXT: vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] +; AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; AVX512-NEXT: ## xmm0 = mem[0],zero,zero,zero ; AVX512-NEXT: retq ## encoding: [0xc3] ; ; AVX512VL-LABEL: test_f32_cst: ; AVX512VL: ## %bb.0: ## %entry -; AVX512VL-NEXT: vmovss {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] -; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI4_0-4, kind: reloc_riprel_4byte +; AVX512VL-NEXT: vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] +; AVX512VL-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; AVX512VL-NEXT: ## xmm0 = mem[0],zero,zero,zero ; AVX512VL-NEXT: retq ## encoding: [0xc3] entry: @@ -642,161 +642,161 @@ ; ; FMACALL32_BDVER2-LABEL: test_v8f32: ; FMACALL32_BDVER2: ## %bb.0: ## %entry -; FMACALL32_BDVER2-NEXT: subl $316, %esp ## encoding: [0x81,0xec,0x3c,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: ## imm = 0x13C +; FMACALL32_BDVER2-NEXT: subl $284, %esp ## encoding: [0x81,0xec,0x1c,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## imm = 0x11C ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm2, %xmm3 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd3,0x01] ; FMACALL32_BDVER2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x94,0x24,0x00,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x94,0x24,0xf0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm1, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xca,0x01] ; 
FMACALL32_BDVER2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x8c,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x8c,0x24,0xd0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01] ; FMACALL32_BDVER2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0xb0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x5c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x5c,0x24,0x50] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm3, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x5c,0x24,0x08,0x02] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x50] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x40] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x40] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xb4,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xa4,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x50] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x40] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x40] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xa8,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x98,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; 
FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x9c,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x8c,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xf0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xd0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xb0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x90,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xf0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] ; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xd0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: 
[0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] ; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xb0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x84,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x74] ; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xf0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] ; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xd0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] ; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xb0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x78] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x68] ; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xf0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] ; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xd0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] ; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xb0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; 
FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x50] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x40] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x40] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x78] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x68] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x84,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x74] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x90,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x9c,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x8c,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xa8,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x98,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xb4,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xa4,0x00,0x00,0x00] +; 
FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x3c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x2c] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x38,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x28,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x1c] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x28,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x18,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x34,0x20] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x24,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x14,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x30,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x20,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x10,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] -; FMACALL32_BDVER2-NEXT: addl $316, %esp ## encoding: [0x81,0xc4,0x3c,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: ## imm = 0x13C +; FMACALL32_BDVER2-NEXT: addl $284, %esp ## encoding: [0x81,0xc4,0x1c,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## imm = 0x11C ; FMACALL32_BDVER2-NEXT: retl ## encoding: [0xc3] entry: %call = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) @@ -1663,7 +1663,7 @@ ; ; FMACALL32_BDVER2-LABEL: test_v4f64: ; FMACALL32_BDVER2: ## %bb.0: ## %entry -; FMACALL32_BDVER2-NEXT: subl $252, %esp ## encoding: [0x81,0xec,0xfc,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: subl $236, %esp ## encoding: 
[0x81,0xec,0xec,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm2, %xmm3 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd3,0x01] ; FMACALL32_BDVER2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x94,0x24,0xc0,0x00,0x00,0x00] @@ -1742,7 +1742,7 @@ ; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x18] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] -; FMACALL32_BDVER2-NEXT: addl $252, %esp ## encoding: [0x81,0xc4,0xfc,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: addl $236, %esp ## encoding: [0x81,0xc4,0xec,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: retl ## encoding: [0xc3] entry: %call = call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c) diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -364,7 +364,7 @@ ; CHECK-I686-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: shrl $31, %eax ; CHECK-I686-NEXT: fildll {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: fadds {{\.LCPI[0-9]+_[0-9]+}}(,%eax,4) +; CHECK-I686-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4) ; CHECK-I686-NEXT: fstps (%esp) ; CHECK-I686-NEXT: calll __gnu_f2h_ieee ; CHECK-I686-NEXT: movw %ax, (%esi) @@ -735,7 +735,7 @@ ; BWON-F16C-NEXT: pushq %r15 ; BWON-F16C-NEXT: pushq %r14 ; BWON-F16C-NEXT: pushq %rbx -; BWON-F16C-NEXT: subq $88, %rsp +; BWON-F16C-NEXT: subq $56, %rsp ; BWON-F16C-NEXT: movq %rdi, %rbx ; BWON-F16C-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] @@ -744,7 +744,7 @@ ; BWON-F16C-NEXT: movl %eax, %r14d ; BWON-F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; BWON-F16C-NEXT: vextractf128 $1, %ymm0, %xmm0 -; BWON-F16C-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; BWON-F16C-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; BWON-F16C-NEXT: vzeroupper ; BWON-F16C-NEXT: callq __truncdfhf2@PLT @@ -754,13 +754,13 @@ ; BWON-F16C-NEXT: vzeroupper ; BWON-F16C-NEXT: callq __truncdfhf2@PLT ; BWON-F16C-NEXT: movl %eax, %ebp -; BWON-F16C-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; BWON-F16C-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; BWON-F16C-NEXT: callq __truncdfhf2@PLT ; BWON-F16C-NEXT: movw %ax, 4(%rbx) ; BWON-F16C-NEXT: movw %bp, (%rbx) ; BWON-F16C-NEXT: movw %r15w, 6(%rbx) ; BWON-F16C-NEXT: movw %r14w, 2(%rbx) -; BWON-F16C-NEXT: addq $88, %rsp +; BWON-F16C-NEXT: addq $56, %rsp ; BWON-F16C-NEXT: popq %rbx ; BWON-F16C-NEXT: popq %r14 ; BWON-F16C-NEXT: popq %r15 diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll --- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll +++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll @@ -56,7 +56,7 @@ define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind { ; X86-LABEL: vrolq_extract_udiv: ; X86: # %bb.0: -; X86-NEXT: subl $44, %esp +; X86-NEXT: subl $32, %esp ; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: vmovss %xmm0, (%esp) @@ -77,7 +77,7 @@ ; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 ; X86-NEXT: vprolq $57, %zmm0, %zmm0 ; X86-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; X86-NEXT: addl $44, %esp +; X86-NEXT: 
addl $32, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; @@ -109,7 +109,7 @@ ; X86-NEXT: vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9] ; X86-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X86-NEXT: vprold $7, %zmm0, %zmm0 -; X86-NEXT: vpand {{\.LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; @@ -118,7 +118,7 @@ ; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9] ; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X64-NEXT: vprold $7, %zmm0, %zmm0 -; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq %lhs_mul = mul <4 x i32> %i, @@ -132,16 +132,16 @@ define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind { ; X86-LABEL: illegal_no_extract_mul: ; X86: # %bb.0: -; X86-NEXT: vpmullw {{\.LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm1 -; X86-NEXT: vpmullw {{\.LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 +; X86-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm1 +; X86-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 ; X86-NEXT: vpsrlw $10, %zmm0, %zmm0 ; X86-NEXT: vporq %zmm0, %zmm1, %zmm0 ; X86-NEXT: retl ; ; X64-LABEL: illegal_no_extract_mul: ; X64: # %bb.0: -; X64-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm1 -; X64-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm0 +; X64-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; X64-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; X64-NEXT: vpsrlw $10, %zmm0, %zmm0 ; X64-NEXT: vporq %zmm0, %zmm1, %zmm0 ; X64-NEXT: retq @@ -207,7 +207,7 @@ define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind { ; X86-LABEL: no_extract_udiv: ; X86: # %bb.0: -; X86-NEXT: subl $60, %esp +; X86-NEXT: subl $48, %esp ; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: vmovss %xmm0, (%esp) @@ -246,7 +246,7 @@ ; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload ; X86-NEXT: vpsllq $56, %xmm1, %xmm1 ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 -; X86-NEXT: addl $60, %esp +; X86-NEXT: addl $48, %esp ; X86-NEXT: retl ; ; X64-LABEL: no_extract_udiv: diff --git a/llvm/test/CodeGen/X86/sext-vsetcc.ll b/llvm/test/CodeGen/X86/sext-vsetcc.ll --- a/llvm/test/CodeGen/X86/sext-vsetcc.ll +++ b/llvm/test/CodeGen/X86/sext-vsetcc.ll @@ -158,7 +158,7 @@ ; ; AVX512-LABEL: cmp_ne_load_const_extra_use2: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $120, %rsp +; AVX512-NEXT: subq $72, %rsp ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 @@ -168,7 +168,7 @@ ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq use_v8i1@PLT ; AVX512-NEXT: vpmovsxbw (%rsp), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: addq $120, %rsp +; AVX512-NEXT: addq $72, %rsp ; AVX512-NEXT: retq %loadx = load <8 x i8>, <8 x i8>* %x %icmp = icmp ne <8 x i8> %loadx, zeroinitializer diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll --- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll @@ -321,8 +321,8 @@ define <8 x double> @stack_fold_cmppd_mask(<8 x double> %a0, <8 x double> %a1, <8 x double>* %a2, i8 %mask, <8 x double> %b0, <8 x double> %b1) { ; CHECK-LABEL: stack_fold_cmppd_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $184, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 192 +; CHECK-NEXT: subq $136, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 144 ; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill ; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -337,7 +337,7 @@ ; CHECK-NEXT: kandb %k0, %k1, %k1 ; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vblendmpd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload -; CHECK-NEXT: addq $184, %rsp +; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -354,8 +354,8 @@ define <8 x double> @stack_fold_cmppd_mask_commuted(<8 x double> %a0, <8 x double> %a1, <8 x double>* %a2, i8 %mask, <8 x double> %b0, <8 x double> %b1) { ; CHECK-LABEL: stack_fold_cmppd_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $184, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 192 +; CHECK-NEXT: subq $136, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 144 ; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -370,7 +370,7 @@ ; CHECK-NEXT: kandb %k0, %k1, %k1 ; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vblendmpd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload -; CHECK-NEXT: addq $184, %rsp +; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -406,8 +406,8 @@ define <16 x float> @stack_fold_cmpps_mask(<16 x float> %a0, <16 x float> %a1, <16 x float>* %a2, i16 %mask, <16 x float> %b0, <16 x float> %b1) { ; CHECK-LABEL: stack_fold_cmpps_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $184, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 192 +; CHECK-NEXT: subq $136, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 144 ; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -422,7 +422,7 @@ ; CHECK-NEXT: kandw %k0, %k1, %k1 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vblendmps (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload -; CHECK-NEXT: addq $184, %rsp +; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -439,8 +439,8 @@ define <16 x float> @stack_fold_cmpps_mask_commuted(<16 x float> %a0, <16 x float> %a1, <16 x float>* %a2, i16 %mask, <16 x float> %b0, <16 x float> %b1) { ; CHECK-LABEL: stack_fold_cmpps_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $184, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 192 +; CHECK-NEXT: subq $136, %rsp 
+; CHECK-NEXT: .cfi_def_cfa_offset 144 ; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -455,7 +455,7 @@ ; CHECK-NEXT: kandw %k0, %k1, %k1 ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vblendmps (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload -; CHECK-NEXT: addq $184, %rsp +; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -1198,8 +1198,8 @@ define <8 x double> @stack_fold_shuff64x2(<8 x double> %a, <8 x double> %b) { ; CHECK-LABEL: stack_fold_shuff64x2: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -1208,7 +1208,7 @@ ; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vshuff64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 = zmm0[0,1,4,5],mem[2,3,0,1] -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -1219,8 +1219,8 @@ define <8 x double> @stack_fold_shuff64x2_mask(<8 x double> %a, <8 x double> %b, i8 %mask, <8 x double>* %passthru) { ; CHECK-LABEL: stack_fold_shuff64x2_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -1232,7 +1232,7 @@ ; CHECK-NEXT: vshuff64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,4,5],mem[2,3,0,1] ; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -1247,8 +1247,8 @@ define <8 x double> @stack_fold_shuff64x2_maskz(<8 x double> %a, <8 x double> %b, i8 %mask, <8 x double>* %passthru) { ; CHECK-LABEL: stack_fold_shuff64x2_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -1258,7 +1258,7 @@ ; CHECK-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vshuff64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,4,5],mem[2,3,0,1] -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -1271,8 +1271,8 @@ define <16 x float> @stack_fold_shuff32x4_mask(<16 x float> %a, <16 x float> %b, i16 %mask, <16 x float>* %passthru) { ; CHECK-LABEL: stack_fold_shuff32x4_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -1284,7 +1284,7 @@ ; CHECK-NEXT: vshuff32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3] ; CHECK-NEXT: vmovaps %zmm1, %zmm0 -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -1299,8 +1299,8 @@ define <16 x float> @stack_fold_shuff32x4_maskz(<16 x float> %a, <16 x float> %b, i16 %mask) { ; CHECK-LABEL: stack_fold_shuff32x4_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -1310,7 +1310,7 @@ ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vshuff32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3] -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll --- a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll @@ -12,8 +12,8 @@ define <16 x i32> @stack_fold_valignd(<16 x i32> %a, <16 x i32> %b) { ; CHECK-LABEL: stack_fold_valignd: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: 
.cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -22,7 +22,7 @@ ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0] -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -33,8 +33,8 @@ define <16 x i32> @stack_fold_valignd_mask(<16 x i32> %a, <16 x i32> %b, <16 x i32>* %passthru, i16 %mask) { ; CHECK-LABEL: stack_fold_valignd_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -46,7 +46,7 @@ ; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: # zmm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -60,8 +60,8 @@ define <16 x i32> @stack_fold_valignd_maskz(<16 x i32> %a, <16 x i32> %b, i16 %mask) { ; CHECK-LABEL: stack_fold_valignd_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -71,7 +71,7 @@ ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0] -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -84,8 +84,8 @@ define <8 x i64> @stack_fold_valignq(<8 x i64> %a, <8 x i64> %b) { ; CHECK-LABEL: stack_fold_valignq: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -94,7 +94,7 @@ ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 = mem[1,2,3,4,5,6,7],zmm0[0] -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -105,8 +105,8 @@ define <8 x i64> @stack_fold_valignq_mask(<8 x i64> %a, <8 x i64> %b, <8 x i64>* %passthru, i8 %mask) { ; CHECK-LABEL: stack_fold_valignq_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -118,7 +118,7 @@ ; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: # zmm1 {%k1} = mem[1,2,3,4,5,6,7],zmm0[0] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -132,8 +132,8 @@ define <8 x i64> @stack_fold_valignq_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask) { ; CHECK-LABEL: stack_fold_valignq_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -143,7 +143,7 @@ ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7],zmm0[0] -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -490,8 +490,8 @@ define <64 x i8> @stack_fold_pabsb_mask(<64 x i8> %passthru, <64 x i8> %a0, i64 %mask) { ; CHECK-LABEL: stack_fold_pabsb_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -500,7 +500,7 
@@ ; CHECK-NEXT: kmovq %rdi, %k1 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -550,8 +550,8 @@ define <16 x i32> @stack_fold_pabsd_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) { ; CHECK-LABEL: stack_fold_pabsd_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -560,7 +560,7 @@ ; CHECK-NEXT: kmovd %edi, %k1 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -610,8 +610,8 @@ define <8 x i64> @stack_fold_pabsq_mask(<8 x i64> %passthru, <8 x i64> %a0, i8 %mask) { ; CHECK-LABEL: stack_fold_pabsq_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -620,7 +620,7 @@ ; CHECK-NEXT: kmovd %edi, %k1 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -670,8 +670,8 @@ define <32 x i16> @stack_fold_pabsw_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) { ; CHECK-LABEL: stack_fold_pabsw_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -680,7 +680,7 @@ ; CHECK-NEXT: kmovd %edi, %k1 ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: 
popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -1643,8 +1643,8 @@ define <64 x i8> @stack_fold_palignr(<64 x i8> %a0, <64 x i8> %a1) { ; CHECK-LABEL: stack_fold_palignr: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -1653,7 +1653,7 @@ ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48] -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -1664,8 +1664,8 @@ define <64 x i8> @stack_fold_palignr_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %passthru, i64 %mask) { ; CHECK-LABEL: stack_fold_palignr_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -1677,7 +1677,7 @@ ; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: # zmm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -1691,8 +1691,8 @@ define <64 x i8> @stack_fold_palignr_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; CHECK-LABEL: stack_fold_palignr_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -1702,7 +1702,7 @@ ; CHECK-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48] -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -2023,8 +2023,8 @@ define <16 x i32> @stack_fold_pcmpeqd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) { ; CHECK-LABEL: stack_fold_pcmpeqd_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $184, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 192 +; CHECK-NEXT: subq $136, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 144 ; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -2038,7 +2038,7 @@ ; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload -; CHECK-NEXT: addq $184, %rsp +; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -2055,8 +2055,8 @@ define <16 x i32> @stack_fold_pcmpeqd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) { ; CHECK-LABEL: stack_fold_pcmpeqd_mask_commuted: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $184, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 192 +; CHECK-NEXT: subq $136, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 144 ; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -2070,7 +2070,7 @@ ; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload -; CHECK-NEXT: addq $184, %rsp +; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -2087,8 +2087,8 @@ define <16 x i32> @stack_fold_pcmpled_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) { ; 
CHECK-LABEL: stack_fold_pcmpled_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $184, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 192 +; CHECK-NEXT: subq $136, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 144 ; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -2102,7 +2102,7 @@ ; CHECK-NEXT: vpcmpled {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload -; CHECK-NEXT: addq $184, %rsp +; CHECK-NEXT: addq $136, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -2119,8 +2119,8 @@ define i16 @stack_fold_pcmpleud(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) { ; CHECK-LABEL: stack_fold_pcmpleud: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -2132,7 +2132,7 @@ ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: andl %esi, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -6100,8 +6100,8 @@ define <16 x i32> @stack_fold_pshufd_zmm_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) { ; CHECK-LABEL: stack_fold_pshufd_zmm_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -6111,7 +6111,7 @@ ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -6157,8 +6157,8 @@ define <32 x i16> @stack_fold_pshufhw_zmm_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) { ; CHECK-LABEL: stack_fold_pshufhw_zmm_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -6168,7 +6168,7 @@ ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload ; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 {%k1} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28] -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -6214,8 +6214,8 @@ define <32 x i16> @stack_fold_pshuflw_zmm_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) { ; CHECK-LABEL: stack_fold_pshuflw_zmm_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -6225,7 +6225,7 @@ ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31] -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -6914,8 +6914,8 @@ define <8 x i64> @stack_fold_shufi64x2(<8 x i64> %a, <8 x i64> %b) { ; CHECK-LABEL: stack_fold_shufi64x2: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -6924,7 +6924,7 @@ ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 = zmm0[0,1,4,5],mem[2,3,0,1] -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -6935,8 +6935,8 @@ define <8 x i64> @stack_fold_shufi64x2_mask(<8 x i64> %a, <8 x i64> %b, i8 %mask, <8 x i64>* %passthru) { ; CHECK-LABEL: stack_fold_shufi64x2_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -6948,7 +6948,7 @@ ; CHECK-NEXT: vshufi64x2 $24, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,4,5],mem[2,3,0,1] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -6963,8 +6963,8 @@ define <8 x i64> @stack_fold_shufi64x2_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask, <8 x i64>* %passthru) { ; CHECK-LABEL: stack_fold_shufi64x2_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -6974,7 +6974,7 @@ ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,4,5],mem[2,3,0,1] -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -6987,8 +6987,8 @@ define <16 x i32> @stack_fold_shufi32x4_mask(<16 x i32> %a, <16 x i32> %b, i16 %mask, <16 x i32>* %passthru) { ; CHECK-LABEL: stack_fold_shufi32x4_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -7000,7 +7000,7 @@ ; CHECK-NEXT: vshufi32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3] ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -7015,8 +7015,8 @@ define <16 x i32> @stack_fold_shufi32x4_maskz(<16 x i32> %a, <16 x i32> %b, i16 %mask) { ; CHECK-LABEL: stack_fold_shufi32x4_maskz: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; CHECK-NEXT: #APP @@ -7026,7 +7026,7 @@ ; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; CHECK-NEXT: vshufi32x4 $20, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3] -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() diff --git a/llvm/test/CodeGen/X86/vec-libcalls.ll b/llvm/test/CodeGen/X86/vec-libcalls.ll --- a/llvm/test/CodeGen/X86/vec-libcalls.ll +++ b/llvm/test/CodeGen/X86/vec-libcalls.ll @@ -40,7 +40,7 @@ ; CHECK-LABEL: sin_v1f32: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq %r = call <1 x float> @llvm.sin.v1f32(<1 x float> %x) @@ -52,11 +52,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; CHECK-NEXT: addq $40, %rsp @@ -70,17 +70,17 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; CHECK-NEXT: addq $40, %rsp @@ -94,23 +94,23 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-NEXT: 
callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; CHECK-NEXT: addq $40, %rsp @@ -122,37 +122,37 @@ define <5 x float> @sin_v5f32(<5 x float> %x) nounwind { ; CHECK-LABEL: sin_v5f32: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $88, %rsp +; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-NEXT: addq $88, %rsp +; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: retq %r = call <5 x float> @llvm.sin.v5f32(<5 x float> %x) ret <5 x float> %r @@ -161,43 +161,43 @@ define <6 x float> @sin_v6f32(<6 x float> %x) nounwind { ; CHECK-LABEL: sin_v6f32: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $88, %rsp +; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; CHECK-NEXT: vmovaps %xmm0, 
(%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-NEXT: callq sinf +; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; CHECK-NEXT: addq $88, %rsp +; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: retq %r = call <6 x float> @llvm.sin.v6f32(<6 x float> %x) ret <6 x float> %r @@ -206,25 +206,25 @@ define <3 x double> @sin_v3f64(<3 x double> %x) nounwind { ; CHECK-LABEL: sin_v3f64: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $88, %rsp +; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq sin +; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq sin +; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq sin +; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-NEXT: addq $88, %rsp +; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: retq %r = call <3 x double> @llvm.sin.v3f64(<3 x double> %x) ret <3 x double> %r @@ -233,7 +233,7 @@ define <2 x float> @fabs_v2f32(<2 x float> %x) nounwind { ; CHECK-LABEL: fabs_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NEXT: retq %r = call <2 x float> @llvm.fabs.v2f32(<2 x float> %x) ret <2 x float> %r @@ -253,11 +253,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: callq cosf +; CHECK-NEXT: callq cosf@PLT ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq cosf +; CHECK-NEXT: callq cosf@PLT ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; CHECK-NEXT: addq $40, %rsp @@ -271,11 +271,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: callq expf +; CHECK-NEXT: callq expf@PLT ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq expf +; CHECK-NEXT: callq expf@PLT ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm1[0],xmm0[0],xmm1[2,3] ; CHECK-NEXT: addq $40, %rsp @@ -289,11 +289,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: callq exp2f +; CHECK-NEXT: callq exp2f@PLT ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq exp2f +; CHECK-NEXT: callq exp2f@PLT ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; CHECK-NEXT: addq $40, %rsp @@ -316,11 +316,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: callq logf +; CHECK-NEXT: callq logf@PLT ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq logf +; CHECK-NEXT: callq logf@PLT ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; CHECK-NEXT: addq $40, %rsp @@ -334,11 +334,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: callq log10f +; CHECK-NEXT: callq log10f@PLT ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq log10f +; CHECK-NEXT: callq log10f@PLT ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; CHECK-NEXT: addq $40, %rsp @@ -352,11 +352,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: callq log2f +; CHECK-NEXT: callq log2f@PLT ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq log2f +; CHECK-NEXT: callq log2f@PLT ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; CHECK-NEXT: addq $40, %rsp @@ -386,8 +386,8 @@ define <2 x float> @round_v2f32(<2 x float> %x) nounwind { ; CHECK-LABEL: round_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-NEXT: vorps {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vroundps $11, %xmm0, %xmm0 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec-strict-128.ll b/llvm/test/CodeGen/X86/vec-strict-128.ll --- a/llvm/test/CodeGen/X86/vec-strict-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-128.ll @@ -224,8 +224,8 @@ define <4 x float> @f13(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { ; SSE-X86-LABEL: f13: ; SSE-X86: # %bb.0: -; SSE-X86-NEXT: subl $108, %esp -; SSE-X86-NEXT: .cfi_def_cfa_offset 112 +; SSE-X86-NEXT: subl $100, %esp +; SSE-X86-NEXT: .cfi_def_cfa_offset 104 ; SSE-X86-NEXT: movups %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; SSE-X86-NEXT: movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; SSE-X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill @@ -282,7 +282,7 @@ ; SSE-X86-NEXT: movss {{.*#+}} xmm2 = 
mem[0],zero,zero,zero ; SSE-X86-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-X86-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-X86-NEXT: addl $108, %esp +; SSE-X86-NEXT: addl $100, %esp ; SSE-X86-NEXT: .cfi_def_cfa_offset 4 ; SSE-X86-NEXT: retl ; @@ -296,7 +296,7 @@ ; SSE-X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-X64-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; SSE-X64-NEXT: callq fmaf +; SSE-X64-NEXT: callq fmaf@PLT ; SSE-X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] @@ -304,14 +304,14 @@ ; SSE-X64-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-X64-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE-X64-NEXT: callq fmaf +; SSE-X64-NEXT: callq fmaf@PLT ; SSE-X64-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-X64-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-X64-NEXT: callq fmaf +; SSE-X64-NEXT: callq fmaf@PLT ; SSE-X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] @@ -319,7 +319,7 @@ ; SSE-X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-X64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-X64-NEXT: callq fmaf +; SSE-X64-NEXT: callq fmaf@PLT ; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-X64-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload @@ -381,7 +381,7 @@ ; SSE-X64-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-X64-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-X64-NEXT: callq fma +; SSE-X64-NEXT: callq fma@PLT ; SSE-X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-X64-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] @@ -389,7 +389,7 @@ ; SSE-X64-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-X64-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE-X64-NEXT: callq fma +; SSE-X64-NEXT: callq fma@PLT ; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-X64-NEXT: movaps %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -8,13 +8,13 @@ ; CHECK-LABEL: constrained_vector_fdiv_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: divss {{.*}}(%rip), %xmm0 +; CHECK-NEXT: divss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fdiv_v1f32: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovss {{.*#+}} xmm0 = 
mem[0],zero,zero,zero -; AVX-NEXT: vdivss {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vdivss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: %div = call <1 x float> @llvm.experimental.constrained.fdiv.v1f32( @@ -29,13 +29,13 @@ ; CHECK-LABEL: constrained_vector_fdiv_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0] -; CHECK-NEXT: divpd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: divpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fdiv_v2f64: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0] -; AVX-NEXT: vdivpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vdivpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: %div = call <2 x double> @llvm.experimental.constrained.fdiv.v2f64( @@ -85,9 +85,9 @@ ; CHECK-LABEL: constrained_vector_fdiv_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0] -; CHECK-NEXT: divpd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: divpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: divsd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: divsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movapd %xmm0, %xmm1 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] @@ -98,9 +98,9 @@ ; AVX-LABEL: constrained_vector_fdiv_v3f64: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vdivsd {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vdivsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+0,2.0E+0] -; AVX-NEXT: vdivpd {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vdivpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: retq entry: @@ -125,7 +125,7 @@ ; AVX1-LABEL: constrained_vector_fdiv_v4f64: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] -; AVX1-NEXT: vdivpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vdivpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_fdiv_v4f64: @@ -304,8 +304,8 @@ ; ; AVX-LABEL: constrained_vector_frem_v3f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmod@PLT @@ -322,7 +322,7 @@ ; AVX-NEXT: callq fmod@PLT ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp +; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq entry: @@ -405,13 +405,13 @@ ; CHECK-LABEL: constrained_vector_fmul_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: mulss {{.*}}(%rip), %xmm0 +; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fmul_v1f32: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: %mul = call <1 x float> @llvm.experimental.constrained.fmul.v1f32( @@ -426,13 +426,13 @@ ; CHECK-LABEL: constrained_vector_fmul_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] -; 
CHECK-NEXT: mulpd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fmul_v2f64: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] -; AVX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: %mul = call <2 x double> @llvm.experimental.constrained.fmul.v2f64( @@ -451,7 +451,7 @@ ; CHECK-NEXT: mulss %xmm1, %xmm2 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: mulss %xmm1, %xmm0 -; CHECK-NEXT: mulss {{.*}}(%rip), %xmm1 +; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; CHECK-NEXT: retq @@ -459,9 +459,9 @@ ; AVX-LABEL: constrained_vector_fmul_v3f32: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm2 -; AVX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX-NEXT: retq @@ -479,9 +479,9 @@ ; CHECK-LABEL: constrained_vector_fmul_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] -; CHECK-NEXT: mulpd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: mulsd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movapd %xmm0, %xmm1 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] @@ -492,9 +492,9 @@ ; AVX-LABEL: constrained_vector_fmul_v3f64: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmulsd {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308] -; AVX-NEXT: vmulpd {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: retq entry: @@ -513,19 +513,19 @@ ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] ; CHECK-NEXT: movapd {{.*#+}} xmm1 = [4.0E+0,5.0E+0] ; CHECK-NEXT: mulpd %xmm0, %xmm1 -; CHECK-NEXT: mulpd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fmul_v4f64: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] -; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_fmul_v4f64: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] -; AVX512-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%ymm0, %ymm0 ; AVX512-NEXT: retq entry: %mul = call <4 x double> @llvm.experimental.constrained.fmul.v4f64( @@ -542,13 +542,13 @@ ; CHECK-LABEL: constrained_vector_fadd_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: addss {{.*}}(%rip), %xmm0 +; CHECK-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fadd_v1f32: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: %add = call <1 x float> @llvm.experimental.constrained.fadd.v1f32( @@ -563,13 +563,13 @@ ; CHECK-LABEL: constrained_vector_fadd_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] -; CHECK-NEXT: addpd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fadd_v2f64: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] -; AVX-NEXT: vaddpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: %add = call <2 x double> @llvm.experimental.constrained.fadd.v2f64( @@ -588,7 +588,7 @@ ; CHECK-NEXT: addss %xmm2, %xmm1 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: addss %xmm2, %xmm0 -; CHECK-NEXT: addss {{.*}}(%rip), %xmm2 +; CHECK-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq @@ -598,8 +598,8 @@ ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm2 -; AVX-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX-NEXT: retq @@ -617,9 +617,9 @@ ; CHECK-LABEL: constrained_vector_fadd_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] -; CHECK-NEXT: addpd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: xorpd %xmm1, %xmm1 -; CHECK-NEXT: addsd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: addsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movapd %xmm0, %xmm1 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] @@ -630,9 +630,9 @@ ; AVX-LABEL: constrained_vector_fadd_v3f64: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308] -; AVX-NEXT: vaddpd {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: retq entry: @@ -651,19 +651,19 @@ ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] ; CHECK-NEXT: movapd {{.*#+}} xmm1 = [2.0E+0,2.0000000000000001E-1] ; CHECK-NEXT: addpd %xmm0, %xmm1 -; 
CHECK-NEXT: addpd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fadd_v4f64: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] -; AVX1-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_fadd_v4f64: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] -; AVX512-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq entry: %add = call <4 x double> @llvm.experimental.constrained.fadd.v4f64( @@ -680,13 +680,13 @@ ; CHECK-LABEL: constrained_vector_fsub_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: subss {{.*}}(%rip), %xmm0 +; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fsub_v1f32: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vsubss {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vsubss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: %sub = call <1 x float> @llvm.experimental.constrained.fsub.v1f32( @@ -701,13 +701,13 @@ ; CHECK-LABEL: constrained_vector_fsub_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308] -; CHECK-NEXT: subpd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fsub_v2f64: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308] -; AVX-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: %sub = call <2 x double> @llvm.experimental.constrained.fsub.v2f64( @@ -726,8 +726,8 @@ ; CHECK-NEXT: movaps %xmm1, %xmm2 ; CHECK-NEXT: subss %xmm0, %xmm2 ; CHECK-NEXT: movaps %xmm1, %xmm0 -; CHECK-NEXT: subss {{.*}}(%rip), %xmm0 -; CHECK-NEXT: subss {{.*}}(%rip), %xmm1 +; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; CHECK-NEXT: retq @@ -737,8 +737,8 @@ ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vsubss {{.*}}(%rip), %xmm1, %xmm2 -; AVX-NEXT: vsubss {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vsubss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX-NEXT: vsubss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX-NEXT: retq @@ -759,7 +759,7 @@ ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: subsd %xmm0, %xmm1 ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308] -; CHECK-NEXT: subpd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movapd %xmm0, %xmm1 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = 
xmm1[1],xmm0[1] @@ -773,7 +773,7 @@ ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [-1.7976931348623157E+308,-1.7976931348623157E+308] -; AVX-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: retq entry: @@ -791,20 +791,20 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308] ; CHECK-NEXT: movapd %xmm0, %xmm1 -; CHECK-NEXT: subpd {{.*}}(%rip), %xmm1 -; CHECK-NEXT: subpd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fsub_v4f64: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308] -; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_fsub_v4f64: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308] -; AVX512-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq entry: %sub = call <4 x double> @llvm.experimental.constrained.fsub.v4f64( @@ -840,12 +840,12 @@ define <2 x double> @constrained_vector_sqrt_v2f64() #0 { ; CHECK-LABEL: constrained_vector_sqrt_v2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: sqrtpd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: sqrtpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_sqrt_v2f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vsqrtpd {{.*}}(%rip), %xmm0 +; AVX-NEXT: vsqrtpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AVX-NEXT: retq entry: %sqrt = call <2 x double> @llvm.experimental.constrained.sqrt.v2f64( @@ -892,7 +892,7 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: sqrtsd %xmm0, %xmm1 -; CHECK-NEXT: sqrtpd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: sqrtpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movapd %xmm0, %xmm1 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] @@ -904,7 +904,7 @@ ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vsqrtpd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vsqrtpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: retq entry: @@ -918,13 +918,13 @@ define <4 x double> @constrained_vector_sqrt_v4f64() #0 { ; CHECK-LABEL: constrained_vector_sqrt_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: sqrtpd {{.*}}(%rip), %xmm1 -; CHECK-NEXT: sqrtpd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: sqrtpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: sqrtpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_sqrt_v4f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vsqrtpd {{.*}}(%rip), %ymm0 +; AVX-NEXT: vsqrtpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX-NEXT: retq entry: %sqrt = call <4 x double> @llvm.experimental.constrained.sqrt.v4f64( @@ -1094,8 +1094,8 @@ ; ; AVX-LABEL: constrained_vector_pow_v3f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: 
.cfi_def_cfa_offset 64 +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq pow@PLT @@ -1112,7 +1112,7 @@ ; AVX-NEXT: callq pow@PLT ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp +; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq entry: @@ -1351,8 +1351,8 @@ ; ; AVX-LABEL: constrained_vector_powi_v3f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powidf2@PLT @@ -1369,7 +1369,7 @@ ; AVX-NEXT: callq __powidf2@PLT ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp +; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq entry: @@ -1589,8 +1589,8 @@ ; ; AVX-LABEL: constrained_vector_sin_v3f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq sin@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -1604,7 +1604,7 @@ ; AVX-NEXT: callq sin@PLT ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp +; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq entry: @@ -1814,8 +1814,8 @@ ; ; AVX-LABEL: constrained_vector_cos_v3f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq cos@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -1829,7 +1829,7 @@ ; AVX-NEXT: callq cos@PLT ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp +; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq entry: @@ -2039,8 +2039,8 @@ ; ; AVX-LABEL: constrained_vector_exp_v3f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq exp@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2054,7 +2054,7 @@ ; AVX-NEXT: callq exp@PLT ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp +; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq entry: @@ -2264,8 +2264,8 @@ ; ; AVX-LABEL: constrained_vector_exp2_v3f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq exp2@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2279,7 +2279,7 @@ ; AVX-NEXT: callq exp2@PLT ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp +; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq entry: @@ -2489,8 +2489,8 @@ ; ; AVX-LABEL: constrained_vector_log_v3f64: ; AVX: 
# %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq log@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2504,7 +2504,7 @@ ; AVX-NEXT: callq log@PLT ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp +; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq entry: @@ -2714,8 +2714,8 @@ ; ; AVX-LABEL: constrained_vector_log10_v3f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq log10@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2729,7 +2729,7 @@ ; AVX-NEXT: callq log10@PLT ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp +; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq entry: @@ -2939,8 +2939,8 @@ ; ; AVX-LABEL: constrained_vector_log2_v3f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq log2@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2954,7 +2954,7 @@ ; AVX-NEXT: callq log2@PLT ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp +; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq entry: @@ -3065,7 +3065,7 @@ ; ; AVX-LABEL: constrained_vector_rint_v2f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vroundpd $4, {{.*}}(%rip), %xmm0 +; AVX-NEXT: vroundpd $4, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AVX-NEXT: retq entry: %rint = call <2 x double> @llvm.experimental.constrained.rint.v2f64( @@ -3144,7 +3144,7 @@ ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vroundsd $4, %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vroundpd $4, {{.*}}(%rip), %xmm1 +; AVX-NEXT: vroundpd $4, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: retq entry: @@ -3183,7 +3183,7 @@ ; ; AVX-LABEL: constrained_vector_rint_v4f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vroundpd $4, {{.*}}(%rip), %ymm0 +; AVX-NEXT: vroundpd $4, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX-NEXT: retq entry: %rint = call <4 x double> @llvm.experimental.constrained.rint.v4f64( @@ -3236,7 +3236,7 @@ ; ; AVX-LABEL: constrained_vector_nearbyint_v2f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vroundpd $12, {{.*}}(%rip), %xmm0 +; AVX-NEXT: vroundpd $12, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AVX-NEXT: retq entry: %nearby = call <2 x double> @llvm.experimental.constrained.nearbyint.v2f64( @@ -3315,7 +3315,7 @@ ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vroundsd $12, %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vroundpd $12, {{.*}}(%rip), %xmm1 +; AVX-NEXT: vroundpd $12, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: retq entry: @@ -3354,7 +3354,7 @@ ; ; AVX-LABEL: constrained_vector_nearbyint_v4f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vroundpd $12, {{.*}}(%rip), %ymm0 +; AVX-NEXT: vroundpd $12, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX-NEXT: retq entry: %nearby = call <4 x 
double> @llvm.experimental.constrained.nearbyint.v4f64( @@ -3520,8 +3520,8 @@ ; ; AVX-LABEL: constrained_vector_max_v3f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmax@PLT @@ -3538,7 +3538,7 @@ ; AVX-NEXT: callq fmax@PLT ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp +; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq entry: @@ -3771,8 +3771,8 @@ ; ; AVX-LABEL: constrained_vector_min_v3f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: callq fmin@PLT @@ -3789,7 +3789,7 @@ ; AVX-NEXT: callq fmin@PLT ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp +; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq entry: @@ -3870,12 +3870,12 @@ define <1 x i32> @constrained_vector_fptosi_v1i32_v1f32() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v1i32_v1f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %eax +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fptosi_v1i32_v1f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvttss2si {{.*}}(%rip), %eax +; AVX-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; AVX-NEXT: retq entry: %result = call <1 x i32> @llvm.experimental.constrained.fptosi.v1i32.v1f32( @@ -3887,12 +3887,12 @@ define <2 x i32> @constrained_vector_fptosi_v2i32_v2f32() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v2i32_v2f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttps2dq {{.*}}(%rip), %xmm0 +; CHECK-NEXT: cvttps2dq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fptosi_v2i32_v2f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvttps2dq {{.*}}(%rip), %xmm0 +; AVX-NEXT: vcvttps2dq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AVX-NEXT: retq entry: %result = call <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f32( @@ -3904,23 +3904,23 @@ define <3 x i32> @constrained_vector_fptosi_v3i32_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v3i32_v3f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %eax +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %eax +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %eax +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fptosi_v3i32_v3f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvttss2si {{.*}}(%rip), %eax +; AVX-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vcvttss2si {{.*}}(%rip), %eax +; AVX-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX-NEXT: vcvttss2si {{.*}}(%rip), %eax +; 
AVX-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -3934,12 +3934,12 @@ define <4 x i32> @constrained_vector_fptosi_v4i32_v4f32() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v4i32_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttps2dq {{.*}}(%rip), %xmm0 +; CHECK-NEXT: cvttps2dq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fptosi_v4i32_v4f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvttps2dq {{.*}}(%rip), %xmm0 +; AVX-NEXT: vcvttps2dq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AVX-NEXT: retq entry: %result = call <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f32( @@ -3952,12 +3952,12 @@ define <1 x i64> @constrained_vector_fptosi_v1i64_v1f32() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v1i64_v1f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fptosi_v1i64_v1f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX-NEXT: retq entry: %result = call <1 x i64> @llvm.experimental.constrained.fptosi.v1i64.v1f32( @@ -3969,34 +3969,34 @@ define <2 x i64> @constrained_vector_fptosi_v2i64_v2f32() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v2i64_v2f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movq %rax, %xmm1 -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptosi_v2i64_v2f32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: retq ; ; AVX512F-LABEL: constrained_vector_fptosi_v2i64_v2f32: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: constrained_vector_fptosi_v2i64_v2f32: ; AVX512DQ: # %bb.0: # %entry -; AVX512DQ-NEXT: vcvttps2qq {{.*}}(%rip), %zmm0 +; AVX512DQ-NEXT: vcvttps2qq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -4010,31 +4010,31 @@ define <3 x i64> @constrained_vector_fptosi_v3i64_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v3i64_v3f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rcx -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rdx -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: retq ; ; AVX1-LABEL: 
constrained_vector_fptosi_v3i64_v3f32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_fptosi_v3i64_v3f32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX512-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX512-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: vmovq %rax, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX512-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: vmovq %rax, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq @@ -4049,28 +4049,28 @@ define <4 x i64> @constrained_vector_fptosi_v4i64_v4f32() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v4i64_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movq %rax, %xmm1 -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movq %rax, %xmm2 -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movq %rax, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptosi_v4i64_v4f32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm2 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -4078,14 +4078,14 @@ ; ; AVX512F-LABEL: constrained_vector_fptosi_v4i64_v4f32: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vcvttss2si {{.*}}(%rip), %rax +; 
AVX512F-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm2 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -4108,12 +4108,12 @@ define <1 x i32> @constrained_vector_fptosi_v1i32_v1f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v1i32_v1f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fptosi_v1i32_v1f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvttsd2si {{.*}}(%rip), %eax +; AVX-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; AVX-NEXT: retq entry: %result = call <1 x i32> @llvm.experimental.constrained.fptosi.v1i32.v1f64( @@ -4126,12 +4126,12 @@ define <2 x i32> @constrained_vector_fptosi_v2i32_v2f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v2i32_v2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttpd2dq {{.*}}(%rip), %xmm0 +; CHECK-NEXT: cvttpd2dq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fptosi_v2i32_v2f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvttpd2dqx {{.*}}(%rip), %xmm0 +; AVX-NEXT: vcvttpd2dqx {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AVX-NEXT: retq entry: %result = call <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f64( @@ -4143,23 +4143,23 @@ define <3 x i32> @constrained_vector_fptosi_v3i32_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v3i32_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fptosi_v3i32_v3f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvttsd2si {{.*}}(%rip), %eax +; AVX-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vcvttsd2si {{.*}}(%rip), %eax +; AVX-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX-NEXT: vcvttsd2si {{.*}}(%rip), %eax +; AVX-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -4173,14 +4173,14 @@ define <4 x i32> @constrained_vector_fptosi_v4i32_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v4i32_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttpd2dq {{.*}}(%rip), %xmm1 -; CHECK-NEXT: cvttpd2dq {{.*}}(%rip), %xmm0 +; CHECK-NEXT: cvttpd2dq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: cvttpd2dq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fptosi_v4i32_v4f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvttpd2dqy {{.*}}(%rip), %xmm0 +; AVX-NEXT: vcvttpd2dqy {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AVX-NEXT: retq entry: %result = call <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f64( @@ -4193,12 +4193,12 @@ define <1 x i64> @constrained_vector_fptosi_v1i64_v1f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v1i64_v1f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: 
cvttsd2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fptosi_v1i64_v1f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX-NEXT: retq entry: %result = call <1 x i64> @llvm.experimental.constrained.fptosi.v1i64.v1f64( @@ -4210,27 +4210,27 @@ define <2 x i64> @constrained_vector_fptosi_v2i64_v2f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v2i64_v2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movq %rax, %xmm1 -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptosi_v2i64_v2f64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: retq ; ; AVX512F-LABEL: constrained_vector_fptosi_v2i64_v2f64: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512F-NEXT: retq @@ -4252,31 +4252,31 @@ define <3 x i64> @constrained_vector_fptosi_v3i64_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v3i64_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rcx -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rdx -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptosi_v3i64_v3f64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_fptosi_v3i64_v3f64: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX512-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX512-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: vmovq %rax, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX512-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: vmovq %rax, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; 
AVX512-NEXT: retq @@ -4291,28 +4291,28 @@ define <4 x i64> @constrained_vector_fptosi_v4i64_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v4i64_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movq %rax, %xmm1 -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movq %rax, %xmm0 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movq %rax, %xmm2 -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movq %rax, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptosi_v4i64_v4f64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm2 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -4320,14 +4320,14 @@ ; ; AVX512F-LABEL: constrained_vector_fptosi_v4i64_v4f64: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm2 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -4350,19 +4350,19 @@ define <1 x i32> @constrained_vector_fptoui_v1i32_v1f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v1i32_v1f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptoui_v1i32_v1f32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: # kill: def $eax killed $eax killed $rax ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_fptoui_v1i32_v1f32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %eax +; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; AVX512-NEXT: retq entry: %result = call <1 x i32> @llvm.experimental.constrained.fptoui.v1i32.v1f32( @@ -4374,17 +4374,17 @@ define <2 x i32> @constrained_vector_fptoui_v2i32_v2f32() #0 { ; CHECK-LABEL: 
constrained_vector_fptoui_v2i32_v2f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptoui_v2i32_v2f32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax -; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rcx +; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx ; AVX1-NEXT: vmovd %ecx, %xmm0 ; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -4406,33 +4406,33 @@ define <3 x i32> @constrained_vector_fptoui_v3i32_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v3i32_v3f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptoui_v3i32_v3f32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax -; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rcx +; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx ; AVX1-NEXT: vmovd %ecx, %xmm0 ; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_fptoui_v3i32_v3f32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %eax +; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %eax +; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %eax +; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; AVX512-NEXT: retq entry: @@ -4451,7 +4451,7 @@ ; CHECK-NEXT: movaps %xmm1, %xmm2 ; CHECK-NEXT: cmpltps %xmm0, %xmm2 ; CHECK-NEXT: movaps %xmm2, %xmm3 -; CHECK-NEXT: andnps {{.*}}(%rip), %xmm3 +; CHECK-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; CHECK-NEXT: andnps %xmm0, %xmm2 ; CHECK-NEXT: subps %xmm2, %xmm1 ; CHECK-NEXT: cvttps2dq %xmm1, %xmm0 @@ -4526,7 +4526,7 @@ ; ; AVX512-LABEL: constrained_vector_fptoui_v1i64_v1f32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %rax +; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: retq entry: %result = call <1 x i64> @llvm.experimental.constrained.fptoui.v1i64.v1f32( @@ -4606,16 +4606,16 @@ ; ; AVX512F-LABEL: constrained_vector_fptoui_v2i64_v2f32: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vcvttss2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq 
%rax, %xmm0 -; AVX512F-NEXT: vcvttss2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: constrained_vector_fptoui_v2i64_v2f32: ; AVX512DQ: # %bb.0: # %entry -; AVX512DQ-NEXT: vcvttps2uqq {{.*}}(%rip), %zmm0 +; AVX512DQ-NEXT: vcvttps2uqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -4722,12 +4722,12 @@ ; ; AVX512-LABEL: constrained_vector_fptoui_v3i64_v3f32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %rax +; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %rax +; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: vmovq %rax, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %rax +; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: vmovq %rax, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq @@ -4869,14 +4869,14 @@ ; ; AVX512F-LABEL: constrained_vector_fptoui_v4i64_v4f32: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vcvttss2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vcvttss2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: vcvttss2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vcvttss2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm2 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -4899,19 +4899,19 @@ define <1 x i32> @constrained_vector_fptoui_v1i32_v1f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v1i32_v1f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptoui_v1i32_v1f64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: # kill: def $eax killed $eax killed $rax ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_fptoui_v1i32_v1f64: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %eax +; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; AVX512-NEXT: retq entry: %result = call <1 x i32> @llvm.experimental.constrained.fptoui.v1i32.v1f64( @@ -4923,17 +4923,17 @@ define <2 x i32> @constrained_vector_fptoui_v2i32_v2f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v2i32_v2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: retq ; ; 
AVX1-LABEL: constrained_vector_fptoui_v2i32_v2f64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax -; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rcx +; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx ; AVX1-NEXT: vmovd %ecx, %xmm0 ; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -4955,33 +4955,33 @@ define <3 x i32> @constrained_vector_fptoui_v3i32_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v3i32_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_fptoui_v3i32_v3f64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax -; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rcx +; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx ; AVX1-NEXT: vmovd %ecx, %xmm0 ; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_fptoui_v3i32_v3f64: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %eax +; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %eax +; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %eax +; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; AVX512-NEXT: retq entry: @@ -4995,14 +4995,14 @@ define <4 x i32> @constrained_vector_fptoui_v4i32_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v4i32_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movd %eax, %xmm2 -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -5080,7 +5080,7 @@ ; ; AVX512-LABEL: constrained_vector_fptoui_v1i64_v1f64: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %rax +; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: retq entry: %result = call <1 x i64> @llvm.experimental.constrained.fptoui.v1i64.v1f64( @@ -5160,9 +5160,9 @@ ; ; AVX512F-LABEL: constrained_vector_fptoui_v2i64_v2f64: ; AVX512F: # 
%bb.0: # %entry -; AVX512F-NEXT: vcvttsd2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vcvttsd2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512F-NEXT: retq @@ -5277,12 +5277,12 @@ ; ; AVX512-LABEL: constrained_vector_fptoui_v3i64_v3f64: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %rax +; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %rax +; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: vmovq %rax, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %rax +; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: vmovq %rax, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq @@ -5424,14 +5424,14 @@ ; ; AVX512F-LABEL: constrained_vector_fptoui_v4i64_v4f64: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vcvttsd2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vcvttsd2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: vcvttsd2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vcvttsd2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512F-NEXT: vmovq %rax, %xmm2 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 @@ -5475,12 +5475,12 @@ define <2 x float> @constrained_vector_fptrunc_v2f64() #0 { ; CHECK-LABEL: constrained_vector_fptrunc_v2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvtpd2ps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: cvtpd2ps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fptrunc_v2f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtpd2psx {{.*}}(%rip), %xmm0 +; AVX-NEXT: vcvtpd2psx {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AVX-NEXT: retq entry: %result = call <2 x float> @llvm.experimental.constrained.fptrunc.v2f32.v2f64( @@ -5526,14 +5526,14 @@ define <4 x float> @constrained_vector_fptrunc_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fptrunc_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvtpd2ps {{.*}}(%rip), %xmm1 -; CHECK-NEXT: cvtpd2ps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: cvtpd2ps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: cvtpd2ps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fptrunc_v4f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtpd2psy {{.*}}(%rip), %xmm0 +; AVX-NEXT: vcvtpd2psy {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AVX-NEXT: retq entry: %result = call <4 x float> @llvm.experimental.constrained.fptrunc.v4f32.v4f64( @@ -5566,12 +5566,12 @@ define <2 x double> @constrained_vector_fpext_v2f32() #0 { ; CHECK-LABEL: constrained_vector_fpext_v2f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvtps2pd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: cvtps2pd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: 
constrained_vector_fpext_v2f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtps2pd {{.*}}(%rip), %xmm0 +; AVX-NEXT: vcvtps2pd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AVX-NEXT: retq entry: %result = call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32( @@ -5616,13 +5616,13 @@ define <4 x double> @constrained_vector_fpext_v4f32() #0 { ; CHECK-LABEL: constrained_vector_fpext_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvtps2pd {{.*}}(%rip), %xmm1 -; CHECK-NEXT: cvtps2pd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: cvtps2pd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: cvtps2pd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fpext_v4f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtps2pd {{.*}}(%rip), %ymm0 +; AVX-NEXT: vcvtps2pd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX-NEXT: retq entry: %result = call <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f32( @@ -5673,7 +5673,7 @@ ; ; AVX-LABEL: constrained_vector_ceil_v2f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vroundpd $10, {{.*}}(%rip), %xmm0 +; AVX-NEXT: vroundpd $10, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AVX-NEXT: retq entry: %ceil = call <2 x double> @llvm.experimental.constrained.ceil.v2f64( @@ -5750,7 +5750,7 @@ ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vroundpd $10, {{.*}}(%rip), %xmm1 +; AVX-NEXT: vroundpd $10, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: retq entry: @@ -5802,7 +5802,7 @@ ; ; AVX-LABEL: constrained_vector_floor_v2f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vroundpd $9, {{.*}}(%rip), %xmm0 +; AVX-NEXT: vroundpd $9, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AVX-NEXT: retq entry: %floor = call <2 x double> @llvm.experimental.constrained.floor.v2f64( @@ -5879,7 +5879,7 @@ ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vroundpd $9, {{.*}}(%rip), %xmm1 +; AVX-NEXT: vroundpd $9, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: retq entry: @@ -6028,8 +6028,8 @@ ; ; AVX-LABEL: constrained_vector_round_v3f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $56, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 64 +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: callq round@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -6043,7 +6043,7 @@ ; AVX-NEXT: callq round@PLT ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: addq $56, %rsp +; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq entry: @@ -6094,7 +6094,7 @@ ; ; AVX-LABEL: constrained_vector_trunc_v2f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vroundpd $11, {{.*}}(%rip), %xmm0 +; AVX-NEXT: vroundpd $11, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AVX-NEXT: retq entry: %trunc = call <2 x double> @llvm.experimental.constrained.trunc.v2f64( @@ -6171,7 +6171,7 @@ ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vroundpd $11, {{.*}}(%rip), %xmm1 +; AVX-NEXT: vroundpd $11, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: retq entry: @@ -7045,7 +7045,7 @@ ; ; AVX1-LABEL: constrained_vector_uitofp_v2f32_v2i64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, 
%xmm1 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm2 ; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1 @@ -7442,10 +7442,10 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] ; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: por {{.*}}(%rip), %xmm1 +; CHECK-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: psrld $16, %xmm0 -; CHECK-NEXT: por {{.*}}(%rip), %xmm0 -; CHECK-NEXT: subps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: addps %xmm1, %xmm0 ; CHECK-NEXT: retq ; @@ -7454,7 +7454,7 @@ ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] -; AVX1-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -7565,7 +7565,7 @@ ; AVX1-NEXT: vcvtsi2sd %rax, %xmm5, %xmm0 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -7670,7 +7670,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm3 +; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 ; AVX1-NEXT: vorpd %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -658,11 +658,11 @@ ; ALL: # %bb.0: ; ALL-NEXT: subq $40, %rsp ; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, (%rsp) ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) ; ALL-NEXT: vmovaps (%rsp), %xmm0 ; ALL-NEXT: addq $40, %rsp @@ -675,28 +675,28 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind { ; ALL-LABEL: cvt_4f64_to_4i16: ; ALL: # %bb.0: -; ALL-NEXT: subq $88, %rsp +; ALL-NEXT: subq $72, %rsp ; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) ; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, (%rsp) ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw 
%ax, {{[0-9]+}}(%rsp) ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) ; ALL-NEXT: vmovaps (%rsp), %xmm0 -; ALL-NEXT: addq $88, %rsp +; ALL-NEXT: addq $72, %rsp ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> @@ -706,28 +706,28 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { ; ALL-LABEL: cvt_4f64_to_8i16_undef: ; ALL: # %bb.0: -; ALL-NEXT: subq $88, %rsp +; ALL-NEXT: subq $72, %rsp ; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) ; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, (%rsp) ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) ; ALL-NEXT: vmovaps (%rsp), %xmm0 -; ALL-NEXT: addq $88, %rsp +; ALL-NEXT: addq $72, %rsp ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> @@ -738,28 +738,28 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { ; ALL-LABEL: cvt_4f64_to_8i16_zero: ; ALL: # %bb.0: -; ALL-NEXT: subq $88, %rsp +; ALL-NEXT: subq $72, %rsp ; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) ; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, (%rsp) ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: addq $88, %rsp +; ALL-NEXT: addq $72, %rsp ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> @@ -778,13 +778,13 @@ ; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %ebx ; AVX1-NEXT: shll $16, %ebx ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 
32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movzwl %ax, %r15d ; AVX1-NEXT: orl %ebx, %r15d ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -792,24 +792,24 @@ ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %ebx ; AVX1-NEXT: shll $16, %ebx ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movzwl %ax, %r14d ; AVX1-NEXT: orl %ebx, %r14d ; AVX1-NEXT: shlq $32, %r14 ; AVX1-NEXT: orq %r15, %r14 ; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = mem[1,0] -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %ebx ; AVX1-NEXT: shll $16, %ebx ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movzwl %ax, %r15d ; AVX1-NEXT: orl %ebx, %r15d ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload @@ -817,11 +817,11 @@ ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %ebx ; AVX1-NEXT: shll $16, %ebx ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movzwl %ax, %eax ; AVX1-NEXT: orl %ebx, %eax ; AVX1-NEXT: shlq $32, %rax @@ -845,13 +845,13 @@ ; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %ebx ; AVX2-NEXT: shll $16, %ebx ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movzwl %ax, %r15d ; AVX2-NEXT: orl %ebx, %r15d ; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -859,24 +859,24 @@ ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %ebx ; AVX2-NEXT: shll $16, %ebx ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movzwl %ax, %r14d ; AVX2-NEXT: orl %ebx, %r14d ; AVX2-NEXT: shlq $32, %r14 ; AVX2-NEXT: orq %r15, %r14 ; AVX2-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = mem[1,0] -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %ebx ; AVX2-NEXT: shll $16, %ebx ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movzwl %ax, %r15d ; AVX2-NEXT: orl %ebx, %r15d ; AVX2-NEXT: 
vmovupd (%rsp), %ymm0 # 32-byte Reload @@ -884,11 +884,11 @@ ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %ebx ; AVX2-NEXT: shll $16, %ebx ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movzwl %ax, %eax ; AVX2-NEXT: orl %ebx, %eax ; AVX2-NEXT: shlq $32, %rax @@ -907,17 +907,17 @@ ; AVX512-NEXT: pushq %r15 ; AVX512-NEXT: pushq %r14 ; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $96, %rsp +; AVX512-NEXT: subq $80, %rsp ; AVX512-NEXT: vmovupd %zmm0, (%rsp) # 64-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %ebx ; AVX512-NEXT: shll $16, %ebx ; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movzwl %ax, %r15d ; AVX512-NEXT: orl %ebx, %r15d ; AVX512-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload @@ -925,11 +925,11 @@ ; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %ebx ; AVX512-NEXT: shll $16, %ebx ; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movzwl %ax, %r14d ; AVX512-NEXT: orl %ebx, %r14d ; AVX512-NEXT: shlq $32, %r14 @@ -939,13 +939,13 @@ ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %ebx ; AVX512-NEXT: shll $16, %ebx ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movzwl %ax, %r15d ; AVX512-NEXT: orl %ebx, %r15d ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload @@ -953,11 +953,11 @@ ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %ebx ; AVX512-NEXT: shll $16, %ebx ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movzwl %ax, %eax ; AVX512-NEXT: orl %ebx, %eax ; AVX512-NEXT: shlq $32, %rax @@ -965,7 +965,7 @@ ; AVX512-NEXT: vmovq %rax, %xmm0 ; AVX512-NEXT: vmovq %r14, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: addq $96, %rsp +; AVX512-NEXT: addq $80, %rsp ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r14 ; AVX512-NEXT: popq %r15 @@ -984,7 +984,7 @@ ; ALL: # %bb.0: ; ALL-NEXT: pushq %rbx ; ALL-NEXT: movq %rdi, %rbx -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, (%rbx) ; ALL-NEXT: popq %rbx ; ALL-NEXT: retq @@ -1003,10 +1003,10 @@ ; ALL-NEXT: movq %rdi, %rbx ; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill 
; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movl %eax, %ebp ; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, (%rbx) ; ALL-NEXT: movw %bp, 2(%rbx) ; ALL-NEXT: addq $24, %rsp @@ -1026,32 +1026,32 @@ ; AVX1-NEXT: pushq %r15 ; AVX1-NEXT: pushq %r14 ; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: subq $88, %rsp +; AVX1-NEXT: subq $56, %rsp ; AVX1-NEXT: movq %rdi, %rbx ; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %r14d ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %r15d ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movw %ax, 4(%rbx) ; AVX1-NEXT: movw %bp, (%rbx) ; AVX1-NEXT: movw %r15w, 6(%rbx) ; AVX1-NEXT: movw %r14w, 2(%rbx) -; AVX1-NEXT: addq $88, %rsp +; AVX1-NEXT: addq $56, %rsp ; AVX1-NEXT: popq %rbx ; AVX1-NEXT: popq %r14 ; AVX1-NEXT: popq %r15 @@ -1064,32 +1064,32 @@ ; AVX2-NEXT: pushq %r15 ; AVX2-NEXT: pushq %r14 ; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $88, %rsp +; AVX2-NEXT: subq $56, %rsp ; AVX2-NEXT: movq %rdi, %rbx ; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %r14d ; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %r15d ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movw %ax, 4(%rbx) ; AVX2-NEXT: movw %bp, (%rbx) ; AVX2-NEXT: movw %r15w, 6(%rbx) ; AVX2-NEXT: movw %r14w, 2(%rbx) -; AVX2-NEXT: addq $88, %rsp +; AVX2-NEXT: addq $56, %rsp ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: popq %r14 ; AVX2-NEXT: popq %r15 @@ -1102,32 +1102,32 @@ ; AVX512-NEXT: pushq %r15 ; AVX512-NEXT: pushq %r14 ; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $88, %rsp +; 
AVX512-NEXT: subq $56, %rsp ; AVX512-NEXT: movq %rdi, %rbx ; AVX512-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %r14d ; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %r15d ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movw %ax, 4(%rbx) ; AVX512-NEXT: movw %bp, (%rbx) ; AVX512-NEXT: movw %r15w, 6(%rbx) ; AVX512-NEXT: movw %r14w, 2(%rbx) -; AVX512-NEXT: addq $88, %rsp +; AVX512-NEXT: addq $56, %rsp ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r14 ; AVX512-NEXT: popq %r15 @@ -1143,30 +1143,30 @@ ; ALL-LABEL: store_cvt_4f64_to_8i16_undef: ; ALL: # %bb.0: ; ALL-NEXT: pushq %rbx -; ALL-NEXT: subq $80, %rsp +; ALL-NEXT: subq $64, %rsp ; ALL-NEXT: movq %rdi, %rbx ; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) ; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, (%rsp) ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) ; ALL-NEXT: vmovaps (%rsp), %xmm0 ; ALL-NEXT: vmovaps %xmm0, (%rbx) -; ALL-NEXT: addq $80, %rsp +; ALL-NEXT: addq $64, %rsp ; ALL-NEXT: popq %rbx ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> @@ -1180,30 +1180,30 @@ ; ALL-LABEL: store_cvt_4f64_to_8i16_zero: ; ALL: # %bb.0: ; ALL-NEXT: pushq %rbx -; ALL-NEXT: subq $80, %rsp +; ALL-NEXT: subq $64, %rsp ; ALL-NEXT: movq %rdi, %rbx ; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) ; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: 
callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, (%rsp) ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 +; ALL-NEXT: callq __truncdfhf2@PLT ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: vmovaps %xmm0, (%rbx) -; ALL-NEXT: addq $80, %rsp +; ALL-NEXT: addq $64, %rsp ; ALL-NEXT: popq %rbx ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> @@ -1222,47 +1222,47 @@ ; AVX1-NEXT: pushq %r13 ; AVX1-NEXT: pushq %r12 ; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: subq $136, %rsp +; AVX1-NEXT: subq $120, %rsp ; AVX1-NEXT: movq %rdi, %rbx ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = mem[1,0] -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %r12d ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %r13d ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %ebp ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %r14d ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movl %eax, %r15d ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: callq __truncdfhf2@PLT ; AVX1-NEXT: movw %ax, 12(%rbx) ; AVX1-NEXT: movw %r15w, 8(%rbx) ; AVX1-NEXT: movw %r14w, 4(%rbx) @@ -1273,7 +1273,7 @@ ; AVX1-NEXT: movw %ax, 6(%rbx) ; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload ; AVX1-NEXT: movw %ax, 2(%rbx) -; AVX1-NEXT: addq $136, %rsp +; AVX1-NEXT: addq $120, %rsp ; AVX1-NEXT: popq %rbx ; AVX1-NEXT: popq %r12 ; AVX1-NEXT: popq %r13 @@ -1290,47 +1290,47 @@ ; AVX2-NEXT: pushq %r13 ; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $136, %rsp +; AVX2-NEXT: subq $120, %rsp ; 
AVX2-NEXT: movq %rdi, %rbx ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = mem[1,0] -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %r12d ; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %r13d ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %ebp ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %r14d ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movl %eax, %r15d ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: callq __truncdfhf2@PLT ; AVX2-NEXT: movw %ax, 12(%rbx) ; AVX2-NEXT: movw %r15w, 8(%rbx) ; AVX2-NEXT: movw %r14w, 4(%rbx) @@ -1341,7 +1341,7 @@ ; AVX2-NEXT: movw %ax, 6(%rbx) ; AVX2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload ; AVX2-NEXT: movw %ax, 2(%rbx) -; AVX2-NEXT: addq $136, %rsp +; AVX2-NEXT: addq $120, %rsp ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: popq %r12 ; AVX2-NEXT: popq %r13 @@ -1358,49 +1358,49 @@ ; AVX512-NEXT: pushq %r13 ; AVX512-NEXT: pushq %r12 ; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $200, %rsp +; AVX512-NEXT: subq $152, %rsp ; AVX512-NEXT: movq %rdi, %rbx ; AVX512-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; AVX512-NEXT: vmovupd 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %r12d ; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %r13d ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %ebp ; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %r14d ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movl %eax, %r15d ; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: movw %ax, 12(%rbx) ; AVX512-NEXT: movw %r15w, 8(%rbx) ; AVX512-NEXT: movw %r14w, 4(%rbx) @@ -1411,7 +1411,7 @@ ; AVX512-NEXT: movw %ax, 6(%rbx) ; AVX512-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload ; AVX512-NEXT: movw %ax, 2(%rbx) -; AVX512-NEXT: addq $200, %rsp +; AVX512-NEXT: addq $152, %rsp ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r12 ; AVX512-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -515,7 +515,7 @@ define void @vf32(<128 x i16>* %in.vec, <32 x i16>* %out.vec0, <32 x i16>* %out.vec1, <32 x i16>* %out.vec2, <32 x i16>* %out.vec3) nounwind { ; AVX2-SLOW-LABEL: vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $280, %rsp # imm = 0x118 +; AVX2-SLOW-NEXT: subq $248, %rsp ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm9 @@ -529,7 +529,7 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm0[1,2,3],xmm4[4],xmm0[5,6,7] ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm0[1,2,3],xmm9[4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm0[1,2,3],xmm5[4],xmm0[5,6,7] ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 @@ -570,7 +570,7 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0],xmm0[1,2,3],xmm10[4],xmm0[5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0],xmm0[1,2,3],xmm11[4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm3, %xmm7 ; 
AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm0[1,2,3],xmm3[4],xmm0[5,6,7] @@ -665,7 +665,7 @@ ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm6 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] @@ -675,7 +675,7 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm9 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7] @@ -753,17 +753,17 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-SLOW-NEXT: addq $280, %rsp # imm = 0x118 +; AVX2-SLOW-NEXT: addq $248, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-ALL-LABEL: vf32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: subq $216, %rsp +; AVX2-FAST-ALL-NEXT: subq $200, %rsp ; AVX2-FAST-ALL-NEXT: vmovdqa 64(%rdi), %ymm5 ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-ALL-NEXT: vmovdqa 96(%rdi), %ymm6 -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill ; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FAST-ALL-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-ALL-NEXT: vmovdqa 16(%rdi), %xmm11 @@ -793,7 +793,7 @@ ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-ALL-NEXT: vmovdqa 176(%rdi), %xmm13 ; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0],xmm8[1,2,3],xmm13[4],xmm8[5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-ALL-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-FAST-ALL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm8[1,2,3],xmm0[4],xmm8[5,6,7] @@ -850,7 +850,7 @@ ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,2,3,1,3,5,7] -; AVX2-FAST-ALL-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-ALL-NEXT: vpermd (%rsp), %ymm1, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-ALL-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29] ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> @@ -869,13 +869,13 @@ ; AVX2-FAST-ALL-NEXT: vpunpckldq {{.*#+}} xmm0 = 
xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-ALL-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-ALL-NEXT: vpermd %ymm10, %ymm1, %ymm2 ; AVX2-FAST-ALL-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29] ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-ALL-NEXT: vpshufd $231, (%rsp), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-ALL-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-ALL-NEXT: # xmm3 = mem[3,1,2,3] ; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] ; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,1,2,0,4,5,6,7] @@ -924,17 +924,17 @@ ; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rdx) ; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FAST-ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-ALL-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-ALL-NEXT: addq $216, %rsp +; AVX2-FAST-ALL-NEXT: addq $200, %rsp ; AVX2-FAST-ALL-NEXT: vzeroupper ; AVX2-FAST-ALL-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $280, %rsp # imm = 0x118 +; AVX2-FAST-PERLANE-NEXT: subq $248, %rsp ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1 @@ -991,7 +991,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0],xmm0[1,2,3],xmm13[4],xmm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm3, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm0[1,2,3],xmm3[4],xmm0[5,6,7] @@ -1061,7 +1061,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] @@ -1084,7 +1084,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[3,1,2,3] -; 
AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,2,0,4,5,6,7] @@ -1119,7 +1119,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, (%rsp), %xmm7 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -1163,7 +1163,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-PERLANE-NEXT: addq $280, %rsp # imm = 0x118 +; AVX2-FAST-PERLANE-NEXT: addq $248, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq %wide.vec = load <128 x i16>, <128 x i16>* %in.vec, align 32 diff --git a/llvm/test/CodeGen/X86/vzero-excess.ll b/llvm/test/CodeGen/X86/vzero-excess.ll --- a/llvm/test/CodeGen/X86/vzero-excess.ll +++ b/llvm/test/CodeGen/X86/vzero-excess.ll @@ -8,16 +8,16 @@ ; CHECK-LABEL: zeroupper_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $48, %rsp +; CHECK-NEXT: subq $32, %rsp ; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq the_unknown +; CHECK-NEXT: callq the_unknown@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; CHECK-NEXT: vaddps (%rbx), %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: addq $48, %rsp +; CHECK-NEXT: addq $32, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -34,12 +34,12 @@ define <8 x float> @zeroupper_v8f32(<8 x float> %x) nounwind { ; CHECK-LABEL: zeroupper_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq the_unknown +; CHECK-NEXT: callq the_unknown@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: addq $40, %rsp ; CHECK-NEXT: retq call void @llvm.x86.avx.vzeroupper() call void @the_unknown() @@ -50,16 +50,16 @@ ; CHECK-LABEL: zeroall_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $48, %rsp +; CHECK-NEXT: subq $32, %rsp ; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: vzeroall -; CHECK-NEXT: callq the_unknown +; CHECK-NEXT: callq the_unknown@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; CHECK-NEXT: vaddps (%rbx), %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: addq $48, %rsp +; CHECK-NEXT: addq $32, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -76,12 +76,12 @@ define <8 x float> @zeroall_v8f32(<8 x float> %x) nounwind { ; CHECK-LABEL: zeroall_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $56, %rsp +; 
CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; CHECK-NEXT: vzeroall -; CHECK-NEXT: callq the_unknown +; CHECK-NEXT: callq the_unknown@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: addq $40, %rsp ; CHECK-NEXT: retq call void @llvm.x86.avx.vzeroall() call void @the_unknown()