Index: llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
===================================================================
--- llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -99,6 +99,12 @@
              "functions in the high bits of the stack pointer."),
     cl::init(true), cl::Hidden);
 
+static cl::opt<bool> AltIPHardening(
+    PASS_KEY "-ip-alt",
+    cl::desc("Alternate code generation strategy for passing the "
+             "predicate state in and out of functions."),
+    cl::init(true), cl::Hidden);
+
 static cl::opt<bool> HardenLoads(
     PASS_KEY "-loads",
     cl::desc("Sanitize loads from memory. When disable, no "
@@ -168,6 +174,8 @@
   SmallVector<MachineInstr *, 16>
   tracePredStateThroughCFG(MachineFunction &MF, ArrayRef<BlockCondInfo> Infos);
 
+  unsigned getZeroWordReg(const TargetRegisterClass *RC, MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator InsertPt, DebugLoc Loc);
   void hardenAllLoads(MachineFunction &MF);
 
   unsigned saveEFLAGS(MachineBasicBlock &MBB,
@@ -347,6 +355,24 @@
   }
 }
 
+unsigned X86SpeculativeLoadHardeningPass::getZeroWordReg(
+    const TargetRegisterClass *RC, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator InsertPt, DebugLoc Loc) {
+  unsigned Reg = MRI->createVirtualRegister(RC);
+  unsigned SubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+  auto ZeroI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::MOV32r0), SubReg);
+  ++NumInstsInserted;
+  MachineOperand *ZeroEFLAGSDefOp = ZeroI->findRegisterDefOperand(X86::EFLAGS);
+  assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
+         "Must have an implicit def of EFLAGS!");
+  ZeroEFLAGSDefOp->setIsDead(true);
+  BuildMI(MBB, InsertPt, Loc, TII->get(X86::SUBREG_TO_REG), Reg)
+      .addImm(0)
+      .addReg(SubReg)
+      .addImm(X86::sub_32bit);
+  return Reg;
+}
+
 /// Helper to scan a function for loads vulnerable to misspeculation that we
 /// want to harden.
 ///
@@ -454,21 +480,7 @@
   } else {
     // Otherwise, just build the predicate state itself by zeroing a register
     // as we don't need any initial state.
-    PS->InitialReg = MRI->createVirtualRegister(PS->RC);
-    unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
-    auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
-                         PredStateSubReg);
-    ++NumInstsInserted;
-    MachineOperand *ZeroEFLAGSDefOp =
-        ZeroI->findRegisterDefOperand(X86::EFLAGS);
-    assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
-           "Must have an implicit def of EFLAGS!");
-    ZeroEFLAGSDefOp->setIsDead(true);
-    BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
-            PS->InitialReg)
-        .addImm(0)
-        .addReg(PredStateSubReg)
-        .addImm(X86::sub_32bit);
+    PS->InitialReg = getZeroWordReg(PS->RC, Entry, EntryInsertPt, Loc);
   }
 
   // We're going to need to trace predicate state throughout the function's
@@ -1511,24 +1523,37 @@
 /// Takes the current predicate state (in a register) and merges it into the
 /// stack pointer. The state is essentially a single bit, but we merge this in
 /// a way that won't form non-canonical pointers and also will be preserved
-/// across normal stack adjustments.
+/// across normal stack adjustments. The technique is to leave just enough high
+/// bits set to form a canonical pointer, but otherwise leave the stack pointer
+/// alone. The particular goal is to allow stack adjustments from a `ret` (or
+/// some long series of `ret`s and `pop`s) to avoid wrapping the address around
+/// and making the high bits zero again. If that happens, we will be unable to
+/// reliably extract the predicate state from the stack pointer.
 void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
     unsigned PredStateReg) {
-  unsigned TmpReg = MRI->createVirtualRegister(PS->RC);
-  // FIXME: This hard codes a shift distance based on the number of bits needed
-  // to stay canonical on 64-bit. We should compute this somehow and support
-  // 32-bit as part of that.
-  auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg)
-                    .addReg(PredStateReg, RegState::Kill)
-                    .addImm(47);
-  ShiftI->addRegisterDead(X86::EFLAGS, TRI);
-  ++NumInstsInserted;
-  auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP)
-                 .addReg(X86::RSP)
-                 .addReg(TmpReg, RegState::Kill);
-  OrI->addRegisterDead(X86::EFLAGS, TRI);
-  ++NumInstsInserted;
+  if (AltIPHardening) {
+    auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP)
+                   .addReg(X86::RSP)
+                   .addReg(PredStateReg, RegState::Kill);
+    OrI->addRegisterDead(X86::EFLAGS, TRI);
+    ++NumInstsInserted;
+  } else {
+    unsigned TmpReg = MRI->createVirtualRegister(PS->RC);
+    // FIXME: This hard codes a shift distance based on the number of bits
+    // needed to stay canonical on 64-bit. We should compute this somehow and
+    // support 32-bit as part of that.
+    auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg)
+                      .addReg(PredStateReg, RegState::Kill)
+                      .addImm(47);
+    ShiftI->addRegisterDead(X86::EFLAGS, TRI);
+    ++NumInstsInserted;
+    auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP)
+                   .addReg(X86::RSP)
+                   .addReg(TmpReg, RegState::Kill);
+    OrI->addRegisterDead(X86::EFLAGS, TRI);
+    ++NumInstsInserted;
+  }
 }
 
 /// Extracts the predicate state stored in the high bits of the stack pointer.
@@ -1536,19 +1561,34 @@
     MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
     DebugLoc Loc) {
   unsigned PredStateReg = MRI->createVirtualRegister(PS->RC);
-  unsigned TmpReg = MRI->createVirtualRegister(PS->RC);
-
-  // We know that the stack pointer will have any preserved predicate state in
-  // its high bit. We just want to smear this across the other bits. Turns out,
-  // this is exactly what an arithmetic right shift does.
-  BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg)
-      .addReg(X86::RSP);
-  auto ShiftI =
-      BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg)
-          .addReg(TmpReg, RegState::Kill)
-          .addImm(TRI->getRegSizeInBits(*PS->RC) - 1);
-  ShiftI->addRegisterDead(X86::EFLAGS, TRI);
-  ++NumInstsInserted;
+
+  if (AltIPHardening) {
+    unsigned ZeroReg = getZeroWordReg(PS->RC, MBB, InsertPt, Loc);
+    BuildMI(MBB, InsertPt, Loc, TII->get(X86::TEST64ri32))
+        .addReg(X86::RSP).addImm(-4096);
+    int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
+    auto CMovI = BuildMI(MBB, InsertPt, Loc,
+                         TII->get(X86::getCMovFromCond(X86::COND_E,
+                                                       PredStateSizeInBytes)),
+                         PredStateReg)
+                     .addReg(ZeroReg)
+                     .addReg(PS->PoisonReg);
+    ++NumInstsInserted;
+    (void)CMovI;
+  } else {
+    // We know that the stack pointer will have any preserved predicate state in
+    // its high bit. We just want to smear this across the other bits. Turns
+    // out, this is exactly what an arithmetic right shift does.
+ unsigned TmpReg = MRI->createVirtualRegister(PS->RC); + BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg) + .addReg(X86::RSP); + auto ShiftI = + BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg) + .addReg(TmpReg, RegState::Kill) + .addImm(TRI->getRegSizeInBits(*PS->RC) - 1); + ShiftI->addRegisterDead(X86::EFLAGS, TRI); + ++NumInstsInserted; + } return PredStateReg; } Index: llvm/test/CodeGen/X86/speculative-load-hardening-gather.ll =================================================================== --- llvm/test/CodeGen/X86/speculative-load-hardening-gather.ll +++ llvm/test/CodeGen/X86/speculative-load-hardening-gather.ll @@ -6,18 +6,18 @@ define <4 x float> @test_llvm_x86_avx2_gather_d_ps(i8* %b, <4 x i32> %iv, <4 x float> %mask) #0 { ; CHECK-LABEL: test_llvm_x86_avx2_gather_d_ps: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vmovq %rax, %xmm3 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vmovq %rcx, %xmm3 ; CHECK-NEXT: vpbroadcastq %xmm3, %xmm3 ; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: vgatherdps %xmm1, (%rdi,%xmm0), %xmm2 -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovaps %xmm2, %xmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> zeroinitializer, i8* %b, <4 x i32> %iv, <4 x float> %mask, i8 1) @@ -29,18 +29,18 @@ define <4 x float> @test_llvm_x86_avx2_gather_q_ps(i8* %b, <2 x i64> %iv, <4 x float> %mask) #0 { ; CHECK-LABEL: test_llvm_x86_avx2_gather_q_ps: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vmovq %rax, %xmm3 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vmovq %rcx, %xmm3 ; CHECK-NEXT: vpbroadcastq %xmm3, %xmm3 ; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: vgatherqps %xmm1, (%rdi,%xmm0), %xmm2 -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovaps %xmm2, %xmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> zeroinitializer, i8* %b, <2 x i64> %iv, <4 x float> %mask, i8 1) @@ -52,18 +52,18 @@ define <2 x double> @test_llvm_x86_avx2_gather_d_pd(i8* %b, <4 x i32> %iv, <2 x double> %mask) #0 { ; CHECK-LABEL: test_llvm_x86_avx2_gather_d_pd: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vmovq %rax, %xmm3 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vmovq %rcx, %xmm3 ; CHECK-NEXT: vpbroadcastq %xmm3, %xmm3 ; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: vgatherdpd %xmm1, (%rdi,%xmm0), %xmm2 -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovapd %xmm2, %xmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <2 
x double> @llvm.x86.avx2.gather.d.pd(<2 x double> zeroinitializer, i8* %b, <4 x i32> %iv, <2 x double> %mask, i8 1) @@ -75,18 +75,18 @@ define <2 x double> @test_llvm_x86_avx2_gather_q_pd(i8* %b, <2 x i64> %iv, <2 x double> %mask) #0 { ; CHECK-LABEL: test_llvm_x86_avx2_gather_q_pd: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vmovq %rax, %xmm3 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vmovq %rcx, %xmm3 ; CHECK-NEXT: vpbroadcastq %xmm3, %xmm3 ; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: vgatherqpd %xmm1, (%rdi,%xmm0), %xmm2 -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovapd %xmm2, %xmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> zeroinitializer, i8* %b, <2 x i64> %iv, <2 x double> %mask, i8 1) @@ -98,18 +98,18 @@ define <8 x float> @test_llvm_x86_avx2_gather_d_ps_256(i8* %b, <8 x i32> %iv, <8 x float> %mask) #0 { ; CHECK-LABEL: test_llvm_x86_avx2_gather_d_ps_256: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vmovq %rax, %xmm3 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vmovq %rcx, %xmm3 ; CHECK-NEXT: vpbroadcastq %xmm3, %ymm3 ; CHECK-NEXT: vpor %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vgatherdps %ymm1, (%rdi,%ymm0), %ymm2 -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovaps %ymm2, %ymm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> zeroinitializer, i8* %b, <8 x i32> %iv, <8 x float> %mask, i8 1) @@ -121,18 +121,18 @@ define <4 x float> @test_llvm_x86_avx2_gather_q_ps_256(i8* %b, <4 x i64> %iv, <4 x float> %mask) #0 { ; CHECK-LABEL: test_llvm_x86_avx2_gather_q_ps_256: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vmovq %rax, %xmm3 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vmovq %rcx, %xmm3 ; CHECK-NEXT: vpbroadcastq %xmm3, %ymm3 ; CHECK-NEXT: vpor %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vgatherqps %xmm1, (%rdi,%ymm0), %xmm2 -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovaps %xmm2, %xmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: @@ -145,18 +145,18 @@ define <4 x double> @test_llvm_x86_avx2_gather_d_pd_256(i8* %b, <4 x i32> %iv, <4 x double> %mask) #0 { ; CHECK-LABEL: test_llvm_x86_avx2_gather_d_pd_256: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: orq %rax, %rdi 
-; CHECK-NEXT: vmovq %rax, %xmm3 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vmovq %rcx, %xmm3 ; CHECK-NEXT: vpbroadcastq %xmm3, %xmm3 ; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: vgatherdpd %ymm1, (%rdi,%xmm0), %ymm2 -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovapd %ymm2, %ymm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> zeroinitializer, i8* %b, <4 x i32> %iv, <4 x double> %mask, i8 1) @@ -168,18 +168,18 @@ define <4 x double> @test_llvm_x86_avx2_gather_q_pd_256(i8* %b, <4 x i64> %iv, <4 x double> %mask) #0 { ; CHECK-LABEL: test_llvm_x86_avx2_gather_q_pd_256: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vmovq %rax, %xmm3 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vmovq %rcx, %xmm3 ; CHECK-NEXT: vpbroadcastq %xmm3, %ymm3 ; CHECK-NEXT: vpor %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vgatherqpd %ymm1, (%rdi,%ymm0), %ymm2 -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovapd %ymm2, %ymm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> zeroinitializer, i8* %b, <4 x i64> %iv, <4 x double> %mask, i8 1) @@ -191,18 +191,18 @@ define <4 x i32> @test_llvm_x86_avx2_gather_d_d(i8* %b, <4 x i32> %iv, <4 x i32> %mask) #0 { ; CHECK-LABEL: test_llvm_x86_avx2_gather_d_d: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vmovq %rax, %xmm3 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vmovq %rcx, %xmm3 ; CHECK-NEXT: vpbroadcastq %xmm3, %xmm3 ; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: vpgatherdd %xmm1, (%rdi,%xmm0), %xmm2 -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovdqa %xmm2, %xmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> zeroinitializer, i8* %b, <4 x i32> %iv, <4 x i32> %mask, i8 1) @@ -214,18 +214,18 @@ define <4 x i32> @test_llvm_x86_avx2_gather_q_d(i8* %b, <2 x i64> %iv, <4 x i32> %mask) #0 { ; CHECK-LABEL: test_llvm_x86_avx2_gather_q_d: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vmovq %rax, %xmm3 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vmovq %rcx, %xmm3 ; CHECK-NEXT: vpbroadcastq %xmm3, %xmm3 ; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: vpgatherqd %xmm1, (%rdi,%xmm0), %xmm2 -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovdqa %xmm2, %xmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> zeroinitializer, i8* %b, <2 x i64> %iv, <4 x i32> %mask, i8 1) @@ -237,18 +237,18 @@ define <2 x i64> 
@test_llvm_x86_avx2_gather_d_q(i8* %b, <4 x i32> %iv, <2 x i64> %mask) #0 { ; CHECK-LABEL: test_llvm_x86_avx2_gather_d_q: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vmovq %rax, %xmm3 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vmovq %rcx, %xmm3 ; CHECK-NEXT: vpbroadcastq %xmm3, %xmm3 ; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: vpgatherdq %xmm1, (%rdi,%xmm0), %xmm2 -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovdqa %xmm2, %xmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> zeroinitializer, i8* %b, <4 x i32> %iv, <2 x i64> %mask, i8 1) @@ -260,18 +260,18 @@ define <2 x i64> @test_llvm_x86_avx2_gather_q_q(i8* %b, <2 x i64> %iv, <2 x i64> %mask) #0 { ; CHECK-LABEL: test_llvm_x86_avx2_gather_q_q: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vmovq %rax, %xmm3 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vmovq %rcx, %xmm3 ; CHECK-NEXT: vpbroadcastq %xmm3, %xmm3 ; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: vpgatherqq %xmm1, (%rdi,%xmm0), %xmm2 -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovdqa %xmm2, %xmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> zeroinitializer, i8* %b, <2 x i64> %iv, <2 x i64> %mask, i8 1) @@ -283,18 +283,18 @@ define <8 x i32> @test_llvm_x86_avx2_gather_d_d_256(i8* %b, <8 x i32> %iv, <8 x i32> %mask) #0 { ; CHECK-LABEL: test_llvm_x86_avx2_gather_d_d_256: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vmovq %rax, %xmm3 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vmovq %rcx, %xmm3 ; CHECK-NEXT: vpbroadcastq %xmm3, %ymm3 ; CHECK-NEXT: vpor %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vpgatherdd %ymm1, (%rdi,%ymm0), %ymm2 -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> zeroinitializer, i8* %b, <8 x i32> %iv, <8 x i32> %mask, i8 1) @@ -306,18 +306,18 @@ define <4 x i32> @test_llvm_x86_avx2_gather_q_d_256(i8* %b, <4 x i64> %iv, <4 x i32> %mask) #0 { ; CHECK-LABEL: test_llvm_x86_avx2_gather_q_d_256: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vmovq %rax, %xmm3 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vmovq %rcx, %xmm3 ; CHECK-NEXT: 
vpbroadcastq %xmm3, %ymm3 ; CHECK-NEXT: vpor %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vpgatherqd %xmm1, (%rdi,%ymm0), %xmm2 -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovdqa %xmm2, %xmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: @@ -330,18 +330,18 @@ define <4 x i64> @test_llvm_x86_avx2_gather_d_q_256(i8* %b, <4 x i32> %iv, <4 x i64> %mask) #0 { ; CHECK-LABEL: test_llvm_x86_avx2_gather_d_q_256: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vmovq %rax, %xmm3 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vmovq %rcx, %xmm3 ; CHECK-NEXT: vpbroadcastq %xmm3, %xmm3 ; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: vpgatherdq %ymm1, (%rdi,%xmm0), %ymm2 -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> zeroinitializer, i8* %b, <4 x i32> %iv, <4 x i64> %mask, i8 1) @@ -353,18 +353,18 @@ define <4 x i64> @test_llvm_x86_avx2_gather_q_q_256(i8* %b, <4 x i64> %iv, <4 x i64> %mask) #0 { ; CHECK-LABEL: test_llvm_x86_avx2_gather_q_q_256: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vmovq %rax, %xmm3 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vmovq %rcx, %xmm3 ; CHECK-NEXT: vpbroadcastq %xmm3, %ymm3 ; CHECK-NEXT: vpor %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: vpgatherqq %ymm1, (%rdi,%ymm0), %ymm2 -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovdqa %ymm2, %ymm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> zeroinitializer, i8* %b, <4 x i64> %iv, <4 x i64> %mask, i8 1) @@ -376,18 +376,18 @@ define <16 x float> @test_llvm_x86_avx512_gather_dps_512(i8* %b, <16 x i32> %iv) #1 { ; CHECK-LABEL: test_llvm_x86_avx512_gather_dps_512: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %zmm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %zmm2 ; CHECK-NEXT: vporq %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0), %zmm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovaps %zmm1, %zmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <16 x float> @llvm.x86.avx512.gather.dps.512(<16 x float> zeroinitializer, i8* %b, <16 x i32> %iv, i16 -1, i32 1) @@ -399,19 +399,19 @@ define <8 x double> @test_llvm_x86_avx512_gather_dpd_512(i8* %b, <8 x i32> %iv) #1 { ; CHECK-LABEL: test_llvm_x86_avx512_gather_dpd_512: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; 
CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vmovq %rax, %xmm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vmovq %rcx, %xmm2 ; CHECK-NEXT: vpbroadcastq %xmm2, %ymm2 ; CHECK-NEXT: vpor %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: vgatherdpd (%rdi,%ymm0), %zmm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <8 x double> @llvm.x86.avx512.gather.dpd.512(<8 x double> zeroinitializer, i8* %b, <8 x i32> %iv, i8 -1, i32 1) @@ -423,18 +423,18 @@ define <8 x float> @test_llvm_x86_avx512_gather_qps_512(i8* %b, <8 x i64> %iv) #1 { ; CHECK-LABEL: test_llvm_x86_avx512_gather_qps_512: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %zmm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %zmm2 ; CHECK-NEXT: vporq %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovaps %ymm1, %ymm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <8 x float> @llvm.x86.avx512.gather.qps.512(<8 x float> zeroinitializer, i8* %b, <8 x i64> %iv, i8 -1, i32 1) @@ -446,18 +446,18 @@ define <8 x double> @test_llvm_x86_avx512_gather_qpd_512(i8* %b, <8 x i64> %iv) #1 { ; CHECK-LABEL: test_llvm_x86_avx512_gather_qpd_512: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %zmm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %zmm2 ; CHECK-NEXT: vporq %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vgatherqpd (%rdi,%zmm0), %zmm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <8 x double> @llvm.x86.avx512.gather.qpd.512(<8 x double> zeroinitializer, i8* %b, <8 x i64> %iv, i8 -1, i32 1) @@ -469,18 +469,18 @@ define <16 x i32> @test_llvm_x86_avx512_gather_dpi_512(i8* %b, <16 x i32> %iv) #1 { ; CHECK-LABEL: test_llvm_x86_avx512_gather_dpi_512: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %zmm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %zmm2 ; CHECK-NEXT: vporq %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vpgatherdd (%rdi,%zmm0), %zmm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovdqa64 
%zmm1, %zmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <16 x i32> @llvm.x86.avx512.gather.dpi.512(<16 x i32> zeroinitializer, i8* %b, <16 x i32> %iv, i16 -1, i32 1) @@ -492,19 +492,19 @@ define <8 x i64> @test_llvm_x86_avx512_gather_dpq_512(i8* %b, <8 x i32> %iv) #1 { ; CHECK-LABEL: test_llvm_x86_avx512_gather_dpq_512: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vmovq %rax, %xmm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vmovq %rcx, %xmm2 ; CHECK-NEXT: vpbroadcastq %xmm2, %ymm2 ; CHECK-NEXT: vpor %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: vpgatherdq (%rdi,%ymm0), %zmm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <8 x i64> @llvm.x86.avx512.gather.dpq.512(<8 x i64> zeroinitializer, i8* %b, <8 x i32> %iv, i8 -1, i32 1) @@ -517,18 +517,18 @@ define <8 x i32> @test_llvm_x86_avx512_gather_qpi_512(i8* %b, <8 x i64> %iv) #1 { ; CHECK-LABEL: test_llvm_x86_avx512_gather_qpi_512: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %zmm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %zmm2 ; CHECK-NEXT: vporq %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vpgatherqd (%rdi,%zmm0), %ymm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <8 x i32> @llvm.x86.avx512.gather.qpi.512(<8 x i32> zeroinitializer, i8* %b, <8 x i64> %iv, i8 -1, i32 1) @@ -540,18 +540,18 @@ define <8 x i64> @test_llvm_x86_avx512_gather_qpq_512(i8* %b, <8 x i64> %iv) #1 { ; CHECK-LABEL: test_llvm_x86_avx512_gather_qpq_512: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %zmm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %zmm2 ; CHECK-NEXT: vporq %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: vpgatherqq (%rdi,%zmm0), %zmm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <8 x i64> @llvm.x86.avx512.gather.qpq.512(<8 x i64> zeroinitializer, i8* %b, <8 x i64> %iv, i8 -1, i32 1) @@ -563,16 +563,16 @@ define void @test_llvm_x86_avx512_gatherpf_qps_512(<8 x i64> %iv, i8* %b) #1 { ; CHECK-LABEL: test_llvm_x86_avx512_gatherpf_qps_512: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; 
CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %zmm1 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %zmm1 ; CHECK-NEXT: vporq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1} -; CHECK-NEXT: shlq $47, %rax -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: @@ -585,18 +585,18 @@ define <4 x float> @test_llvm_x86_avx512_gather3siv4_sf(i8* %b, <4 x i32> %iv) #2 { ; CHECK-LABEL: test_llvm_x86_avx512_gather3siv4_sf: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %xmm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %xmm2 ; CHECK-NEXT: vpor %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: vgatherdps (%rdi,%xmm0), %xmm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovaps %xmm1, %xmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> zeroinitializer, i8* %b, <4 x i32> %iv, i8 -1, i32 1) @@ -608,18 +608,18 @@ define <4 x float> @test_llvm_x86_avx512_gather3div4_sf(i8* %b, <2 x i64> %iv) #2 { ; CHECK-LABEL: test_llvm_x86_avx512_gather3div4_sf: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %xmm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %xmm2 ; CHECK-NEXT: vpor %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: vgatherqps (%rdi,%xmm0), %xmm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovaps %xmm1, %xmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> zeroinitializer, i8* %b, <2 x i64> %iv, i8 -1, i32 1) @@ -631,18 +631,18 @@ define <2 x double> @test_llvm_x86_avx512_gather3siv2_df(i8* %b, <4 x i32> %iv) #2 { ; CHECK-LABEL: test_llvm_x86_avx512_gather3siv2_df: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %xmm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %xmm2 ; CHECK-NEXT: vpor %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: vgatherdpd (%rdi,%xmm0), %xmm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovapd %xmm1, %xmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> zeroinitializer, i8* %b, <4 x i32> %iv, i8 -1, i32 1) @@ -654,18 +654,18 @@ define <2 x double> 
@test_llvm_x86_avx512_gather3div2_df(i8* %b, <2 x i64> %iv) #2 { ; CHECK-LABEL: test_llvm_x86_avx512_gather3div2_df: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %xmm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %xmm2 ; CHECK-NEXT: vpor %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: vgatherqpd (%rdi,%xmm0), %xmm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovapd %xmm1, %xmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> zeroinitializer, i8* %b, <2 x i64> %iv, i8 -1, i32 1) @@ -677,18 +677,18 @@ define <8 x float> @test_llvm_x86_avx512_gather3siv8_sf(i8* %b, <8 x i32> %iv) #2 { ; CHECK-LABEL: test_llvm_x86_avx512_gather3siv8_sf: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %ymm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %ymm2 ; CHECK-NEXT: vpor %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: vgatherdps (%rdi,%ymm0), %ymm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovaps %ymm1, %ymm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> zeroinitializer, i8* %b, <8 x i32> %iv, i8 -1, i32 1) @@ -700,18 +700,18 @@ define <4 x float> @test_llvm_x86_avx512_gather3div8_sf(i8* %b, <4 x i64> %iv) #2 { ; CHECK-LABEL: test_llvm_x86_avx512_gather3div8_sf: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %ymm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %ymm2 ; CHECK-NEXT: vpor %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: vgatherqps (%rdi,%ymm0), %xmm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovaps %xmm1, %xmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: @@ -724,18 +724,18 @@ define <4 x double> @test_llvm_x86_avx512_gather3siv4_df(i8* %b, <4 x i32> %iv) #2 { ; CHECK-LABEL: test_llvm_x86_avx512_gather3siv4_df: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %xmm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %xmm2 ; CHECK-NEXT: vpor %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: vgatherdpd 
(%rdi,%xmm0), %ymm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovapd %ymm1, %ymm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> zeroinitializer, i8* %b, <4 x i32> %iv, i8 -1, i32 1) @@ -747,18 +747,18 @@ define <4 x double> @test_llvm_x86_avx512_gather3div4_df(i8* %b, <4 x i64> %iv) #2 { ; CHECK-LABEL: test_llvm_x86_avx512_gather3div4_df: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %ymm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %ymm2 ; CHECK-NEXT: vpor %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: vgatherqpd (%rdi,%ymm0), %ymm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovapd %ymm1, %ymm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> zeroinitializer, i8* %b, <4 x i64> %iv, i8 -1, i32 1) @@ -770,18 +770,18 @@ define <4 x i32> @test_llvm_x86_avx512_gather3siv4_si(i8* %b, <4 x i32> %iv) #2 { ; CHECK-LABEL: test_llvm_x86_avx512_gather3siv4_si: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %xmm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %xmm2 ; CHECK-NEXT: vpor %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: vpgatherdd (%rdi,%xmm0), %xmm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> zeroinitializer, i8* %b, <4 x i32> %iv, i8 -1, i32 1) @@ -793,18 +793,18 @@ define <4 x i32> @test_llvm_x86_avx512_gather3div4_si(i8* %b, <2 x i64> %iv) #2 { ; CHECK-LABEL: test_llvm_x86_avx512_gather3div4_si: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %xmm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %xmm2 ; CHECK-NEXT: vpor %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: vpgatherqd (%rdi,%xmm0), %xmm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> zeroinitializer, i8* %b, <2 x i64> %iv, i8 -1, i32 1) @@ -816,18 +816,18 @@ define <2 x i64> @test_llvm_x86_avx512_gather3siv2_di(i8* %b, <4 x i32> %iv) #2 { ; CHECK-LABEL: test_llvm_x86_avx512_gather3siv2_di: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; 
CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %xmm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %xmm2 ; CHECK-NEXT: vpor %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: vpgatherdq (%rdi,%xmm0), %xmm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64> zeroinitializer, i8* %b, <4 x i32> %iv, i8 -1, i32 1) @@ -839,18 +839,18 @@ define <2 x i64> @test_llvm_x86_avx512_gather3div2_di(i8* %b, <2 x i64> %iv) #2 { ; CHECK-LABEL: test_llvm_x86_avx512_gather3div2_di: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %xmm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %xmm2 ; CHECK-NEXT: vpor %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: vpgatherqq (%rdi,%xmm0), %xmm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> zeroinitializer, i8* %b, <2 x i64> %iv, i8 -1, i32 1) @@ -862,18 +862,18 @@ define <8 x i32> @test_llvm_x86_avx512_gather3siv8_si(i8* %b, <8 x i32> %iv) #2 { ; CHECK-LABEL: test_llvm_x86_avx512_gather3siv8_si: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %ymm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %ymm2 ; CHECK-NEXT: vpor %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: vpgatherdd (%rdi,%ymm0), %ymm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> zeroinitializer, i8* %b, <8 x i32> %iv, i8 -1, i32 1) @@ -885,18 +885,18 @@ define <4 x i32> @test_llvm_x86_avx512_gather3div8_si(i8* %b, <4 x i64> %iv) #2 { ; CHECK-LABEL: test_llvm_x86_avx512_gather3div8_si: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %ymm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %ymm2 ; CHECK-NEXT: vpor %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: vpgatherqd (%rdi,%ymm0), %xmm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: 
vzeroupper ; CHECK-NEXT: retq entry: @@ -909,18 +909,18 @@ define <4 x i64> @test_llvm_x86_avx512_gather3siv4_di(i8* %b, <4 x i32> %iv) #2 { ; CHECK-LABEL: test_llvm_x86_avx512_gather3siv4_di: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %xmm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %xmm2 ; CHECK-NEXT: vpor %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: vpgatherdq (%rdi,%xmm0), %ymm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> zeroinitializer, i8* %b, <4 x i32> %iv, i8 -1, i32 1) @@ -932,18 +932,18 @@ define <4 x i64> @test_llvm_x86_avx512_gather3div4_di(i8* %b, <4 x i64> %iv) #2 { ; CHECK-LABEL: test_llvm_x86_avx512_gather3div4_di: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: orq %rax, %rdi -; CHECK-NEXT: vpbroadcastq %rax, %ymm2 +; CHECK-NEXT: orq %rcx, %rdi +; CHECK-NEXT: vpbroadcastq %rcx, %ymm2 ; CHECK-NEXT: vpor %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: vpgatherqq (%rdi,%ymm0), %ymm1 {%k1} -; CHECK-NEXT: shlq $47, %rax ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 -; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: orq %rcx, %rsp ; CHECK-NEXT: retq entry: %v = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> zeroinitializer, i8* %b, <4 x i64> %iv, i8 -1, i32 1) Index: llvm/test/CodeGen/X86/speculative-load-hardening.ll =================================================================== --- llvm/test/CodeGen/X86/speculative-load-hardening.ll +++ llvm/test/CodeGen/X86/speculative-load-hardening.ll @@ -11,12 +11,12 @@ define i32 @test_trivial_entry_load(i32* %ptr) nounwind { ; X64-LABEL: test_trivial_entry_load: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rsp, %rcx ; X64-NEXT: movq $-1, %rax -; X64-NEXT: sarq $63, %rcx +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: testq $-4096, %rsp # imm = 0xF000 +; X64-NEXT: cmoveq %rax, %rcx ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: orl %ecx, %eax -; X64-NEXT: shlq $47, %rcx ; X64-NEXT: orq %rcx, %rsp ; X64-NEXT: retq ; @@ -35,9 +35,10 @@ ; X64-NEXT: pushq %r15 ; X64-NEXT: pushq %r14 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %rsp, %rax ; X64-NEXT: movq $-1, %rbx -; X64-NEXT: sarq $63, %rax +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: testq $-4096, %rsp # imm = 0xF000 +; X64-NEXT: cmoveq %rbx, %rax ; X64-NEXT: testl %edi, %edi ; X64-NEXT: jne .LBB1_1 ; X64-NEXT: # %bb.2: # %then1 @@ -47,14 +48,13 @@ ; X64-NEXT: .LBB1_1: ; X64-NEXT: cmoveq %rbx, %rax ; X64-NEXT: .LBB1_8: # %exit -; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r14 ; X64-NEXT: popq %r15 ; X64-NEXT: retq ; X64-NEXT: .LBB1_4: # %then2 -; X64-NEXT: movq %r8, %r15 +; X64-NEXT: movq %r8, %r14 ; X64-NEXT: cmovneq %rbx, %rax ; X64-NEXT: testl %edx, %edx ; X64-NEXT: je .LBB1_6 @@ -62,30 +62,30 @@ ; X64-NEXT: cmoveq %rbx, %rax ; X64-NEXT: movslq 
(%r9), %rcx ; X64-NEXT: orq %rax, %rcx -; X64-NEXT: leaq (%r15,%rcx,4), %r14 -; X64-NEXT: movl %ecx, (%r15,%rcx,4) +; X64-NEXT: leaq (%r14,%rcx,4), %r15 +; X64-NEXT: movl %ecx, (%r14,%rcx,4) ; X64-NEXT: jmp .LBB1_7 ; X64-NEXT: .LBB1_6: # %then3 ; X64-NEXT: cmovneq %rbx, %rax ; X64-NEXT: movl (%rcx), %ecx -; X64-NEXT: addl (%r15), %ecx +; X64-NEXT: addl (%r14), %ecx ; X64-NEXT: movslq %ecx, %rdi ; X64-NEXT: orq %rax, %rdi -; X64-NEXT: movl (%r15,%rdi,4), %esi +; X64-NEXT: movl (%r14,%rdi,4), %esi ; X64-NEXT: orl %eax, %esi -; X64-NEXT: movq (%r9), %r14 -; X64-NEXT: orq %rax, %r14 -; X64-NEXT: addl (%r14), %esi -; X64-NEXT: shlq $47, %rax +; X64-NEXT: movq (%r9), %r15 +; X64-NEXT: orq %rax, %r15 +; X64-NEXT: addl (%r15), %esi ; X64-NEXT: # kill: def $edi killed $edi killed $rdi ; X64-NEXT: orq %rax, %rsp ; X64-NEXT: callq leak -; X64-NEXT: movq %rsp, %rax -; X64-NEXT: sarq $63, %rax +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: testq $-4096, %rsp # imm = 0xF000 +; X64-NEXT: cmoveq %rbx, %rax ; X64-NEXT: .LBB1_7: # %merge -; X64-NEXT: movslq (%r14), %rcx +; X64-NEXT: movslq (%r15), %rcx ; X64-NEXT: orq %rax, %rcx -; X64-NEXT: movl $0, (%r15,%rcx,4) +; X64-NEXT: movl $0, (%r14,%rcx,4) ; X64-NEXT: jmp .LBB1_8 ; ; X64-LFENCE-LABEL: test_basic_conditions: @@ -181,9 +181,10 @@ ; X64-NEXT: pushq %r14 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %rsp, %rax ; X64-NEXT: movq $-1, %r15 -; X64-NEXT: sarq $63, %rax +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: testq $-4096, %rsp # imm = 0xF000 +; X64-NEXT: cmoveq %r15, %rax ; X64-NEXT: testl %edi, %edi ; X64-NEXT: je .LBB2_2 ; X64-NEXT: # %bb.1: @@ -206,18 +207,17 @@ ; X64-NEXT: movq %rax, %rdx ; X64-NEXT: orq %r14, %rdx ; X64-NEXT: movl (%rdx,%rcx,4), %edi -; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp ; X64-NEXT: callq sink -; X64-NEXT: movq %rsp, %rax -; X64-NEXT: sarq $63, %rax +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: testq $-4096, %rsp # imm = 0xF000 +; X64-NEXT: cmoveq %r15, %rax ; X64-NEXT: incl %ebx ; X64-NEXT: cmpl %ebp, %ebx ; X64-NEXT: jl .LBB2_6 ; X64-NEXT: # %bb.4: ; X64-NEXT: cmovlq %r15, %rax ; X64-NEXT: .LBB2_5: # %exit -; X64-NEXT: shlq $47, %rax ; X64-NEXT: orq %rax, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 @@ -287,20 +287,21 @@ ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx ; X64-NEXT: pushq %rax -; X64-NEXT: movq %rsp, %rax -; X64-NEXT: movq $-1, %r12 -; X64-NEXT: sarq $63, %rax +; X64-NEXT: movq $-1, %rbp +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: testq $-4096, %rsp # imm = 0xF000 +; X64-NEXT: cmoveq %rbp, %rax ; X64-NEXT: testl %edi, %edi ; X64-NEXT: je .LBB3_2 ; X64-NEXT: # %bb.1: -; X64-NEXT: cmoveq %r12, %rax +; X64-NEXT: cmoveq %rbp, %rax ; X64-NEXT: jmp .LBB3_10 ; X64-NEXT: .LBB3_2: # %l1.header.preheader ; X64-NEXT: movq %r8, %r14 ; X64-NEXT: movq %rcx, %rbx -; X64-NEXT: movl %edx, %ebp +; X64-NEXT: movl %edx, %r12d ; X64-NEXT: movl %esi, %r15d -; X64-NEXT: cmovneq %r12, %rax +; X64-NEXT: cmovneq %rbp, %rax ; X64-NEXT: xorl %r13d, %r13d ; X64-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: testl %r15d, %r15d @@ -308,16 +309,16 @@ ; X64-NEXT: jmp .LBB3_4 ; X64-NEXT: .p2align 4, 0x90 ; X64-NEXT: .LBB3_12: -; X64-NEXT: cmovgeq %r12, %rax +; X64-NEXT: cmovgeq %rbp, %rax ; X64-NEXT: testl %r15d, %r15d ; X64-NEXT: jle .LBB3_4 ; X64-NEXT: .LBB3_5: # %l2.header.preheader -; X64-NEXT: cmovleq %r12, %rax +; X64-NEXT: cmovleq %rbp, %rax ; X64-NEXT: xorl %r15d, %r15d ; X64-NEXT: jmp .LBB3_6 ; X64-NEXT: .p2align 4, 0x90 ; X64-NEXT: .LBB3_11: # in Loop: Header=BB3_6 Depth=1 -; X64-NEXT: cmovgeq 
%r12, %rax
+; X64-NEXT: cmovgeq %rbp, %rax
; X64-NEXT: .LBB3_6: # %l2.header
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: movslq (%rbx), %rcx
@@ -325,39 +326,38 @@
; X64-NEXT: movq %rax, %rdx
; X64-NEXT: orq %r14, %rdx
; X64-NEXT: movl (%rdx,%rcx,4), %edi
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %rbp, %rax
; X64-NEXT: incl %r15d
-; X64-NEXT: cmpl %ebp, %r15d
+; X64-NEXT: cmpl %r12d, %r15d
; X64-NEXT: jl .LBB3_11
; X64-NEXT: # %bb.7:
-; X64-NEXT: cmovlq %r12, %rax
+; X64-NEXT: cmovlq %rbp, %rax
; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Reload
; X64-NEXT: jmp .LBB3_8
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB3_4:
-; X64-NEXT: cmovgq %r12, %rax
+; X64-NEXT: cmovgq %rbp, %rax
; X64-NEXT: .LBB3_8: # %l1.latch
; X64-NEXT: movslq (%rbx), %rcx
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: movq %rax, %rdx
; X64-NEXT: orq %r14, %rdx
; X64-NEXT: movl (%rdx,%rcx,4), %edi
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %rbp, %rax
; X64-NEXT: incl %r13d
; X64-NEXT: cmpl %r15d, %r13d
; X64-NEXT: jl .LBB3_12
; X64-NEXT: # %bb.9:
-; X64-NEXT: cmovlq %r12, %rax
+; X64-NEXT: cmovlq %rbp, %rax
; X64-NEXT: .LBB3_10: # %exit
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: addq $8, %rsp
; X64-NEXT: popq %rbx
@@ -469,62 +469,67 @@
; X64-LABEL: test_basic_eh:
; X64: # %bb.0: # %entry
; X64-NEXT: pushq %rbp
+; X64-NEXT: pushq %r15
; X64-NEXT: pushq %r14
; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: movq $-1, %rcx
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: pushq %rax
+; X64-NEXT: movq $-1, %rbx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %rbx, %rax
; X64-NEXT: cmpl $41, %edi
; X64-NEXT: jg .LBB4_1
; X64-NEXT: # %bb.2: # %thrower
; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: movq %rsi, %rbx
-; X64-NEXT: cmovgq %rcx, %rax
+; X64-NEXT: movq %rsi, %r15
+; X64-NEXT: cmovgq %rbx, %rax
; X64-NEXT: movslq %edi, %rcx
; X64-NEXT: movl (%rsi,%rcx,4), %ebp
; X64-NEXT: orl %eax, %ebp
; X64-NEXT: movl $4, %edi
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq __cxa_allocate_exception
-; X64-NEXT: movq %rsp, %rcx
-; X64-NEXT: sarq $63, %rcx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %rbx, %rcx
; X64-NEXT: movl %ebp, (%rax)
; X64-NEXT: .Ltmp0:
; X64-NEXT: xorl %esi, %esi
; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: shlq $47, %rcx
; X64-NEXT: movq %rax, %rdi
; X64-NEXT: orq %rcx, %rsp
; X64-NEXT: callq __cxa_throw
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %rbx, %rax
; X64-NEXT: .Ltmp1:
; X64-NEXT: jmp .LBB4_3
; X64-NEXT: .LBB4_1:
-; X64-NEXT: cmovleq %rcx, %rax
+; X64-NEXT: cmovleq %rbx, %rax
; X64-NEXT: .LBB4_3: # %exit
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
+; X64-NEXT: addq $8, %rsp
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r14
+; X64-NEXT: popq %r15
; X64-NEXT: popq %rbp
; X64-NEXT: retq
; X64-NEXT: .LBB4_4: # %lpad
; X64-NEXT: .Ltmp2:
-; X64-NEXT: movq %rsp, %rcx
-; X64-NEXT: sarq $63, %rcx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %rbx, %rcx
; X64-NEXT: movl (%rax), %eax
-; X64-NEXT: addl (%rbx), %eax
+; X64-NEXT: addl (%r15), %eax
; X64-NEXT: cltq
; X64-NEXT: orq %rcx, %rax
; X64-NEXT: movl (%r14,%rax,4), %edi
; X64-NEXT: orl %ecx, %edi
-; X64-NEXT: shlq $47, %rcx
; X64-NEXT: orq %rcx, %rsp
; X64-NEXT: callq sink
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %rbx, %rax
;
; X64-LFENCE-LABEL: test_basic_eh:
; X64-LFENCE: # %bb.0: # %entry
@@ -600,79 +605,79 @@
; X64: # %bb.0: # %entry
; X64-NEXT: pushq %r15
; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
-; X64-NEXT: pushq %rax
-; X64-NEXT: movq %rsp, %rax
; X64-NEXT: movq %rcx, %r15
; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rsi, %rbx
; X64-NEXT: movq %rdi, %r12
-; X64-NEXT: movq $-1, %rcx
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: movq $-1, %r13
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %r13, %rax
; X64-NEXT: orq %rax, %r12
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_float
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %r13, %rax
; X64-NEXT: orq %rax, %rbx
; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_double
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %r13, %rax
; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: cvtsd2ss %xmm0, %xmm0
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_float
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %r13, %rax
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: cvtss2sd %xmm0, %xmm0
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_double
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %r13, %rax
; X64-NEXT: orq %rax, %r14
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: cvtsi2ssl (%r14), %xmm0
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_float
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %r13, %rax
; X64-NEXT: orq %rax, %r15
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: cvtsi2sdq (%r15), %xmm0
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_double
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %r13, %rax
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: cvtsi2ssq (%r15), %xmm0
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_float
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %r13, %rax
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: cvtsi2sdl (%r14), %xmm0
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_double
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
-; X64-NEXT: shlq $47, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %r13, %rax
; X64-NEXT: orq %rax, %rsp
-; X64-NEXT: addq $8, %rsp
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r12
+; X64-NEXT: popq %r13
; X64-NEXT: popq %r14
; X64-NEXT: popq %r15
; X64-NEXT: retq
@@ -753,68 +758,72 @@
 define void @test_vec_loads(<4 x float>* %v4f32ptr, <2 x double>* %v2f64ptr, <16 x i8>* %v16i8ptr, <8 x i16>* %v8i16ptr, <4 x i32>* %v4i32ptr, <2 x i64>* %v2i64ptr) nounwind {
; X64-LABEL: test_vec_loads:
; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rbp
; X64-NEXT: pushq %r15
; X64-NEXT: pushq %r14
; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %rsp, %rax
+; X64-NEXT: pushq %rax
; X64-NEXT: movq %r9, %r14
; X64-NEXT: movq %r8, %r15
; X64-NEXT: movq %rcx, %r12
; X64-NEXT: movq %rdx, %r13
; X64-NEXT: movq %rsi, %rbx
-; X64-NEXT: movq $-1, %rcx
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: movq $-1, %rbp
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %rbp, %rax
; X64-NEXT: orq %rax, %rdi
; X64-NEXT: movaps (%rdi), %xmm0
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_v4f32
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %rbp, %rax
; X64-NEXT: orq %rax, %rbx
; X64-NEXT: movaps (%rbx), %xmm0
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_v2f64
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %rbp, %rax
; X64-NEXT: orq %rax, %r13
; X64-NEXT: movaps (%r13), %xmm0
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_v16i8
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %rbp, %rax
; X64-NEXT: orq %rax, %r12
; X64-NEXT: movaps (%r12), %xmm0
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_v8i16
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %rbp, %rax
; X64-NEXT: orq %rax, %r15
; X64-NEXT: movaps (%r15), %xmm0
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_v4i32
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %rbp, %rax
; X64-NEXT: orq %rax, %r14
; X64-NEXT: movaps (%r14), %xmm0
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink_v2i64
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
-; X64-NEXT: shlq $47, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %rbp, %rax
; X64-NEXT: orq %rax, %rsp
+; X64-NEXT: addq $8, %rsp
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r12
; X64-NEXT: popq %r13
; X64-NEXT: popq %r14
; X64-NEXT: popq %r15
+; X64-NEXT: popq %rbp
; X64-NEXT: retq
;
; X64-LFENCE-LABEL: test_vec_loads:
@@ -866,65 +875,65 @@
 define void @test_deferred_hardening(i32* %ptr1, i32* %ptr2, i32 %x) nounwind {
; X64-LABEL: test_deferred_hardening:
; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %r15
; X64-NEXT: pushq %r14
; X64-NEXT: pushq %rbx
-; X64-NEXT: pushq %rax
-; X64-NEXT: movq %rsp, %rax
; X64-NEXT: movq %rsi, %r14
; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: movq $-1, %rcx
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: movq $-1, %r15
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %r15, %rax
; X64-NEXT: movl (%rdi), %edi
; X64-NEXT: incl %edi
; X64-NEXT: imull %edx, %edi
; X64-NEXT: orl %eax, %edi
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %r15, %rax
; X64-NEXT: movl (%rbx), %ecx
; X64-NEXT: movl (%r14), %edx
; X64-NEXT: leal 1(%rcx,%rdx), %edi
; X64-NEXT: orl %eax, %edi
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %r15, %rax
; X64-NEXT: movl (%rbx), %edi
; X64-NEXT: shll $7, %edi
; X64-NEXT: orl %eax, %edi
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %r15, %rax
; X64-NEXT: movzwl (%rbx), %ecx
; X64-NEXT: sarw $7, %cx
; X64-NEXT: movzwl %cx, %edi
; X64-NEXT: notl %edi
; X64-NEXT: orl %eax, %edi
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %r15, %rax
; X64-NEXT: movzwl (%rbx), %ecx
; X64-NEXT: rolw $9, %cx
; X64-NEXT: movswl %cx, %edi
; X64-NEXT: negl %edi
; X64-NEXT: orl %eax, %edi
-; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: callq sink
-; X64-NEXT: movq %rsp, %rax
-; X64-NEXT: sarq $63, %rax
-; X64-NEXT: shlq $47, %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq $-4096, %rsp # imm = 0xF000
+; X64-NEXT: cmoveq %r15, %rax
; X64-NEXT: orq %rax, %rsp
-; X64-NEXT: addq $8, %rsp
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r14
+; X64-NEXT: popq %r15
; X64-NEXT: retq
;
; X64-LFENCE-LABEL: test_deferred_hardening: