diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -127,6 +127,12 @@
     /// current block.
     DenseSet<DebugVariable> SeenDbgVars;
 
+    std::map<std::pair<MachineBasicBlock *, MachineBasicBlock *>, bool>
+        HasStoreCache;
+    std::map<std::pair<MachineBasicBlock *, MachineBasicBlock *>,
+             std::vector<MachineInstr *>>
+        StoreInstrCache;
+
   public:
     static char ID; // Pass identification
 
@@ -159,6 +165,9 @@
                              MachineBasicBlock *From,
                              MachineBasicBlock *To);
 
+    bool hasStoreBetween(MachineBasicBlock *From, MachineBasicBlock *To,
+                         MachineInstr &MI);
+
     /// Postpone the splitting of the given critical
     /// edge (\p From, \p To).
     ///
@@ -336,6 +345,8 @@
   // Process all basic blocks.
   CEBCandidates.clear();
   ToSplit.clear();
+  StoreInstrCache.clear();
+  HasStoreCache.clear();
 
   for (auto &MBB: MF)
     MadeChange |= ProcessBlock(MBB);
 
@@ -874,6 +885,65 @@
   }
 }
 
+/// hasStoreBetween - Check if there is a store between the straight-line
+/// blocks From and To.
+bool MachineSinking::hasStoreBetween(MachineBasicBlock *From,
+                                     MachineBasicBlock *To, MachineInstr &MI) {
+  // Make sure From and To are in a straight line, i.e. From dominates To and
+  // To post dominates From.
+  if (!DT->dominates(From, To) || !PDT->dominates(To, From))
+    return true;
+
+  auto BlockPair = std::make_pair(From, To);
+
+  // Has this block pair been queried before with a definite cached result?
+  if (HasStoreCache.find(BlockPair) != HasStoreCache.end())
+    return HasStoreCache[BlockPair];
+
+  if (StoreInstrCache.find(BlockPair) != StoreInstrCache.end())
+    return std::any_of(
+        StoreInstrCache[BlockPair].begin(), StoreInstrCache[BlockPair].end(),
+        [&](MachineInstr *I) { return I->mayAlias(AA, MI, false); });
+
+  bool SawStore = false;
+  bool HasAliasedStore = false;
+  // Go through all blocks reachable from From.
+  for (MachineBasicBlock *BB : depth_first(From)) {
+    // We insert the instruction at the start of block To, so no need to worry
+    // about stores inside To. Stores in block From have already been
+    // accounted for before SinkInstruction is entered.
+    if (BB == To || BB == From)
+      continue;
+    // To post dominates BB, so BB lies on a path from From to To.
+    if (PDT->dominates(To, BB)) {
+      for (MachineInstr &I : *BB) {
+        // Conservatively treat a call as aliasing.
+        if (I.isCall()) {
+          HasStoreCache[BlockPair] = true;
+          return true;
+        }
+
+        if (I.mayStore() || (I.mayLoad() && I.hasOrderedMemoryRef())) {
+          SawStore = true;
+          // We still have a chance to sink MI if none of the stores in
+          // between alias with MI.
+          // Cache the aliasing store instructions, so that we don't need to
+          // walk all blocks reachable from From for the next load instruction.
+          if (I.mayAlias(AA, MI, false)) {
+            StoreInstrCache[BlockPair].push_back(&I);
+            HasAliasedStore = true;
+          }
+        }
+      }
+    }
+  }
+  // If there is no store at all, cache the result.
+  if (!SawStore)
+    HasStoreCache[BlockPair] = false;
+  return HasAliasedStore;
+}
+
 /// SinkInstruction - Determine whether it is safe to sink the specified machine
 /// instruction out of its current block into a successor.
 bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
@@ -934,8 +1004,9 @@
     // We cannot sink a load across a critical edge - there may be stores in
    // other code paths.
     bool TryBreak = false;
-    bool store = true;
-    if (!MI.isSafeToMove(AA, store)) {
+    bool Store =
+        MI.mayLoad() ?
hasStoreBetween(ParentBlock, SuccToSinkTo, MI) : true; + if (!MI.isSafeToMove(AA, Store)) { LLVM_DEBUG(dbgs() << " *** NOTE: Won't sink load along critical edge.\n"); TryBreak = true; } diff --git a/llvm/test/CodeGen/RISCV/select-optimize-multiple.ll b/llvm/test/CodeGen/RISCV/select-optimize-multiple.ll --- a/llvm/test/CodeGen/RISCV/select-optimize-multiple.ll +++ b/llvm/test/CodeGen/RISCV/select-optimize-multiple.ll @@ -40,40 +40,39 @@ ; RV32I-LABEL: cmovcc128: ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: xori a1, a1, 123 -; RV32I-NEXT: or a2, a1, a2 -; RV32I-NEXT: mv a1, a3 -; RV32I-NEXT: beqz a2, .LBB1_2 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: beqz a1, .LBB1_2 ; RV32I-NEXT: # %bb.1: # %entry -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a2, a4 ; RV32I-NEXT: .LBB1_2: # %entry -; RV32I-NEXT: lw a6, 0(a1) -; RV32I-NEXT: beqz a2, .LBB1_6 +; RV32I-NEXT: beqz a1, .LBB1_5 ; RV32I-NEXT: # %bb.3: # %entry -; RV32I-NEXT: addi a1, a4, 4 -; RV32I-NEXT: lw a5, 0(a1) -; RV32I-NEXT: bnez a2, .LBB1_7 +; RV32I-NEXT: addi a7, a4, 4 +; RV32I-NEXT: bnez a1, .LBB1_6 ; RV32I-NEXT: .LBB1_4: -; RV32I-NEXT: addi a1, a3, 8 -; RV32I-NEXT: lw a1, 0(a1) -; RV32I-NEXT: bnez a2, .LBB1_8 +; RV32I-NEXT: addi a5, a3, 8 +; RV32I-NEXT: j .LBB1_7 ; RV32I-NEXT: .LBB1_5: -; RV32I-NEXT: addi a2, a3, 12 -; RV32I-NEXT: j .LBB1_9 -; RV32I-NEXT: .LBB1_6: -; RV32I-NEXT: addi a1, a3, 4 -; RV32I-NEXT: lw a5, 0(a1) -; RV32I-NEXT: beqz a2, .LBB1_4 +; RV32I-NEXT: addi a7, a3, 4 +; RV32I-NEXT: beqz a1, .LBB1_4 +; RV32I-NEXT: .LBB1_6: # %entry +; RV32I-NEXT: addi a5, a4, 8 ; RV32I-NEXT: .LBB1_7: # %entry -; RV32I-NEXT: addi a1, a4, 8 +; RV32I-NEXT: lw a6, 0(a2) +; RV32I-NEXT: lw a7, 0(a7) +; RV32I-NEXT: lw a2, 0(a5) +; RV32I-NEXT: beqz a1, .LBB1_9 +; RV32I-NEXT: # %bb.8: # %entry +; RV32I-NEXT: addi a1, a4, 12 +; RV32I-NEXT: j .LBB1_10 +; RV32I-NEXT: .LBB1_9: +; RV32I-NEXT: addi a1, a3, 12 +; RV32I-NEXT: .LBB1_10: # %entry ; RV32I-NEXT: lw a1, 0(a1) -; RV32I-NEXT: beqz a2, .LBB1_5 -; RV32I-NEXT: .LBB1_8: # %entry -; RV32I-NEXT: addi a2, a4, 12 -; RV32I-NEXT: .LBB1_9: # %entry -; RV32I-NEXT: lw a2, 0(a2) -; RV32I-NEXT: sw a2, 12(a0) -; RV32I-NEXT: sw a1, 8(a0) -; RV32I-NEXT: sw a5, 4(a0) +; RV32I-NEXT: sw a1, 12(a0) +; RV32I-NEXT: sw a2, 8(a0) +; RV32I-NEXT: sw a7, 4(a0) ; RV32I-NEXT: sw a6, 0(a0) ; RV32I-NEXT: ret ; @@ -124,40 +123,39 @@ define i128 @cmov128(i1 %a, i128 %b, i128 %c) nounwind { ; RV32I-LABEL: cmov128: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: andi a4, a1, 1 -; RV32I-NEXT: mv a1, a2 -; RV32I-NEXT: bnez a4, .LBB3_2 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: mv a4, a2 +; RV32I-NEXT: bnez a1, .LBB3_2 ; RV32I-NEXT: # %bb.1: # %entry -; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: mv a4, a3 ; RV32I-NEXT: .LBB3_2: # %entry -; RV32I-NEXT: lw a6, 0(a1) -; RV32I-NEXT: bnez a4, .LBB3_6 +; RV32I-NEXT: bnez a1, .LBB3_5 ; RV32I-NEXT: # %bb.3: # %entry -; RV32I-NEXT: addi a1, a3, 4 -; RV32I-NEXT: lw a5, 0(a1) -; RV32I-NEXT: beqz a4, .LBB3_7 +; RV32I-NEXT: addi a7, a3, 4 +; RV32I-NEXT: beqz a1, .LBB3_6 ; RV32I-NEXT: .LBB3_4: -; RV32I-NEXT: addi a1, a2, 8 -; RV32I-NEXT: lw a1, 0(a1) -; RV32I-NEXT: beqz a4, .LBB3_8 +; RV32I-NEXT: addi a5, a2, 8 +; RV32I-NEXT: j .LBB3_7 ; RV32I-NEXT: .LBB3_5: -; RV32I-NEXT: addi a2, a2, 12 -; RV32I-NEXT: j .LBB3_9 -; RV32I-NEXT: .LBB3_6: -; RV32I-NEXT: addi a1, a2, 4 -; RV32I-NEXT: lw a5, 0(a1) -; RV32I-NEXT: bnez a4, .LBB3_4 +; RV32I-NEXT: addi a7, a2, 4 +; RV32I-NEXT: bnez a1, .LBB3_4 +; RV32I-NEXT: .LBB3_6: # %entry +; RV32I-NEXT: addi a5, a3, 8 ; RV32I-NEXT: .LBB3_7: # %entry -; 
RV32I-NEXT: addi a1, a3, 8 +; RV32I-NEXT: lw a6, 0(a4) +; RV32I-NEXT: lw a7, 0(a7) +; RV32I-NEXT: lw a4, 0(a5) +; RV32I-NEXT: bnez a1, .LBB3_9 +; RV32I-NEXT: # %bb.8: # %entry +; RV32I-NEXT: addi a1, a3, 12 +; RV32I-NEXT: j .LBB3_10 +; RV32I-NEXT: .LBB3_9: +; RV32I-NEXT: addi a1, a2, 12 +; RV32I-NEXT: .LBB3_10: # %entry ; RV32I-NEXT: lw a1, 0(a1) -; RV32I-NEXT: bnez a4, .LBB3_5 -; RV32I-NEXT: .LBB3_8: # %entry -; RV32I-NEXT: addi a2, a3, 12 -; RV32I-NEXT: .LBB3_9: # %entry -; RV32I-NEXT: lw a2, 0(a2) -; RV32I-NEXT: sw a2, 12(a0) -; RV32I-NEXT: sw a1, 8(a0) -; RV32I-NEXT: sw a5, 4(a0) +; RV32I-NEXT: sw a1, 12(a0) +; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sw a7, 4(a0) ; RV32I-NEXT: sw a6, 0(a0) ; RV32I-NEXT: ret ; diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -9,7 +9,7 @@ ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dlstp.8 lr, r2 -; CHECK: .LBB0_2: @ %vector.body +; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16 ; CHECK-NEXT: vldrb.u8 q2, [r0], #16 @@ -75,7 +75,7 @@ ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #3 ; CHECK-NEXT: dls lr, lr -; CHECK: .LBB1_2: @ %vector.body +; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 ; CHECK-NEXT: vmov q0, q1 @@ -148,7 +148,7 @@ ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #4 ; CHECK-NEXT: dls lr, lr -; CHECK: .LBB2_2: @ %vector.body +; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.8 r2 ; CHECK-NEXT: vmov q0, q1 @@ -218,7 +218,7 @@ ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #3 ; CHECK-NEXT: dls lr, lr -; CHECK: .LBB3_2: @ %vector.body +; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 ; CHECK-NEXT: vmov q0, q1 @@ -290,7 +290,7 @@ ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #4 ; CHECK-NEXT: dls lr, lr -; CHECK: .LBB4_2: @ %vector.body +; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.8 r2 ; CHECK-NEXT: vmov q0, q1 @@ -360,7 +360,7 @@ ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #3 ; CHECK-NEXT: dls lr, lr -; CHECK: .LBB5_2: @ %vector.body +; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 ; CHECK-NEXT: vmov q0, q1 @@ -432,7 +432,7 @@ ; CHECK-NEXT: add.w lr, r3, r6, lsr #2 ; CHECK-NEXT: mov r3, r2 ; CHECK-NEXT: dls lr, lr -; CHECK: .LBB6_2: @ %vector.body +; CHECK-NEXT: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vmov q0, q1 @@ -454,7 +454,7 @@ ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: vmov.32 q0[0], r12 -; CHECK: .LBB6_5: @ %vector.body46 +; CHECK-NEXT: .LBB6_5: @ %vector.body46 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: vmov q1, q0 @@ -559,7 +559,7 @@ ; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: mov r4, r1 ; CHECK-NEXT: dls lr, lr -; CHECK: .LBB7_2: @ %vector.body +; CHECK-NEXT: .LBB7_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 ; CHECK-NEXT: vmov q0, q1 @@ -640,68 
+640,67 @@ define i32 @wrongop(%struct.date* nocapture readonly %pd) { ; CHECK-LABEL: wrongop: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: ldr.w r2, [r12, #8] -; CHECK-NEXT: lsls r3, r2, #30 +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldr r1, [r0, #8] +; CHECK-NEXT: lsls r2, r1, #30 ; CHECK-NEXT: bne .LBB8_3 ; CHECK-NEXT: @ %bb.1: @ %entry -; CHECK-NEXT: movw r3, #34079 -; CHECK-NEXT: movt r3, #20971 -; CHECK-NEXT: smmul r3, r2, r3 -; CHECK-NEXT: asrs r1, r3, #5 -; CHECK-NEXT: add.w r1, r1, r3, lsr #31 +; CHECK-NEXT: movw r2, #34079 +; CHECK-NEXT: movt r2, #20971 +; CHECK-NEXT: smmul r2, r1, r2 +; CHECK-NEXT: asrs r3, r2, #5 +; CHECK-NEXT: add.w r2, r3, r2, lsr #31 ; CHECK-NEXT: movs r3, #100 -; CHECK-NEXT: mls r1, r1, r3, r2 -; CHECK-NEXT: cbz r1, .LBB8_3 +; CHECK-NEXT: mls r2, r2, r3, r1 +; CHECK-NEXT: cbz r2, .LBB8_3 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r4, #1 +; CHECK-NEXT: mov.w r12, #1 ; CHECK-NEXT: b .LBB8_4 ; CHECK-NEXT: .LBB8_3: @ %lor.rhs -; CHECK-NEXT: movw r1, #47184 +; CHECK-NEXT: movw r2, #47184 ; CHECK-NEXT: movw r3, #23593 -; CHECK-NEXT: movt r1, #1310 +; CHECK-NEXT: movt r2, #1310 ; CHECK-NEXT: movt r3, #49807 -; CHECK-NEXT: mla r1, r2, r3, r1 +; CHECK-NEXT: mla r1, r1, r3, r2 ; CHECK-NEXT: movw r2, #55051 ; CHECK-NEXT: movt r2, #163 ; CHECK-NEXT: ror.w r1, r1, #4 ; CHECK-NEXT: cmp r1, r2 -; CHECK-NEXT: cset r4, lo +; CHECK-NEXT: cset r12, lo ; CHECK-NEXT: .LBB8_4: @ %lor.end -; CHECK-NEXT: ldr.w r3, [r12, #4] +; CHECK-NEXT: ldrd r2, r3, [r0] ; CHECK-NEXT: cmp r3, #1 -; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r4, pc} -; CHECK-NEXT: .LBB8_5: @ %vector.ph -; CHECK-NEXT: adds r1, r3, #3 -; CHECK-NEXT: movs r2, #1 -; CHECK-NEXT: bic r1, r1, #3 -; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: add.w lr, r2, r1, lsr #2 -; CHECK-NEXT: movw r1, :lower16:days -; CHECK-NEXT: movt r1, :upper16:days -; CHECK-NEXT: movs r2, #52 -; CHECK-NEXT: mla r1, r4, r2, r1 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: vdup.32 q0, r2 +; CHECK-NEXT: blt .LBB8_8 +; CHECK-NEXT: @ %bb.5: @ %vector.ph +; CHECK-NEXT: adds r0, r3, #3 +; CHECK-NEXT: movs r1, #1 +; CHECK-NEXT: bic r0, r0, #3 +; CHECK-NEXT: subs r0, #4 +; CHECK-NEXT: add.w lr, r1, r0, lsr #2 +; CHECK-NEXT: movw r0, :lower16:days +; CHECK-NEXT: movt r0, :upper16:days +; CHECK-NEXT: movs r1, #52 +; CHECK-NEXT: mla r0, r12, r1, r0 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: vdup.32 q0, r1 +; CHECK-NEXT: subs r1, r3, #1 +; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: subs r0, r3, #1 -; CHECK: .LBB8_6: @ %vector.body +; CHECK-NEXT: .LBB8_6: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r0 +; CHECK-NEXT: vctp.32 r1 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r1], #16 -; CHECK-NEXT: subs r0, #4 +; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 +; CHECK-NEXT: subs r1, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: le lr, .LBB8_6 ; CHECK-NEXT: @ %bb.7: @ %middle.block ; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: vaddv.u32 r2, q0 +; CHECK-NEXT: .LBB8_8: @ %for.end +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: pop {r7, pc} entry: %day1 = getelementptr inbounds %struct.date, %struct.date* %pd, i32 0, i32 0 %0 = load i32, i32* %day1, align 4 diff --git a/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll b/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll --- 
a/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll +++ b/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll @@ -14,40 +14,40 @@ ; CHECK-NEXT: .cfi_offset %rbp, -16 ; CHECK-NEXT: movq %rsp, %rbp ; CHECK-NEXT: .cfi_def_cfa_register %rbp -; CHECK-NEXT: movslq (%rdi), %rax +; CHECK-NEXT: movslq (%rdi), %rdi ; CHECK-NEXT: movslq (%rsi), %r8 ; CHECK-NEXT: movslq (%rdx), %r10 -; CHECK-NEXT: movl (%rcx), %edi -; CHECK-NEXT: movslq (%r9), %rcx -; CHECK-NEXT: movq %rsp, %rdx -; CHECK-NEXT: subl %eax, %r8d -; CHECK-NEXT: movslq %r8d, %rsi +; CHECK-NEXT: movl (%rcx), %esi +; CHECK-NEXT: movq %rsp, %rcx +; CHECK-NEXT: subl %edi, %r8d +; CHECK-NEXT: movslq %r8d, %rdx ; CHECK-NEXT: js .LBB0_1 ; CHECK-NEXT: # %bb.11: # %b63 -; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: js .LBB0_14 ; CHECK-NEXT: # %bb.12: -; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_13: # %a25b ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: testb %dil, %dil ; CHECK-NEXT: je .LBB0_13 ; CHECK-NEXT: .LBB0_14: # %b85 ; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: # %bb.15: -; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_16: # %a25b140 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: testb %dil, %dil ; CHECK-NEXT: je .LBB0_16 ; CHECK-NEXT: .LBB0_1: # %a29b -; CHECK-NEXT: cmpl %r10d, %edi +; CHECK-NEXT: cmpl %r10d, %esi ; CHECK-NEXT: js .LBB0_10 ; CHECK-NEXT: # %bb.2: # %b158 +; CHECK-NEXT: movslq (%r9), %rsi ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: movb $1, %r10b @@ -77,7 +77,7 @@ ; CHECK-NEXT: js .LBB0_4 ; CHECK-NEXT: # %bb.17: # %b179 ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: js .LBB0_18 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_37: # %a30b @@ -97,7 +97,7 @@ ; CHECK-NEXT: je .LBB0_19 ; CHECK-NEXT: .LBB0_4: # %a33b ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: orl %r8d, %eax ; CHECK-NEXT: movl %eax, %r9d ; CHECK-NEXT: shrl $31, %r9d @@ -106,7 +106,7 @@ ; CHECK-NEXT: .LBB0_5: # %a50b ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: movl %r8d, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: orl %esi, %eax ; CHECK-NEXT: movl %eax, %r11d ; CHECK-NEXT: shrl $31, %r11d ; CHECK-NEXT: testl %eax, %eax @@ -156,7 +156,7 @@ ; CHECK-NEXT: # Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: # => This Loop Header: Depth=2 ; CHECK-NEXT: # Child Loop BB0_21 Depth 3 -; CHECK-NEXT: testq %rcx, %rcx +; CHECK-NEXT: testq %rsi, %rsi ; CHECK-NEXT: js .LBB0_22 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_21: # %a35b @@ -169,14 +169,14 @@ ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_28: # %b1016 ; CHECK-NEXT: # in Loop: Header=BB0_26 Depth=2 -; CHECK-NEXT: testq %rcx, %rcx +; CHECK-NEXT: testq %rsi, %rsi ; CHECK-NEXT: jle .LBB0_6 ; CHECK-NEXT: .LBB0_26: # %b858 ; CHECK-NEXT: # Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: # => This Loop Header: Depth=2 ; CHECK-NEXT: # Child Loop BB0_38 Depth 3 ; CHECK-NEXT: # Child Loop BB0_29 Depth 3 -; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: js .LBB0_27 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_38: # %a53b @@ -194,38 +194,38 @@ ; CHECK-NEXT: # Parent Loop BB0_3 Depth=1 ; 
CHECK-NEXT: # Parent Loop BB0_26 Depth=2 ; CHECK-NEXT: # => This Inner Loop Header: Depth=3 -; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: jle .LBB0_29 ; CHECK-NEXT: jmp .LBB0_28 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_32: # %b1263 ; CHECK-NEXT: # in Loop: Header=BB0_30 Depth=2 -; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: jle .LBB0_7 ; CHECK-NEXT: .LBB0_30: # %b1117 ; CHECK-NEXT: # Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: # => This Loop Header: Depth=2 ; CHECK-NEXT: # Child Loop BB0_39 Depth 3 ; CHECK-NEXT: # Child Loop BB0_33 Depth 3 -; CHECK-NEXT: testq %rcx, %rcx +; CHECK-NEXT: testq %rsi, %rsi ; CHECK-NEXT: js .LBB0_31 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_39: # %a63b ; CHECK-NEXT: # Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: # Parent Loop BB0_30 Depth=2 ; CHECK-NEXT: # => This Inner Loop Header: Depth=3 -; CHECK-NEXT: testq %rcx, %rcx +; CHECK-NEXT: testq %rsi, %rsi ; CHECK-NEXT: jle .LBB0_39 ; CHECK-NEXT: .LBB0_31: # %b1139 ; CHECK-NEXT: # in Loop: Header=BB0_30 Depth=2 -; CHECK-NEXT: testq %rcx, %rcx +; CHECK-NEXT: testq %rsi, %rsi ; CHECK-NEXT: jle .LBB0_32 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_33: # %a63b1266 ; CHECK-NEXT: # Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: # Parent Loop BB0_30 Depth=2 ; CHECK-NEXT: # => This Inner Loop Header: Depth=3 -; CHECK-NEXT: testq %rcx, %rcx +; CHECK-NEXT: testq %rsi, %rsi ; CHECK-NEXT: jle .LBB0_33 ; CHECK-NEXT: jmp .LBB0_32 ; CHECK-NEXT: .p2align 4, 0x90 @@ -237,7 +237,7 @@ ; CHECK-NEXT: # Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: # => This Loop Header: Depth=2 ; CHECK-NEXT: # Child Loop BB0_24 Depth 3 -; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: js .LBB0_25 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_24: # %a45b diff --git a/llvm/test/CodeGen/X86/MachineSink-eflags.ll b/llvm/test/CodeGen/X86/MachineSink-eflags.ll --- a/llvm/test/CodeGen/X86/MachineSink-eflags.ll +++ b/llvm/test/CodeGen/X86/MachineSink-eflags.ll @@ -16,31 +16,30 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $152, %rsp ; CHECK-NEXT: movq 48(%rdi), %rax -; CHECK-NEXT: movl 64(%rdi), %edx +; CHECK-NEXT: movl 64(%rdi), %ecx ; CHECK-NEXT: movl $200, %esi ; CHECK-NEXT: addl 68(%rdi), %esi -; CHECK-NEXT: imull $46, %edx, %ecx -; CHECK-NEXT: addq %rsi, %rcx -; CHECK-NEXT: shlq $4, %rcx -; CHECK-NEXT: imull $47, %edx, %edx +; CHECK-NEXT: imull $46, %ecx, %edx ; CHECK-NEXT: addq %rsi, %rdx ; CHECK-NEXT: shlq $4, %rdx -; CHECK-NEXT: movaps (%rax,%rdx), %xmm0 +; CHECK-NEXT: imull $47, %ecx, %ecx +; CHECK-NEXT: addq %rsi, %rcx +; CHECK-NEXT: shlq $4, %rcx ; CHECK-NEXT: cmpl $0, (%rdi) ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: # %bb.2: # %entry -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: je .LBB0_4 -; CHECK-NEXT: jmp .LBB0_5 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: jmp .LBB0_3 ; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: movaps (%rax,%rdx), %xmm0 +; CHECK-NEXT: .LBB0_3: # %entry ; CHECK-NEXT: movaps (%rax,%rcx), %xmm1 -; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: jne .LBB0_5 -; CHECK-NEXT: .LBB0_4: # %entry -; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: # %bb.4: # %entry +; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: .LBB0_5: # %entry -; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: addq $152, %rsp ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/avx2-masked-gather.ll 
b/llvm/test/CodeGen/X86/avx2-masked-gather.ll --- a/llvm/test/CodeGen/X86/avx2-masked-gather.ll +++ b/llvm/test/CodeGen/X86/avx2-masked-gather.ll @@ -358,44 +358,57 @@ ; ; NOGATHER-LABEL: masked_gather_v8i32: ; NOGATHER: # %bb.0: # %entry -; NOGATHER-NEXT: vmovdqa (%rdi), %ymm3 -; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm2 +; NOGATHER-NEXT: vmovdqa (%rdi), %ymm2 ; NOGATHER-NEXT: vpsllw $15, %xmm0, %xmm0 ; NOGATHER-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; NOGATHER-NEXT: vpmovmskb %xmm0, %eax ; NOGATHER-NEXT: testb $1, %al ; NOGATHER-NEXT: je .LBB6_2 ; NOGATHER-NEXT: # %bb.1: # %cond.load -; NOGATHER-NEXT: vmovq %xmm3, %rcx +; NOGATHER-NEXT: vmovq %xmm2, %rcx ; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm1, %xmm0 ; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; NOGATHER-NEXT: .LBB6_2: # %else ; NOGATHER-NEXT: testb $2, %al ; NOGATHER-NEXT: je .LBB6_4 ; NOGATHER-NEXT: # %bb.3: # %cond.load1 -; NOGATHER-NEXT: vpextrq $1, %xmm3, %rcx +; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx ; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm0 ; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; NOGATHER-NEXT: .LBB6_4: # %else2 -; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm0 +; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0 ; NOGATHER-NEXT: testb $4, %al -; NOGATHER-NEXT: jne .LBB6_5 -; NOGATHER-NEXT: # %bb.6: # %else5 +; NOGATHER-NEXT: je .LBB6_6 +; NOGATHER-NEXT: # %bb.5: # %cond.load4 +; NOGATHER-NEXT: vmovq %xmm0, %rcx +; NOGATHER-NEXT: vpinsrd $2, (%rcx), %xmm1, %xmm2 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; NOGATHER-NEXT: .LBB6_6: # %else5 ; NOGATHER-NEXT: testb $8, %al -; NOGATHER-NEXT: jne .LBB6_7 +; NOGATHER-NEXT: je .LBB6_8 +; NOGATHER-NEXT: # %bb.7: # %cond.load7 +; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx +; NOGATHER-NEXT: vpinsrd $3, (%rcx), %xmm1, %xmm0 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; NOGATHER-NEXT: .LBB6_8: # %else8 +; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm0 ; NOGATHER-NEXT: testb $16, %al -; NOGATHER-NEXT: jne .LBB6_9 +; NOGATHER-NEXT: je .LBB6_10 +; NOGATHER-NEXT: # %bb.9: # %cond.load10 +; NOGATHER-NEXT: vmovq %xmm0, %rcx +; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 +; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2 +; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; NOGATHER-NEXT: .LBB6_10: # %else11 ; NOGATHER-NEXT: testb $32, %al ; NOGATHER-NEXT: je .LBB6_12 -; NOGATHER-NEXT: .LBB6_11: # %cond.load13 -; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 -; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0 -; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; NOGATHER-NEXT: # %bb.11: # %cond.load13 +; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx +; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 +; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm2, %xmm2 +; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; NOGATHER-NEXT: .LBB6_12: # %else14 -; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0 +; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0 ; NOGATHER-NEXT: testb $64, %al ; NOGATHER-NEXT: jne .LBB6_13 ; NOGATHER-NEXT: # %bb.14: # %else17 @@ -404,26 +417,6 @@ ; NOGATHER-NEXT: .LBB6_16: # %else20 ; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 ; NOGATHER-NEXT: retq -; NOGATHER-NEXT: .LBB6_5: # %cond.load4 -; NOGATHER-NEXT: vmovq %xmm0, %rcx -; NOGATHER-NEXT: vpinsrd $2, (%rcx), %xmm1, %xmm3 -; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; NOGATHER-NEXT: testb $8, %al -; NOGATHER-NEXT: je .LBB6_8 -; NOGATHER-NEXT: .LBB6_7: # %cond.load7 -; NOGATHER-NEXT: vpextrq 
$1, %xmm0, %rcx -; NOGATHER-NEXT: vpinsrd $3, (%rcx), %xmm1, %xmm0 -; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; NOGATHER-NEXT: testb $16, %al -; NOGATHER-NEXT: je .LBB6_10 -; NOGATHER-NEXT: .LBB6_9: # %cond.load10 -; NOGATHER-NEXT: vmovq %xmm2, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 -; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm0, %xmm0 -; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; NOGATHER-NEXT: testb $32, %al -; NOGATHER-NEXT: jne .LBB6_11 -; NOGATHER-NEXT: jmp .LBB6_12 ; NOGATHER-NEXT: .LBB6_13: # %cond.load16 ; NOGATHER-NEXT: vmovq %xmm0, %rcx ; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 @@ -472,44 +465,58 @@ ; ; NOGATHER-LABEL: masked_gather_v8float: ; NOGATHER: # %bb.0: # %entry -; NOGATHER-NEXT: vmovdqa (%rdi), %ymm3 -; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm2 +; NOGATHER-NEXT: vmovdqa (%rdi), %ymm2 ; NOGATHER-NEXT: vpsllw $15, %xmm0, %xmm0 ; NOGATHER-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; NOGATHER-NEXT: vpmovmskb %xmm0, %eax ; NOGATHER-NEXT: testb $1, %al ; NOGATHER-NEXT: je .LBB7_2 ; NOGATHER-NEXT: # %bb.1: # %cond.load -; NOGATHER-NEXT: vmovq %xmm3, %rcx +; NOGATHER-NEXT: vmovq %xmm2, %rcx ; NOGATHER-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7] ; NOGATHER-NEXT: .LBB7_2: # %else ; NOGATHER-NEXT: testb $2, %al ; NOGATHER-NEXT: je .LBB7_4 ; NOGATHER-NEXT: # %bb.3: # %cond.load1 -; NOGATHER-NEXT: vpextrq $1, %xmm3, %rcx +; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx ; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],mem[0],xmm1[2,3] ; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; NOGATHER-NEXT: .LBB7_4: # %else2 -; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm0 +; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0 ; NOGATHER-NEXT: testb $4, %al -; NOGATHER-NEXT: jne .LBB7_5 -; NOGATHER-NEXT: # %bb.6: # %else5 +; NOGATHER-NEXT: je .LBB7_6 +; NOGATHER-NEXT: # %bb.5: # %cond.load4 +; NOGATHER-NEXT: vmovq %xmm0, %rcx +; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm1[0,1],mem[0],xmm1[3] +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; NOGATHER-NEXT: .LBB7_6: # %else5 ; NOGATHER-NEXT: testb $8, %al -; NOGATHER-NEXT: jne .LBB7_7 +; NOGATHER-NEXT: je .LBB7_8 +; NOGATHER-NEXT: # %bb.7: # %cond.load7 +; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx +; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],mem[0] +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; NOGATHER-NEXT: .LBB7_8: # %else8 +; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm0 ; NOGATHER-NEXT: testb $16, %al -; NOGATHER-NEXT: jne .LBB7_9 +; NOGATHER-NEXT: je .LBB7_10 +; NOGATHER-NEXT: # %bb.9: # %cond.load10 +; NOGATHER-NEXT: vmovq %xmm0, %rcx +; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 +; NOGATHER-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; NOGATHER-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] +; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; NOGATHER-NEXT: .LBB7_10: # %else11 ; NOGATHER-NEXT: testb $32, %al ; NOGATHER-NEXT: je .LBB7_12 -; NOGATHER-NEXT: .LBB7_11: # %cond.load13 -; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 -; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; NOGATHER-NEXT: # %bb.11: # %cond.load13 +; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx +; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 +; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] +; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 
; NOGATHER-NEXT: .LBB7_12: # %else14 -; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0 +; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0 ; NOGATHER-NEXT: testb $64, %al ; NOGATHER-NEXT: jne .LBB7_13 ; NOGATHER-NEXT: # %bb.14: # %else17 @@ -518,27 +525,6 @@ ; NOGATHER-NEXT: .LBB7_16: # %else20 ; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 ; NOGATHER-NEXT: retq -; NOGATHER-NEXT: .LBB7_5: # %cond.load4 -; NOGATHER-NEXT: vmovq %xmm0, %rcx -; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1],mem[0],xmm1[3] -; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; NOGATHER-NEXT: testb $8, %al -; NOGATHER-NEXT: je .LBB7_8 -; NOGATHER-NEXT: .LBB7_7: # %cond.load7 -; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx -; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],mem[0] -; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; NOGATHER-NEXT: testb $16, %al -; NOGATHER-NEXT: je .LBB7_10 -; NOGATHER-NEXT: .LBB7_9: # %cond.load10 -; NOGATHER-NEXT: vmovq %xmm2, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 -; NOGATHER-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; NOGATHER-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] -; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; NOGATHER-NEXT: testb $32, %al -; NOGATHER-NEXT: jne .LBB7_11 -; NOGATHER-NEXT: jmp .LBB7_12 ; NOGATHER-NEXT: .LBB7_13: # %cond.load16 ; NOGATHER-NEXT: vmovq %xmm0, %rcx ; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 diff --git a/llvm/test/CodeGen/X86/cmovcmov.ll b/llvm/test/CodeGen/X86/cmovcmov.ll --- a/llvm/test/CodeGen/X86/cmovcmov.ll +++ b/llvm/test/CodeGen/X86/cmovcmov.ll @@ -165,14 +165,13 @@ ; NOCMOV-NEXT: fnstsw %ax ; NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax ; NOCMOV-NEXT: sahf -; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %eax +; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %ecx ; NOCMOV-NEXT: jne .LBB4_3 ; NOCMOV-NEXT: # %bb.1: # %entry ; NOCMOV-NEXT: jp .LBB4_3 ; NOCMOV-NEXT: # %bb.2: # %entry -; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %eax +; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %ecx ; NOCMOV-NEXT: .LBB4_3: # %entry -; NOCMOV-NEXT: movl (%eax), %ecx ; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %edx ; NOCMOV-NEXT: jne .LBB4_6 ; NOCMOV-NEXT: # %bb.4: # %entry @@ -181,7 +180,6 @@ ; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %edx ; NOCMOV-NEXT: .LBB4_6: # %entry ; NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; NOCMOV-NEXT: movl (%edx), %edx ; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %esi ; NOCMOV-NEXT: jne .LBB4_9 ; NOCMOV-NEXT: # %bb.7: # %entry @@ -189,6 +187,8 @@ ; NOCMOV-NEXT: # %bb.8: # %entry ; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %esi ; NOCMOV-NEXT: .LBB4_9: # %entry +; NOCMOV-NEXT: movl (%ecx), %ecx +; NOCMOV-NEXT: movl (%edx), %edx ; NOCMOV-NEXT: movl (%esi), %esi ; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %edi ; NOCMOV-NEXT: jne .LBB4_12 diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll --- a/llvm/test/CodeGen/X86/select.ll +++ b/llvm/test/CodeGen/X86/select.ll @@ -557,63 +557,59 @@ ; MCU-NEXT: testb $1, %al ; MCU-NEXT: jne .LBB7_1 ; MCU-NEXT: # %bb.2: -; MCU-NEXT: leal {{[0-9]+}}(%esp), %eax -; MCU-NEXT: movl (%eax), %eax +; MCU-NEXT: leal {{[0-9]+}}(%esp), %edi ; MCU-NEXT: je .LBB7_5 ; MCU-NEXT: .LBB7_4: ; MCU-NEXT: leal {{[0-9]+}}(%esp), %ecx -; MCU-NEXT: movl (%ecx), %ecx ; MCU-NEXT: je .LBB7_8 ; MCU-NEXT: .LBB7_7: ; MCU-NEXT: leal {{[0-9]+}}(%esp), %esi -; MCU-NEXT: movl (%esi), %esi ; MCU-NEXT: je .LBB7_11 ; MCU-NEXT: .LBB7_10: -; MCU-NEXT: leal {{[0-9]+}}(%esp), %edi -; MCU-NEXT: movl (%edi), %edi +; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp ; MCU-NEXT: je .LBB7_14 ; MCU-NEXT: 
.LBB7_13: -; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebx -; MCU-NEXT: movl (%ebx), %ebx -; MCU-NEXT: je .LBB7_17 -; MCU-NEXT: .LBB7_16: -; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp -; MCU-NEXT: jmp .LBB7_18 -; MCU-NEXT: .LBB7_1: ; MCU-NEXT: leal {{[0-9]+}}(%esp), %eax -; MCU-NEXT: movl (%eax), %eax +; MCU-NEXT: jmp .LBB7_15 +; MCU-NEXT: .LBB7_1: +; MCU-NEXT: leal {{[0-9]+}}(%esp), %edi ; MCU-NEXT: jne .LBB7_4 ; MCU-NEXT: .LBB7_5: ; MCU-NEXT: leal {{[0-9]+}}(%esp), %ecx -; MCU-NEXT: movl (%ecx), %ecx ; MCU-NEXT: jne .LBB7_7 ; MCU-NEXT: .LBB7_8: ; MCU-NEXT: leal {{[0-9]+}}(%esp), %esi -; MCU-NEXT: movl (%esi), %esi ; MCU-NEXT: jne .LBB7_10 ; MCU-NEXT: .LBB7_11: -; MCU-NEXT: leal {{[0-9]+}}(%esp), %edi -; MCU-NEXT: movl (%edi), %edi +; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp ; MCU-NEXT: jne .LBB7_13 ; MCU-NEXT: .LBB7_14: -; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebx -; MCU-NEXT: movl (%ebx), %ebx +; MCU-NEXT: leal {{[0-9]+}}(%esp), %eax +; MCU-NEXT: .LBB7_15: +; MCU-NEXT: movl (%edi), %ebx +; MCU-NEXT: movl (%ecx), %edi +; MCU-NEXT: movl (%esi), %esi +; MCU-NEXT: movl (%ebp), %ecx +; MCU-NEXT: movl (%eax), %eax ; MCU-NEXT: jne .LBB7_16 -; MCU-NEXT: .LBB7_17: +; MCU-NEXT: # %bb.17: +; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp +; MCU-NEXT: jmp .LBB7_18 +; MCU-NEXT: .LBB7_16: ; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp ; MCU-NEXT: .LBB7_18: ; MCU-NEXT: movl (%ebp), %ebp ; MCU-NEXT: decl %ebp -; MCU-NEXT: decl %ebx -; MCU-NEXT: decl %edi -; MCU-NEXT: decl %esi -; MCU-NEXT: decl %ecx ; MCU-NEXT: decl %eax -; MCU-NEXT: movl %eax, 20(%edx) -; MCU-NEXT: movl %ecx, 16(%edx) +; MCU-NEXT: decl %ecx +; MCU-NEXT: decl %esi +; MCU-NEXT: decl %edi +; MCU-NEXT: decl %ebx +; MCU-NEXT: movl %ebx, 20(%edx) +; MCU-NEXT: movl %edi, 16(%edx) ; MCU-NEXT: movl %esi, 12(%edx) -; MCU-NEXT: movl %edi, 8(%edx) -; MCU-NEXT: movl %ebx, 4(%edx) +; MCU-NEXT: movl %ecx, 8(%edx) +; MCU-NEXT: movl %eax, 4(%edx) ; MCU-NEXT: movl %ebp, (%edx) ; MCU-NEXT: popl %esi ; MCU-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -4362,7 +4362,6 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE2-LABEL: uitofp_load_4i64_to_4f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm2 ; SSE2-NEXT: movdqa 16(%rdi), %xmm0 ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: testq %rax, %rax @@ -4378,6 +4377,7 @@ ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: addss %xmm1, %xmm1 ; SSE2-NEXT: .LBB83_3: +; SSE2-NEXT: movdqa (%rdi), %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: testq %rax, %rax @@ -4711,40 +4711,38 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE2-LABEL: uitofp_load_8i64_to_8f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm5 ; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 -; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_1 ; SSE2-NEXT: # %bb.2: -; SSE2-NEXT: cvtsi2ss %rax, %xmm3 +; SSE2-NEXT: cvtsi2ss %rax, %xmm2 ; SSE2-NEXT: jmp .LBB87_3 ; SSE2-NEXT: .LBB87_1: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm3 -; SSE2-NEXT: addss %xmm3, %xmm3 +; SSE2-NEXT: cvtsi2ss %rax, %xmm2 +; SSE2-NEXT: addss %xmm2, %xmm2 ; SSE2-NEXT: .LBB87_3: +; SSE2-NEXT: movdqa (%rdi), %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq 
%xmm0, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_4 ; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: cvtsi2ss %rax, %xmm4 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: jmp .LBB87_6 ; SSE2-NEXT: .LBB87_4: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm4 -; SSE2-NEXT: addss %xmm4, %xmm4 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 ; SSE2-NEXT: .LBB87_6: -; SSE2-NEXT: movq %xmm5, %rax +; SSE2-NEXT: movq %xmm3, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_7 ; SSE2-NEXT: # %bb.8: @@ -4760,55 +4758,59 @@ ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 ; SSE2-NEXT: addss %xmm0, %xmm0 ; SSE2-NEXT: .LBB87_9: -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; SSE2-NEXT: movq %xmm5, %rax +; SSE2-NEXT: movdqa 48(%rdi), %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; SSE2-NEXT: movq %xmm3, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_10 ; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: cvtsi2ss %rax, %xmm6 +; SSE2-NEXT: cvtsi2ss %rax, %xmm4 ; SSE2-NEXT: jmp .LBB87_12 ; SSE2-NEXT: .LBB87_10: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm6 -; SSE2-NEXT: addss %xmm6, %xmm6 +; SSE2-NEXT: cvtsi2ss %rax, %xmm4 +; SSE2-NEXT: addss %xmm4, %xmm4 ; SSE2-NEXT: .LBB87_12: -; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: movq %xmm6, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_13 ; SSE2-NEXT: # %bb.14: -; SSE2-NEXT: xorps %xmm5, %xmm5 -; SSE2-NEXT: cvtsi2ss %rax, %xmm5 +; SSE2-NEXT: xorps %xmm3, %xmm3 +; SSE2-NEXT: cvtsi2ss %rax, %xmm3 ; SSE2-NEXT: jmp .LBB87_15 ; SSE2-NEXT: .LBB87_13: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm5, %xmm5 -; SSE2-NEXT: cvtsi2ss %rax, %xmm5 -; SSE2-NEXT: addss %xmm5, %xmm5 +; SSE2-NEXT: xorps %xmm3, %xmm3 +; SSE2-NEXT: cvtsi2ss %rax, %xmm3 +; SSE2-NEXT: addss %xmm3, %xmm3 ; SSE2-NEXT: .LBB87_15: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: movdqa 32(%rdi), %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] +; SSE2-NEXT: movq %xmm6, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_16 ; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: cvtsi2ss %rax, %xmm7 +; SSE2-NEXT: xorps %xmm6, %xmm6 +; SSE2-NEXT: cvtsi2ss %rax, %xmm6 ; SSE2-NEXT: jmp .LBB87_18 ; SSE2-NEXT: .LBB87_16: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm7 -; SSE2-NEXT: addss %xmm7, %xmm7 +; SSE2-NEXT: xorps %xmm6, %xmm6 +; SSE2-NEXT: cvtsi2ss %rax, %xmm6 +; SSE2-NEXT: addss %xmm6, %xmm6 ; SSE2-NEXT: .LBB87_18: -; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: movq %xmm5, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_19 ; SSE2-NEXT: # %bb.20: @@ -4824,9 +4826,9 @@ ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: addss %xmm1, %xmm1 ; SSE2-NEXT: .LBB87_21: -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE2-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = 
xmm0[0],xmm2[0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] ; SSE2-NEXT: movq %xmm2, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_22 @@ -4844,7 +4846,7 @@ ; SSE2-NEXT: addss %xmm2, %xmm2 ; SSE2-NEXT: .LBB87_24: ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: uitofp_load_8i64_to_8f32:
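The core of the code change above is the block-pair cache used by hasStoreBetween(). Below is a rough standalone C++ sketch of that caching scheme only; it is not LLVM code: Block, Instr, MemId, and the Between list are placeholder stand-ins for the machine IR types and for the depth_first walk in the patch. The lookup order mirrors the patch: a definite cached answer first, then the cached aliasing stores re-checked against the new load, then a full walk that populates one of the two caches.

#include <algorithm>
#include <map>
#include <utility>
#include <vector>

// Illustrative stand-ins for MachineBasicBlock / MachineInstr; just enough
// structure to show the caching scheme, not real LLVM types.
struct Instr {
  bool IsCall = false;
  bool MayStore = false;
  int MemId = 0; // toy stand-in for a memory location used by mayAlias()
  bool mayAlias(const Instr &Other) const { return MemId == Other.MemId; }
};

struct Block {
  std::vector<Instr> Insts;
};

class StoreBetweenCache {
  using BlockPair = std::pair<const Block *, const Block *>;
  // Definite answers: "a call was seen" (true) or "no store at all" (false).
  std::map<BlockPair, bool> HasStoreCache;
  // Otherwise: the stores that aliased an earlier query, re-checked per load.
  std::map<BlockPair, std::vector<const Instr *>> StoreInstrCache;

public:
  // Returns true if some store (or call) in the blocks between From and To
  // may alias Load; Between stands in for the depth-first walk in the patch.
  bool hasStoreBetween(const Block *From, const Block *To,
                       const std::vector<const Block *> &Between,
                       const Instr &Load) {
    BlockPair Key(From, To);

    auto Definite = HasStoreCache.find(Key);
    if (Definite != HasStoreCache.end())
      return Definite->second;

    auto Stores = StoreInstrCache.find(Key);
    if (Stores != StoreInstrCache.end())
      return std::any_of(Stores->second.begin(), Stores->second.end(),
                         [&](const Instr *I) { return I->mayAlias(Load); });

    bool SawStore = false;
    bool HasAliasedStore = false;
    for (const Block *BB : Between) {
      for (const Instr &I : BB->Insts) {
        if (I.IsCall) { // a call is treated as aliasing and cached as such
          HasStoreCache[Key] = true;
          return true;
        }
        if (I.MayStore) {
          SawStore = true;
          if (I.mayAlias(Load)) {
            StoreInstrCache[Key].push_back(&I);
            HasAliasedStore = true;
          }
        }
      }
    }
    if (!SawStore)
      HasStoreCache[Key] = false; // definitively store-free: cache it
    return HasAliasedStore;
  }
};

As in the patch, a call makes the answer definitively true, a walk that sees no store at all makes it definitively false, and everything in between is answered per load from the cached list of aliasing stores.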