diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -127,6 +127,17 @@
     /// current block.
     DenseSet<DebugVariable> SeenDbgVars;
 
+    /// Definite hasStoreBetween results, keyed by the (From, To) block pair:
+    /// true if a call or an ordered memory operation lies in between, false
+    /// if there is no store at all.
+    std::map<std::pair<MachineBasicBlock *, MachineBasicBlock *>, bool>
+        HasStoreCache;
+    /// Store instructions found between a (From, To) block pair, kept so that
+    /// later queries only need to redo the alias check against them.
+    std::map<std::pair<MachineBasicBlock *, MachineBasicBlock *>,
+             std::vector<MachineInstr *>>
+        StoreInstrCache;
+
   public:
     static char ID; // Pass identification
 
@@ -159,6 +170,9 @@
                                      MachineBasicBlock *From,
                                      MachineBasicBlock *To);
 
+    bool hasStoreBetween(MachineBasicBlock *From, MachineBasicBlock *To,
+                         MachineInstr &MI);
+
     /// Postpone the splitting of the given critical
     /// edge (\p From, \p To).
     ///
@@ -359,6 +373,9 @@
     EverMadeChange = true;
   }
 
+  HasStoreCache.clear();
+  StoreInstrCache.clear();
+
   // Now clear any kill flags for recorded registers.
   for (auto I : RegsToClearKillFlags)
     MRI->clearKillFlags(I);
@@ -919,6 +936,79 @@
   }
 }
 
+/// hasStoreBetween - check whether a store that may alias \p MI exists on the
+/// straight line path between blocks \p From and \p To.
+///
+/// Returns true conservatively when \p From and \p To are not in a straight
+/// line, or when a call or an ordered memory operation is found in between.
+/// Results are memoized per (From, To) pair in HasStoreCache/StoreInstrCache.
+bool MachineSinking::hasStoreBetween(MachineBasicBlock *From,
+                                     MachineBasicBlock *To, MachineInstr &MI) {
+  // Make sure From and To are in straight line which means From dominates To
+  // and To post dominates From.
+  if (!DT->dominates(From, To) || !PDT->dominates(To, From))
+    return true;
+
+  auto BlockPair = std::make_pair(From, To);
+
+  // Has this pair of blocks been queried before with a definite cached result?
+  auto HasStoreIt = HasStoreCache.find(BlockPair);
+  if (HasStoreIt != HasStoreCache.end())
+    return HasStoreIt->second;
+
+  // Otherwise, if the stores between the blocks are already known, only the
+  // alias check against MI has to be redone.
+  auto StoresIt = StoreInstrCache.find(BlockPair);
+  if (StoresIt != StoreInstrCache.end())
+    return std::any_of(
+        StoresIt->second.begin(), StoresIt->second.end(),
+        [&](MachineInstr *I) { return I->mayAlias(AA, MI, false); });
+
+  bool SawStore = false;
+  bool HasAliasedStore = false;
+  DenseSet<MachineBasicBlock *> HandledBlocks;
+  // Go through all reachable blocks from From.
+  for (MachineBasicBlock *BB : depth_first(From)) {
+    // We insert the instruction at the start of block To, so no need to worry
+    // about stores inside To.
+    // Stores in block From should already have been considered on entry to
+    // SinkInstruction.
+    if (BB == To || BB == From)
+      continue;
+
+    // Skip BB if it was already handled in a previous iteration.
+    if (!HandledBlocks.insert(BB).second)
+      continue;
+
+    // To post dominates BB, it must be a path from block From.
+    if (PDT->dominates(To, BB)) {
+      for (MachineInstr &I : *BB) {
+        // Treat as alias conservatively for a call or an ordered memory
+        // operation.
+        if (I.isCall() || I.hasOrderedMemoryRef()) {
+          HasStoreCache[BlockPair] = true;
+          return true;
+        }
+
+        if (I.mayStore()) {
+          SawStore = true;
+          // We still have a chance to sink MI if all stores in between do not
+          // alias MI.
+          // Cache all store instructions, so that we don't need to go through
+          // all blocks reachable from From for the next load instruction.
+          if (I.mayAlias(AA, MI, false))
+            HasAliasedStore = true;
+          StoreInstrCache[BlockPair].push_back(&I);
+        }
+      }
+    }
+  }
+  // If there is no store at all, cache the result.
+  if (!SawStore)
+    HasStoreCache[BlockPair] = false;
+  return HasAliasedStore;
+}
+
 /// SinkInstruction - Determine whether it is safe to sink the specified machine
 /// instruction out of its current block into a successor.
 bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
@@ -979,8 +1069,9 @@
     // We cannot sink a load across a critical edge - there may be stores in
     // other code paths.
     bool TryBreak = false;
-    bool store = true;
-    if (!MI.isSafeToMove(AA, store)) {
+    bool Store =
+        MI.mayLoad() ?
hasStoreBetween(ParentBlock, SuccToSinkTo, MI) : true; + if (!MI.isSafeToMove(AA, Store)) { LLVM_DEBUG(dbgs() << " *** NOTE: Won't sink load along critical edge.\n"); TryBreak = true; } diff --git a/llvm/test/CodeGen/RISCV/select-optimize-multiple.ll b/llvm/test/CodeGen/RISCV/select-optimize-multiple.ll --- a/llvm/test/CodeGen/RISCV/select-optimize-multiple.ll +++ b/llvm/test/CodeGen/RISCV/select-optimize-multiple.ll @@ -40,40 +40,39 @@ ; RV32I-LABEL: cmovcc128: ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: xori a1, a1, 123 -; RV32I-NEXT: or a2, a1, a2 -; RV32I-NEXT: mv a1, a3 -; RV32I-NEXT: beqz a2, .LBB1_2 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: mv a2, a3 +; RV32I-NEXT: beqz a1, .LBB1_2 ; RV32I-NEXT: # %bb.1: # %entry -; RV32I-NEXT: mv a1, a4 +; RV32I-NEXT: mv a2, a4 ; RV32I-NEXT: .LBB1_2: # %entry -; RV32I-NEXT: lw a6, 0(a1) -; RV32I-NEXT: beqz a2, .LBB1_6 +; RV32I-NEXT: beqz a1, .LBB1_5 ; RV32I-NEXT: # %bb.3: # %entry -; RV32I-NEXT: addi a1, a4, 4 -; RV32I-NEXT: lw a5, 0(a1) -; RV32I-NEXT: bnez a2, .LBB1_7 +; RV32I-NEXT: addi a7, a4, 4 +; RV32I-NEXT: bnez a1, .LBB1_6 ; RV32I-NEXT: .LBB1_4: -; RV32I-NEXT: addi a1, a3, 8 -; RV32I-NEXT: lw a1, 0(a1) -; RV32I-NEXT: bnez a2, .LBB1_8 +; RV32I-NEXT: addi a5, a3, 8 +; RV32I-NEXT: j .LBB1_7 ; RV32I-NEXT: .LBB1_5: -; RV32I-NEXT: addi a2, a3, 12 -; RV32I-NEXT: j .LBB1_9 -; RV32I-NEXT: .LBB1_6: -; RV32I-NEXT: addi a1, a3, 4 -; RV32I-NEXT: lw a5, 0(a1) -; RV32I-NEXT: beqz a2, .LBB1_4 +; RV32I-NEXT: addi a7, a3, 4 +; RV32I-NEXT: beqz a1, .LBB1_4 +; RV32I-NEXT: .LBB1_6: # %entry +; RV32I-NEXT: addi a5, a4, 8 ; RV32I-NEXT: .LBB1_7: # %entry -; RV32I-NEXT: addi a1, a4, 8 +; RV32I-NEXT: lw a6, 0(a2) +; RV32I-NEXT: lw a7, 0(a7) +; RV32I-NEXT: lw a2, 0(a5) +; RV32I-NEXT: beqz a1, .LBB1_9 +; RV32I-NEXT: # %bb.8: # %entry +; RV32I-NEXT: addi a1, a4, 12 +; RV32I-NEXT: j .LBB1_10 +; RV32I-NEXT: .LBB1_9: +; RV32I-NEXT: addi a1, a3, 12 +; RV32I-NEXT: .LBB1_10: # %entry ; RV32I-NEXT: lw a1, 0(a1) -; RV32I-NEXT: beqz a2, .LBB1_5 -; 
RV32I-NEXT: .LBB1_8: # %entry -; RV32I-NEXT: addi a2, a4, 12 -; RV32I-NEXT: .LBB1_9: # %entry -; RV32I-NEXT: lw a2, 0(a2) -; RV32I-NEXT: sw a2, 12(a0) -; RV32I-NEXT: sw a1, 8(a0) -; RV32I-NEXT: sw a5, 4(a0) +; RV32I-NEXT: sw a1, 12(a0) +; RV32I-NEXT: sw a2, 8(a0) +; RV32I-NEXT: sw a7, 4(a0) ; RV32I-NEXT: sw a6, 0(a0) ; RV32I-NEXT: ret ; @@ -124,40 +123,39 @@ define i128 @cmov128(i1 %a, i128 %b, i128 %c) nounwind { ; RV32I-LABEL: cmov128: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: andi a4, a1, 1 -; RV32I-NEXT: mv a1, a2 -; RV32I-NEXT: bnez a4, .LBB3_2 +; RV32I-NEXT: andi a1, a1, 1 +; RV32I-NEXT: mv a4, a2 +; RV32I-NEXT: bnez a1, .LBB3_2 ; RV32I-NEXT: # %bb.1: # %entry -; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: mv a4, a3 ; RV32I-NEXT: .LBB3_2: # %entry -; RV32I-NEXT: lw a6, 0(a1) -; RV32I-NEXT: bnez a4, .LBB3_6 +; RV32I-NEXT: bnez a1, .LBB3_5 ; RV32I-NEXT: # %bb.3: # %entry -; RV32I-NEXT: addi a1, a3, 4 -; RV32I-NEXT: lw a5, 0(a1) -; RV32I-NEXT: beqz a4, .LBB3_7 +; RV32I-NEXT: addi a7, a3, 4 +; RV32I-NEXT: beqz a1, .LBB3_6 ; RV32I-NEXT: .LBB3_4: -; RV32I-NEXT: addi a1, a2, 8 -; RV32I-NEXT: lw a1, 0(a1) -; RV32I-NEXT: beqz a4, .LBB3_8 +; RV32I-NEXT: addi a5, a2, 8 +; RV32I-NEXT: j .LBB3_7 ; RV32I-NEXT: .LBB3_5: -; RV32I-NEXT: addi a2, a2, 12 -; RV32I-NEXT: j .LBB3_9 -; RV32I-NEXT: .LBB3_6: -; RV32I-NEXT: addi a1, a2, 4 -; RV32I-NEXT: lw a5, 0(a1) -; RV32I-NEXT: bnez a4, .LBB3_4 +; RV32I-NEXT: addi a7, a2, 4 +; RV32I-NEXT: bnez a1, .LBB3_4 +; RV32I-NEXT: .LBB3_6: # %entry +; RV32I-NEXT: addi a5, a3, 8 ; RV32I-NEXT: .LBB3_7: # %entry -; RV32I-NEXT: addi a1, a3, 8 +; RV32I-NEXT: lw a6, 0(a4) +; RV32I-NEXT: lw a7, 0(a7) +; RV32I-NEXT: lw a4, 0(a5) +; RV32I-NEXT: bnez a1, .LBB3_9 +; RV32I-NEXT: # %bb.8: # %entry +; RV32I-NEXT: addi a1, a3, 12 +; RV32I-NEXT: j .LBB3_10 +; RV32I-NEXT: .LBB3_9: +; RV32I-NEXT: addi a1, a2, 12 +; RV32I-NEXT: .LBB3_10: # %entry ; RV32I-NEXT: lw a1, 0(a1) -; RV32I-NEXT: bnez a4, .LBB3_5 -; RV32I-NEXT: .LBB3_8: # %entry -; RV32I-NEXT: addi a2, a3, 12 
-; RV32I-NEXT: .LBB3_9: # %entry -; RV32I-NEXT: lw a2, 0(a2) -; RV32I-NEXT: sw a2, 12(a0) -; RV32I-NEXT: sw a1, 8(a0) -; RV32I-NEXT: sw a5, 4(a0) +; RV32I-NEXT: sw a1, 12(a0) +; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sw a7, 4(a0) ; RV32I-NEXT: sw a6, 0(a0) ; RV32I-NEXT: ret ; diff --git a/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll b/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll --- a/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll +++ b/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll @@ -14,40 +14,40 @@ ; CHECK-NEXT: .cfi_offset %rbp, -16 ; CHECK-NEXT: movq %rsp, %rbp ; CHECK-NEXT: .cfi_def_cfa_register %rbp -; CHECK-NEXT: movslq (%rdi), %rax +; CHECK-NEXT: movslq (%rdi), %rdi ; CHECK-NEXT: movslq (%rsi), %r8 ; CHECK-NEXT: movslq (%rdx), %r10 -; CHECK-NEXT: movl (%rcx), %edi -; CHECK-NEXT: movslq (%r9), %rcx -; CHECK-NEXT: movq %rsp, %rdx -; CHECK-NEXT: subl %eax, %r8d -; CHECK-NEXT: movslq %r8d, %rsi +; CHECK-NEXT: movl (%rcx), %esi +; CHECK-NEXT: movq %rsp, %rcx +; CHECK-NEXT: subl %edi, %r8d +; CHECK-NEXT: movslq %r8d, %rdx ; CHECK-NEXT: js .LBB0_1 ; CHECK-NEXT: # %bb.11: # %b63 -; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: js .LBB0_14 ; CHECK-NEXT: # %bb.12: -; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_13: # %a25b ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: testb %dil, %dil ; CHECK-NEXT: je .LBB0_13 ; CHECK-NEXT: .LBB0_14: # %b85 ; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: # %bb.15: -; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_16: # %a25b140 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: testb %dil, %dil ; CHECK-NEXT: je .LBB0_16 ; CHECK-NEXT: .LBB0_1: # %a29b -; CHECK-NEXT: cmpl %r10d, %edi +; CHECK-NEXT: cmpl %r10d, %esi ; 
CHECK-NEXT: js .LBB0_10 ; CHECK-NEXT: # %bb.2: # %b158 +; CHECK-NEXT: movslq (%r9), %rsi ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: movb $1, %r10b @@ -77,7 +77,7 @@ ; CHECK-NEXT: js .LBB0_4 ; CHECK-NEXT: # %bb.17: # %b179 ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: js .LBB0_18 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_37: # %a30b @@ -97,7 +97,7 @@ ; CHECK-NEXT: je .LBB0_19 ; CHECK-NEXT: .LBB0_4: # %a33b ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: orl %r8d, %eax ; CHECK-NEXT: movl %eax, %r9d ; CHECK-NEXT: shrl $31, %r9d @@ -106,7 +106,7 @@ ; CHECK-NEXT: .LBB0_5: # %a50b ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: movl %r8d, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: orl %esi, %eax ; CHECK-NEXT: movl %eax, %r11d ; CHECK-NEXT: shrl $31, %r11d ; CHECK-NEXT: testl %eax, %eax @@ -156,7 +156,7 @@ ; CHECK-NEXT: # Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: # => This Loop Header: Depth=2 ; CHECK-NEXT: # Child Loop BB0_21 Depth 3 -; CHECK-NEXT: testq %rcx, %rcx +; CHECK-NEXT: testq %rsi, %rsi ; CHECK-NEXT: js .LBB0_22 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_21: # %a35b @@ -169,14 +169,14 @@ ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_28: # %b1016 ; CHECK-NEXT: # in Loop: Header=BB0_26 Depth=2 -; CHECK-NEXT: testq %rcx, %rcx +; CHECK-NEXT: testq %rsi, %rsi ; CHECK-NEXT: jle .LBB0_6 ; CHECK-NEXT: .LBB0_26: # %b858 ; CHECK-NEXT: # Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: # => This Loop Header: Depth=2 ; CHECK-NEXT: # Child Loop BB0_38 Depth 3 ; CHECK-NEXT: # Child Loop BB0_29 Depth 3 -; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: js .LBB0_27 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_38: # %a53b @@ -194,38 +194,38 @@ ; CHECK-NEXT: # Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: # Parent Loop BB0_26 Depth=2 ; 
CHECK-NEXT: # => This Inner Loop Header: Depth=3 -; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: jle .LBB0_29 ; CHECK-NEXT: jmp .LBB0_28 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_32: # %b1263 ; CHECK-NEXT: # in Loop: Header=BB0_30 Depth=2 -; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: jle .LBB0_7 ; CHECK-NEXT: .LBB0_30: # %b1117 ; CHECK-NEXT: # Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: # => This Loop Header: Depth=2 ; CHECK-NEXT: # Child Loop BB0_39 Depth 3 ; CHECK-NEXT: # Child Loop BB0_33 Depth 3 -; CHECK-NEXT: testq %rcx, %rcx +; CHECK-NEXT: testq %rsi, %rsi ; CHECK-NEXT: js .LBB0_31 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_39: # %a63b ; CHECK-NEXT: # Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: # Parent Loop BB0_30 Depth=2 ; CHECK-NEXT: # => This Inner Loop Header: Depth=3 -; CHECK-NEXT: testq %rcx, %rcx +; CHECK-NEXT: testq %rsi, %rsi ; CHECK-NEXT: jle .LBB0_39 ; CHECK-NEXT: .LBB0_31: # %b1139 ; CHECK-NEXT: # in Loop: Header=BB0_30 Depth=2 -; CHECK-NEXT: testq %rcx, %rcx +; CHECK-NEXT: testq %rsi, %rsi ; CHECK-NEXT: jle .LBB0_32 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_33: # %a63b1266 ; CHECK-NEXT: # Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: # Parent Loop BB0_30 Depth=2 ; CHECK-NEXT: # => This Inner Loop Header: Depth=3 -; CHECK-NEXT: testq %rcx, %rcx +; CHECK-NEXT: testq %rsi, %rsi ; CHECK-NEXT: jle .LBB0_33 ; CHECK-NEXT: jmp .LBB0_32 ; CHECK-NEXT: .p2align 4, 0x90 @@ -237,7 +237,7 @@ ; CHECK-NEXT: # Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: # => This Loop Header: Depth=2 ; CHECK-NEXT: # Child Loop BB0_24 Depth 3 -; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: js .LBB0_25 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_24: # %a45b diff --git a/llvm/test/CodeGen/X86/MachineSink-eflags.ll b/llvm/test/CodeGen/X86/MachineSink-eflags.ll --- a/llvm/test/CodeGen/X86/MachineSink-eflags.ll +++ b/llvm/test/CodeGen/X86/MachineSink-eflags.ll @@ -16,31 +16,30 @@ 
; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $152, %rsp ; CHECK-NEXT: movq 48(%rdi), %rax -; CHECK-NEXT: movl 64(%rdi), %edx +; CHECK-NEXT: movl 64(%rdi), %ecx ; CHECK-NEXT: movl $200, %esi ; CHECK-NEXT: addl 68(%rdi), %esi -; CHECK-NEXT: imull $46, %edx, %ecx -; CHECK-NEXT: addq %rsi, %rcx -; CHECK-NEXT: shlq $4, %rcx -; CHECK-NEXT: imull $47, %edx, %edx +; CHECK-NEXT: imull $46, %ecx, %edx ; CHECK-NEXT: addq %rsi, %rdx ; CHECK-NEXT: shlq $4, %rdx -; CHECK-NEXT: movaps (%rax,%rdx), %xmm0 +; CHECK-NEXT: imull $47, %ecx, %ecx +; CHECK-NEXT: addq %rsi, %rcx +; CHECK-NEXT: shlq $4, %rcx ; CHECK-NEXT: cmpl $0, (%rdi) ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: # %bb.2: # %entry -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: je .LBB0_4 -; CHECK-NEXT: jmp .LBB0_5 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: jmp .LBB0_3 ; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: movaps (%rax,%rdx), %xmm0 +; CHECK-NEXT: .LBB0_3: # %entry ; CHECK-NEXT: movaps (%rax,%rcx), %xmm1 -; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: jne .LBB0_5 -; CHECK-NEXT: .LBB0_4: # %entry -; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: # %bb.4: # %entry +; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: .LBB0_5: # %entry -; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: addq $152, %rsp ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/avx2-masked-gather.ll b/llvm/test/CodeGen/X86/avx2-masked-gather.ll --- a/llvm/test/CodeGen/X86/avx2-masked-gather.ll +++ b/llvm/test/CodeGen/X86/avx2-masked-gather.ll @@ -358,44 +358,57 @@ ; ; NOGATHER-LABEL: masked_gather_v8i32: ; NOGATHER: # %bb.0: # %entry -; NOGATHER-NEXT: vmovdqa (%rdi), %ymm3 -; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm2 +; NOGATHER-NEXT: vmovdqa (%rdi), %ymm2 ; NOGATHER-NEXT: vpsllw $15, %xmm0, %xmm0 ; NOGATHER-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; NOGATHER-NEXT: vpmovmskb %xmm0, %eax ; 
NOGATHER-NEXT: testb $1, %al ; NOGATHER-NEXT: je .LBB6_2 ; NOGATHER-NEXT: # %bb.1: # %cond.load -; NOGATHER-NEXT: vmovq %xmm3, %rcx +; NOGATHER-NEXT: vmovq %xmm2, %rcx ; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm1, %xmm0 ; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; NOGATHER-NEXT: .LBB6_2: # %else ; NOGATHER-NEXT: testb $2, %al ; NOGATHER-NEXT: je .LBB6_4 ; NOGATHER-NEXT: # %bb.3: # %cond.load1 -; NOGATHER-NEXT: vpextrq $1, %xmm3, %rcx +; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx ; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm0 ; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; NOGATHER-NEXT: .LBB6_4: # %else2 -; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm0 +; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0 ; NOGATHER-NEXT: testb $4, %al -; NOGATHER-NEXT: jne .LBB6_5 -; NOGATHER-NEXT: # %bb.6: # %else5 +; NOGATHER-NEXT: je .LBB6_6 +; NOGATHER-NEXT: # %bb.5: # %cond.load4 +; NOGATHER-NEXT: vmovq %xmm0, %rcx +; NOGATHER-NEXT: vpinsrd $2, (%rcx), %xmm1, %xmm2 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; NOGATHER-NEXT: .LBB6_6: # %else5 ; NOGATHER-NEXT: testb $8, %al -; NOGATHER-NEXT: jne .LBB6_7 +; NOGATHER-NEXT: je .LBB6_8 +; NOGATHER-NEXT: # %bb.7: # %cond.load7 +; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx +; NOGATHER-NEXT: vpinsrd $3, (%rcx), %xmm1, %xmm0 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; NOGATHER-NEXT: .LBB6_8: # %else8 +; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm0 ; NOGATHER-NEXT: testb $16, %al -; NOGATHER-NEXT: jne .LBB6_9 +; NOGATHER-NEXT: je .LBB6_10 +; NOGATHER-NEXT: # %bb.9: # %cond.load10 +; NOGATHER-NEXT: vmovq %xmm0, %rcx +; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 +; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2 +; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; NOGATHER-NEXT: .LBB6_10: # %else11 ; NOGATHER-NEXT: testb $32, %al ; NOGATHER-NEXT: je .LBB6_12 -; NOGATHER-NEXT: .LBB6_11: # %cond.load13 -; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx -; 
NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 -; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0 -; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; NOGATHER-NEXT: # %bb.11: # %cond.load13 +; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx +; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 +; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm2, %xmm2 +; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; NOGATHER-NEXT: .LBB6_12: # %else14 -; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0 +; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0 ; NOGATHER-NEXT: testb $64, %al ; NOGATHER-NEXT: jne .LBB6_13 ; NOGATHER-NEXT: # %bb.14: # %else17 @@ -404,26 +417,6 @@ ; NOGATHER-NEXT: .LBB6_16: # %else20 ; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 ; NOGATHER-NEXT: retq -; NOGATHER-NEXT: .LBB6_5: # %cond.load4 -; NOGATHER-NEXT: vmovq %xmm0, %rcx -; NOGATHER-NEXT: vpinsrd $2, (%rcx), %xmm1, %xmm3 -; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; NOGATHER-NEXT: testb $8, %al -; NOGATHER-NEXT: je .LBB6_8 -; NOGATHER-NEXT: .LBB6_7: # %cond.load7 -; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx -; NOGATHER-NEXT: vpinsrd $3, (%rcx), %xmm1, %xmm0 -; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; NOGATHER-NEXT: testb $16, %al -; NOGATHER-NEXT: je .LBB6_10 -; NOGATHER-NEXT: .LBB6_9: # %cond.load10 -; NOGATHER-NEXT: vmovq %xmm2, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 -; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm0, %xmm0 -; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; NOGATHER-NEXT: testb $32, %al -; NOGATHER-NEXT: jne .LBB6_11 -; NOGATHER-NEXT: jmp .LBB6_12 ; NOGATHER-NEXT: .LBB6_13: # %cond.load16 ; NOGATHER-NEXT: vmovq %xmm0, %rcx ; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 @@ -472,44 +465,58 @@ ; ; NOGATHER-LABEL: masked_gather_v8float: ; NOGATHER: # %bb.0: # %entry -; NOGATHER-NEXT: vmovdqa (%rdi), %ymm3 -; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm2 +; NOGATHER-NEXT: vmovdqa (%rdi), %ymm2 ; NOGATHER-NEXT: vpsllw $15, %xmm0, %xmm0 ; NOGATHER-NEXT: 
vpacksswb %xmm0, %xmm0, %xmm0 ; NOGATHER-NEXT: vpmovmskb %xmm0, %eax ; NOGATHER-NEXT: testb $1, %al ; NOGATHER-NEXT: je .LBB7_2 ; NOGATHER-NEXT: # %bb.1: # %cond.load -; NOGATHER-NEXT: vmovq %xmm3, %rcx +; NOGATHER-NEXT: vmovq %xmm2, %rcx ; NOGATHER-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7] ; NOGATHER-NEXT: .LBB7_2: # %else ; NOGATHER-NEXT: testb $2, %al ; NOGATHER-NEXT: je .LBB7_4 ; NOGATHER-NEXT: # %bb.3: # %cond.load1 -; NOGATHER-NEXT: vpextrq $1, %xmm3, %rcx +; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx ; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],mem[0],xmm1[2,3] ; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; NOGATHER-NEXT: .LBB7_4: # %else2 -; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm0 +; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0 ; NOGATHER-NEXT: testb $4, %al -; NOGATHER-NEXT: jne .LBB7_5 -; NOGATHER-NEXT: # %bb.6: # %else5 +; NOGATHER-NEXT: je .LBB7_6 +; NOGATHER-NEXT: # %bb.5: # %cond.load4 +; NOGATHER-NEXT: vmovq %xmm0, %rcx +; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm1[0,1],mem[0],xmm1[3] +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; NOGATHER-NEXT: .LBB7_6: # %else5 ; NOGATHER-NEXT: testb $8, %al -; NOGATHER-NEXT: jne .LBB7_7 +; NOGATHER-NEXT: je .LBB7_8 +; NOGATHER-NEXT: # %bb.7: # %cond.load7 +; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx +; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],mem[0] +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; NOGATHER-NEXT: .LBB7_8: # %else8 +; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm0 ; NOGATHER-NEXT: testb $16, %al -; NOGATHER-NEXT: jne .LBB7_9 +; NOGATHER-NEXT: je .LBB7_10 +; NOGATHER-NEXT: # %bb.9: # %cond.load10 +; NOGATHER-NEXT: vmovq %xmm0, %rcx +; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 +; NOGATHER-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; NOGATHER-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] +; NOGATHER-NEXT: vinsertf128 $1, 
%xmm2, %ymm1, %ymm1 ; NOGATHER-NEXT: .LBB7_10: # %else11 ; NOGATHER-NEXT: testb $32, %al ; NOGATHER-NEXT: je .LBB7_12 -; NOGATHER-NEXT: .LBB7_11: # %cond.load13 -; NOGATHER-NEXT: vpextrq $1, %xmm2, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 -; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; NOGATHER-NEXT: # %bb.11: # %cond.load13 +; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx +; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 +; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] +; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; NOGATHER-NEXT: .LBB7_12: # %else14 -; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0 +; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0 ; NOGATHER-NEXT: testb $64, %al ; NOGATHER-NEXT: jne .LBB7_13 ; NOGATHER-NEXT: # %bb.14: # %else17 @@ -518,27 +525,6 @@ ; NOGATHER-NEXT: .LBB7_16: # %else20 ; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 ; NOGATHER-NEXT: retq -; NOGATHER-NEXT: .LBB7_5: # %cond.load4 -; NOGATHER-NEXT: vmovq %xmm0, %rcx -; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1],mem[0],xmm1[3] -; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; NOGATHER-NEXT: testb $8, %al -; NOGATHER-NEXT: je .LBB7_8 -; NOGATHER-NEXT: .LBB7_7: # %cond.load7 -; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx -; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],mem[0] -; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; NOGATHER-NEXT: testb $16, %al -; NOGATHER-NEXT: je .LBB7_10 -; NOGATHER-NEXT: .LBB7_9: # %cond.load10 -; NOGATHER-NEXT: vmovq %xmm2, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 -; NOGATHER-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; NOGATHER-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] -; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; NOGATHER-NEXT: testb $32, %al -; NOGATHER-NEXT: jne .LBB7_11 -; NOGATHER-NEXT: jmp .LBB7_12 ; NOGATHER-NEXT: .LBB7_13: # %cond.load16 ; 
NOGATHER-NEXT: vmovq %xmm0, %rcx ; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 diff --git a/llvm/test/CodeGen/X86/cmovcmov.ll b/llvm/test/CodeGen/X86/cmovcmov.ll --- a/llvm/test/CodeGen/X86/cmovcmov.ll +++ b/llvm/test/CodeGen/X86/cmovcmov.ll @@ -165,14 +165,13 @@ ; NOCMOV-NEXT: fnstsw %ax ; NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax ; NOCMOV-NEXT: sahf -; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %eax +; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %ecx ; NOCMOV-NEXT: jne .LBB4_3 ; NOCMOV-NEXT: # %bb.1: # %entry ; NOCMOV-NEXT: jp .LBB4_3 ; NOCMOV-NEXT: # %bb.2: # %entry -; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %eax +; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %ecx ; NOCMOV-NEXT: .LBB4_3: # %entry -; NOCMOV-NEXT: movl (%eax), %ecx ; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %edx ; NOCMOV-NEXT: jne .LBB4_6 ; NOCMOV-NEXT: # %bb.4: # %entry @@ -181,7 +180,6 @@ ; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %edx ; NOCMOV-NEXT: .LBB4_6: # %entry ; NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; NOCMOV-NEXT: movl (%edx), %edx ; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %esi ; NOCMOV-NEXT: jne .LBB4_9 ; NOCMOV-NEXT: # %bb.7: # %entry @@ -189,6 +187,8 @@ ; NOCMOV-NEXT: # %bb.8: # %entry ; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %esi ; NOCMOV-NEXT: .LBB4_9: # %entry +; NOCMOV-NEXT: movl (%ecx), %ecx +; NOCMOV-NEXT: movl (%edx), %edx ; NOCMOV-NEXT: movl (%esi), %esi ; NOCMOV-NEXT: leal {{[0-9]+}}(%esp), %edi ; NOCMOV-NEXT: jne .LBB4_12 diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll --- a/llvm/test/CodeGen/X86/select.ll +++ b/llvm/test/CodeGen/X86/select.ll @@ -557,63 +557,59 @@ ; MCU-NEXT: testb $1, %al ; MCU-NEXT: jne .LBB7_1 ; MCU-NEXT: # %bb.2: -; MCU-NEXT: leal {{[0-9]+}}(%esp), %eax -; MCU-NEXT: movl (%eax), %eax +; MCU-NEXT: leal {{[0-9]+}}(%esp), %edi ; MCU-NEXT: je .LBB7_5 ; MCU-NEXT: .LBB7_4: ; MCU-NEXT: leal {{[0-9]+}}(%esp), %ecx -; MCU-NEXT: movl (%ecx), %ecx ; MCU-NEXT: je .LBB7_8 ; MCU-NEXT: .LBB7_7: ; MCU-NEXT: leal {{[0-9]+}}(%esp), %esi -; MCU-NEXT: movl (%esi), %esi ; 
MCU-NEXT: je .LBB7_11 ; MCU-NEXT: .LBB7_10: -; MCU-NEXT: leal {{[0-9]+}}(%esp), %edi -; MCU-NEXT: movl (%edi), %edi +; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp ; MCU-NEXT: je .LBB7_14 ; MCU-NEXT: .LBB7_13: -; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebx -; MCU-NEXT: movl (%ebx), %ebx -; MCU-NEXT: je .LBB7_17 -; MCU-NEXT: .LBB7_16: -; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp -; MCU-NEXT: jmp .LBB7_18 -; MCU-NEXT: .LBB7_1: ; MCU-NEXT: leal {{[0-9]+}}(%esp), %eax -; MCU-NEXT: movl (%eax), %eax +; MCU-NEXT: jmp .LBB7_15 +; MCU-NEXT: .LBB7_1: +; MCU-NEXT: leal {{[0-9]+}}(%esp), %edi ; MCU-NEXT: jne .LBB7_4 ; MCU-NEXT: .LBB7_5: ; MCU-NEXT: leal {{[0-9]+}}(%esp), %ecx -; MCU-NEXT: movl (%ecx), %ecx ; MCU-NEXT: jne .LBB7_7 ; MCU-NEXT: .LBB7_8: ; MCU-NEXT: leal {{[0-9]+}}(%esp), %esi -; MCU-NEXT: movl (%esi), %esi ; MCU-NEXT: jne .LBB7_10 ; MCU-NEXT: .LBB7_11: -; MCU-NEXT: leal {{[0-9]+}}(%esp), %edi -; MCU-NEXT: movl (%edi), %edi +; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp ; MCU-NEXT: jne .LBB7_13 ; MCU-NEXT: .LBB7_14: -; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebx -; MCU-NEXT: movl (%ebx), %ebx +; MCU-NEXT: leal {{[0-9]+}}(%esp), %eax +; MCU-NEXT: .LBB7_15: +; MCU-NEXT: movl (%edi), %ebx +; MCU-NEXT: movl (%ecx), %edi +; MCU-NEXT: movl (%esi), %esi +; MCU-NEXT: movl (%ebp), %ecx +; MCU-NEXT: movl (%eax), %eax ; MCU-NEXT: jne .LBB7_16 -; MCU-NEXT: .LBB7_17: +; MCU-NEXT: # %bb.17: +; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp +; MCU-NEXT: jmp .LBB7_18 +; MCU-NEXT: .LBB7_16: ; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp ; MCU-NEXT: .LBB7_18: ; MCU-NEXT: movl (%ebp), %ebp ; MCU-NEXT: decl %ebp -; MCU-NEXT: decl %ebx -; MCU-NEXT: decl %edi -; MCU-NEXT: decl %esi -; MCU-NEXT: decl %ecx ; MCU-NEXT: decl %eax -; MCU-NEXT: movl %eax, 20(%edx) -; MCU-NEXT: movl %ecx, 16(%edx) +; MCU-NEXT: decl %ecx +; MCU-NEXT: decl %esi +; MCU-NEXT: decl %edi +; MCU-NEXT: decl %ebx +; MCU-NEXT: movl %ebx, 20(%edx) +; MCU-NEXT: movl %edi, 16(%edx) ; MCU-NEXT: movl %esi, 12(%edx) -; MCU-NEXT: movl %edi, 8(%edx) -; MCU-NEXT: movl 
%ebx, 4(%edx) +; MCU-NEXT: movl %ecx, 8(%edx) +; MCU-NEXT: movl %eax, 4(%edx) ; MCU-NEXT: movl %ebp, (%edx) ; MCU-NEXT: popl %esi ; MCU-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -4361,7 +4361,6 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE2-LABEL: uitofp_load_4i64_to_4f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm2 ; SSE2-NEXT: movdqa 16(%rdi), %xmm0 ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: testq %rax, %rax @@ -4377,6 +4376,7 @@ ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: addss %xmm1, %xmm1 ; SSE2-NEXT: .LBB83_3: +; SSE2-NEXT: movdqa (%rdi), %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: testq %rax, %rax @@ -4710,40 +4710,38 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE2-LABEL: uitofp_load_8i64_to_8f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm5 ; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 -; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_1 ; SSE2-NEXT: # %bb.2: -; SSE2-NEXT: cvtsi2ss %rax, %xmm3 +; SSE2-NEXT: cvtsi2ss %rax, %xmm2 ; SSE2-NEXT: jmp .LBB87_3 ; SSE2-NEXT: .LBB87_1: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm3 -; SSE2-NEXT: addss %xmm3, %xmm3 +; SSE2-NEXT: cvtsi2ss %rax, %xmm2 +; SSE2-NEXT: addss %xmm2, %xmm2 ; SSE2-NEXT: .LBB87_3: +; SSE2-NEXT: movdqa (%rdi), %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_4 ; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: cvtsi2ss %rax, %xmm4 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: jmp .LBB87_6 ; SSE2-NEXT: .LBB87_4: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: 
shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm4 -; SSE2-NEXT: addss %xmm4, %xmm4 +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 ; SSE2-NEXT: .LBB87_6: -; SSE2-NEXT: movq %xmm5, %rax +; SSE2-NEXT: movq %xmm3, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_7 ; SSE2-NEXT: # %bb.8: @@ -4759,55 +4757,59 @@ ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 ; SSE2-NEXT: addss %xmm0, %xmm0 ; SSE2-NEXT: .LBB87_9: -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; SSE2-NEXT: movq %xmm5, %rax +; SSE2-NEXT: movdqa 48(%rdi), %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; SSE2-NEXT: movq %xmm3, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_10 ; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: cvtsi2ss %rax, %xmm6 +; SSE2-NEXT: cvtsi2ss %rax, %xmm4 ; SSE2-NEXT: jmp .LBB87_12 ; SSE2-NEXT: .LBB87_10: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm6 -; SSE2-NEXT: addss %xmm6, %xmm6 +; SSE2-NEXT: cvtsi2ss %rax, %xmm4 +; SSE2-NEXT: addss %xmm4, %xmm4 ; SSE2-NEXT: .LBB87_12: -; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: movq %xmm6, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_13 ; SSE2-NEXT: # %bb.14: -; SSE2-NEXT: xorps %xmm5, %xmm5 -; SSE2-NEXT: cvtsi2ss %rax, %xmm5 +; SSE2-NEXT: xorps %xmm3, %xmm3 +; SSE2-NEXT: cvtsi2ss %rax, %xmm3 ; SSE2-NEXT: jmp .LBB87_15 ; SSE2-NEXT: .LBB87_13: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm5, %xmm5 -; SSE2-NEXT: cvtsi2ss %rax, %xmm5 -; SSE2-NEXT: addss %xmm5, %xmm5 +; SSE2-NEXT: xorps %xmm3, %xmm3 +; SSE2-NEXT: cvtsi2ss %rax, %xmm3 +; SSE2-NEXT: addss %xmm3, %xmm3 ; SSE2-NEXT: .LBB87_15: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: movdqa 32(%rdi), %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] +; SSE2-NEXT: movq 
%xmm6, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_16 ; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: cvtsi2ss %rax, %xmm7 +; SSE2-NEXT: xorps %xmm6, %xmm6 +; SSE2-NEXT: cvtsi2ss %rax, %xmm6 ; SSE2-NEXT: jmp .LBB87_18 ; SSE2-NEXT: .LBB87_16: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm7 -; SSE2-NEXT: addss %xmm7, %xmm7 +; SSE2-NEXT: xorps %xmm6, %xmm6 +; SSE2-NEXT: cvtsi2ss %rax, %xmm6 +; SSE2-NEXT: addss %xmm6, %xmm6 ; SSE2-NEXT: .LBB87_18: -; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: movq %xmm5, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_19 ; SSE2-NEXT: # %bb.20: @@ -4823,9 +4825,9 @@ ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: addss %xmm1, %xmm1 ; SSE2-NEXT: .LBB87_21: -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE2-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] ; SSE2-NEXT: movq %xmm2, %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_22 @@ -4843,7 +4845,7 @@ ; SSE2-NEXT: addss %xmm2, %xmm2 ; SSE2-NEXT: .LBB87_24: ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: uitofp_load_8i64_to_8f32: