diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1616,11 +1616,11 @@
     // Add any operands of the new node which have not yet been combined to the
     // worklist as well. Because the worklist uniques things already, this
     // won't repeatedly process the same operand.
-    CombinedNodes.insert(N);
     for (const SDValue &ChildN : N->op_values())
       if (!CombinedNodes.count(ChildN.getNode()))
        AddToWorklist(ChildN.getNode());
+    CombinedNodes.insert(N);
 
     SDValue RV = combine(N);
 
     if (!RV.getNode())
@@ -1654,10 +1654,8 @@
     // out), because re-visiting the EntryToken and its users will not uncover
     // any additional opportunities, but there may be a large number of such
     // users, potentially causing compile time explosion.
-    if (RV.getOpcode() != ISD::EntryToken) {
-      AddToWorklist(RV.getNode());
-      AddUsersToWorklist(RV.getNode());
-    }
+    if (RV.getOpcode() != ISD::EntryToken)
+      AddToWorklistWithUsers(RV.getNode());
 
     // Finally, if the node is now dead, remove it from the graph. The node
     // may not be dead if the replacement process recursively simplified to
diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
--- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -268,10 +268,9 @@
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64
 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; GCN-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
 ; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
 ; GCN-NEXT: s_endpgm
@@ -282,12 +281,11 @@
 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: global_load_dword v3, v2, s[2:3]
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3
-; GFX9-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX9-NEXT: v_subbrev_co_u32_e32 v0, vcc, 0, v3, vcc
+; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
 ; GFX9-NEXT: v_add_u32_e32 v0, 0x64, v0
 ; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
 ; GFX9-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
@@ -23,8 +23,8 @@
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x24
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 1.0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 4.0, 2.0, s[0:1]
+; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 1.0
+; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1]
 ; GCN-NEXT: flat_store_dword v[0:1], v0
 ; GCN-NEXT: s_endpgm
 %c1 = fcmp olt float %x, 1.0
@@ -40,8 +40,8 @@
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x24
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 1.0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 4.0, 2.0, s[0:1]
+; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 1.0
+; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1]
 ; GCN-NEXT: flat_store_dword v[0:1], v0
 ; GCN-NEXT: s_endpgm
%c1 = fcmp olt float %x, 1.0 diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll --- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll @@ -224,12 +224,12 @@ ; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v6 offset:5 ; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v1 ; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v2 offset:1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v8 offset:6 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v0 offset:7 ; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v3 offset:2 ; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v4 offset:3 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v8 offset:6 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v0 offset:7 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds8align1: @@ -296,17 +296,17 @@ ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:4 ; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:6 -; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:2 +; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 +; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:6 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) ; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v1 offset:4 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) ; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v2 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v3 offset:6 +; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v3 offset:2 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v0 offset:2 +; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v0 offset:6 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds8align2: @@ -395,24 +395,22 @@ ; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10 ; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:11 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v12, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v5 offset:4 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v6 offset:5 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v9 offset:8 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v10 offset:9 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v11 offset:10 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v0 offset:11 +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v5 offset:4 +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v6 offset:5 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v1 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v2 offset:1 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v3 offset:2 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v4 offset:3 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v7 offset:6 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v8 offset:7 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v11 offset:10 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v0 offset:11 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds12align1: @@ -494,23 +492,23 @@ ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 
offset:2 -; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:4 -; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:8 -; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:10 -; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:6 +; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:8 +; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 +; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 +; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4 +; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v6, s1 +; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:10 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1 offset:8 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v3 offset:4 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v4 offset:8 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v5 offset:10 -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1 -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v2 offset:2 +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v4 offset:4 +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v2 +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v3 offset:2 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v0 offset:6 +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v5 offset:6 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v0 offset:10 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds12align2: @@ -695,30 +693,25 @@ ; ALIGNED-SDAG-NEXT: ds_read_u8 v15, v0 offset:14 ; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:15 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v16, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v3 offset:2 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v4 offset:3 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v13 offset:12 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v14 offset:13 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v1 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v2 offset:1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v7 offset:6 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v8 offset:7 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v5 offset:4 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v6 offset:5 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v11 offset:10 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v12 offset:11 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v9 offset:8 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v10 offset:9 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v3 offset:2 +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v4 offset:3 +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v7 offset:6 +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v8 offset:7 +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v11 offset:10 +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v12 offset:11 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(14) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v15 offset:14 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v0 offset:15 -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v13 offset:12 -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v14 offset:13 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds16align1: @@ -816,27 +809,29 @@ ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: 
v_mov_b32_e32 v0, s0 +; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:12 ; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 ; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4 ; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6 ; ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:8 ; ALIGNED-SDAG-NEXT: ds_read_u16 v7, v0 offset:10 -; ALIGNED-SDAG-NEXT: ds_read_u16 v8, v0 offset:12 +; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v8, s1 ; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:14 -; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v3 offset:2 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v2 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v5 offset:6 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v4 offset:4 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v7 offset:10 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v6 offset:8 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v1 offset:12 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v2 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v0 offset:14 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v8 offset:12 +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v4 offset:4 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v6 offset:8 +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v3 offset:2 +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v5 offset:6 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v7 offset:10 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v0 offset:14 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds16align2: diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -657,20 +657,20 @@ ; CI-NEXT: ds_write_b8 v0, v1 offset:5 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; CI-NEXT: ds_write_b8 v0, v2 offset:13 ; CI-NEXT: ds_write_b8 v0, v1 offset:9 +; CI-NEXT: ds_write_b8 v0, v2 offset:13 ; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 ; CI-NEXT: ds_write_b8 v0, v3 offset:8 ; CI-NEXT: ds_write_b8 v0, v4 offset:7 ; CI-NEXT: ds_write_b8 v0, v5 offset:6 -; CI-NEXT: ds_write_b8 v0, v1 offset:16 -; CI-NEXT: ds_write_b8 v0, v6 offset:15 -; CI-NEXT: ds_write_b8 v0, v2 offset:14 ; CI-NEXT: ds_write_b8 v0, v3 offset:12 ; CI-NEXT: ds_write_b8 v0, v4 offset:11 ; CI-NEXT: ds_write_b8 v0, v5 offset:10 +; CI-NEXT: ds_write_b8 v0, v1 offset:16 +; CI-NEXT: ds_write_b8 v0, v6 offset:15 +; CI-NEXT: ds_write_b8 v0, v2 offset:14 ; CI-NEXT: s_endpgm ; ; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: @@ -686,18 +686,18 @@ ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:5 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v1 offset:15 -; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:13 ; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:11 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:9 +; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v1 offset:15 +; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:13 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v0, 24, v1 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX9-ALIGNED-NEXT: 
ds_write_b8 v2, v3 offset:8 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:6 -; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:16 -; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:14 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:12 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:10 +; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:16 +; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:14 ; GFX9-ALIGNED-NEXT: s_endpgm ; ; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -1851,14 +1851,14 @@ ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_and_b32_e32 v3, 0xff00, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v6, 0xff00, v0 -; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 +; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_alignbit_b32 v3, s10, v3, 16 @@ -1866,8 +1866,8 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v5, v8, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v7, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v5, v8, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -2471,35 +2471,35 @@ ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v14, 15, v0 ; GFX7-NEXT: v_bfe_u32 v8, v2, 12, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v2 ; GFX7-NEXT: v_bfe_u32 v13, v0, 4, 4 ; GFX7-NEXT: v_bfe_u32 v15, v0, 12, 4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 +; GFX7-NEXT: v_bfe_u32 v3, v2, 20, 4 +; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 4 ; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 4 ; GFX7-NEXT: v_bfe_u32 v12, v0, 8, 4 +; GFX7-NEXT: v_alignbit_b32 v2, v9, v2, 24 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 24, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v15 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 ; GFX7-NEXT: v_alignbit_b32 v8, 0, v8, 24 -; GFX7-NEXT: v_alignbit_b32 v14, 0, v15, 24 +; GFX7-NEXT: v_alignbit_b32 v7, 0, v9, 24 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 -; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 4 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v2 ; GFX7-NEXT: v_bfe_u32 v11, v0, 16, 4 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 28, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v8, v14, v1 -; GFX7-NEXT: v_bfe_u32 v3, v2, 20, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v8, v7, v1 ; GFX7-NEXT: v_bfe_u32 v10, v0, 20, 4 -; GFX7-NEXT: v_alignbit_b32 v2, v9, v2, 24 ; GFX7-NEXT: v_alignbit_b32 v0, v16, v0, 24 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 8, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 8, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, 
v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v9, v7, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v15, v9, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll --- a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll @@ -30,22 +30,22 @@ define amdgpu_vs void @test_2(<4 x i32> inreg %arg1, i32 %arg2, i32 inreg %arg3, <8 x float> addrspace(3)* %arg4) { ; CHECK-LABEL: test_2: ; CHECK: ; %bb.0: -; CHECK-NEXT: v_add_i32_e32 v5, vcc, 28, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 24, v1 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, 20, v1 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, 16, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 16, v1 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 28, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 24, v1 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, 12, v1 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, 8, v1 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, 4, v1 ; CHECK-NEXT: s_mov_b32 m0, -1 -; CHECK-NEXT: ds_read_b32 v4, v2 -; CHECK-NEXT: ds_read_b32 v3, v3 -; CHECK-NEXT: ds_read_b32 v2, v6 +; CHECK-NEXT: ds_read_b32 v2, v2 +; CHECK-NEXT: ds_read_b32 v5, v4 +; CHECK-NEXT: ds_read_b32 v4, v6 ; CHECK-NEXT: ds_read_b32 v9, v7 ; CHECK-NEXT: ds_read_b32 v8, v8 ; CHECK-NEXT: ds_read_b32 v7, v10 ; CHECK-NEXT: ds_read_b32 v6, v1 -; CHECK-NEXT: ds_read_b32 v5, v5 +; CHECK-NEXT: ds_read_b32 v3, v3 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) ; CHECK-NEXT: tbuffer_store_format_xyzw v[6:9], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen glc slc ; CHECK-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -83,42 +83,42 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 -; GFX9-NEXT: ds_write_b8 v0, v2 offset:12 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:14 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: ds_write_b8 v0, v1 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:12 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:14 +; GFX9-NEXT: ds_write_b8 v0, v2 offset:8 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:10 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshr_b32 s0, s6, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:4 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s6, 24 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_lshr_b32 s0, s7, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 +; GFX9-NEXT: ds_write_b8 v0, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s7, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:13 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 8 +; GFX9-NEXT: s_lshr_b32 s0, s6, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:15 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 24 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX9-NEXT: s_lshr_b32 s0, s6, 24 +; 
GFX9-NEXT: ds_write_b8 v0, v1 offset:9 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s5, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s5, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 24 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align1: @@ -128,50 +128,50 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s3 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX7-NEXT: ds_write_b8 v0, v2 offset:12 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_write_b8 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:12 +; GFX7-NEXT: ds_write_b8 v0, v2 offset:8 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_lshr_b32 s4, s2, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s4, s3, 8 +; GFX7-NEXT: ds_write_b8 v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_lshr_b32 s4, s2, 24 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX7-NEXT: s_lshr_b32 s4, s3, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:13 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_lshr_b32 s3, s3, 16 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:15 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshr_b32 s3, s2, 8 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:14 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshr_b32 s3, s2, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_lshr_b32 s2, s2, 16 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s3, 8 +; GFX7-NEXT: s_lshr_b32 s2, s1, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:10 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s3, 24 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:13 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s3, 16 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:15 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s0, 8 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:14 +; GFX7-NEXT: s_lshr_b32 s2, s1, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s0, 24 +; GFX7-NEXT: s_lshr_b32 s1, s1, 16 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshr_b32 s1, s0, 8 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:6 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshr_b32 s1, s0, 24 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:1 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:2 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 24 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 16 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:6 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: 
store_lds_v4i32_align1: @@ -181,50 +181,50 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX6-NEXT: ds_write_b8 v0, v2 offset:12 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_write_b8 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:12 +; GFX6-NEXT: ds_write_b8 v0, v2 offset:8 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_lshr_b32 s4, s2, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:4 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s4, s3, 8 +; GFX6-NEXT: ds_write_b8 v0, v1 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_lshr_b32 s4, s2, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX6-NEXT: s_lshr_b32 s4, s3, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:13 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_lshr_b32 s3, s3, 16 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:15 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_lshr_b32 s3, s2, 8 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:14 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_lshr_b32 s3, s2, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s3, 8 +; GFX6-NEXT: s_lshr_b32 s2, s1, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:10 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s3, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:13 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s3, 16 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:15 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s0, 8 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:14 +; GFX6-NEXT: s_lshr_b32 s2, s1, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s0, 24 +; GFX6-NEXT: s_lshr_b32 s1, s1, 16 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s1, s0, 8 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:6 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s1, s0, 24 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:1 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:2 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 16 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:6 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v4i32_align1: @@ -234,42 +234,42 @@ ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: s_lshr_b32 s3, s7, 24 -; GFX10-NEXT: v_mov_b32_e32 v2, s7 -; GFX10-NEXT: s_lshr_b32 s0, s6, 8 -; GFX10-NEXT: s_lshr_b32 s1, s6, 24 -; GFX10-NEXT: s_lshr_b32 s6, s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_lshr_b32 s2, s7, 8 -; GFX10-NEXT: s_lshr_b32 s4, s4, 24 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: s_lshr_b32 s3, s6, 24 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: s_lshr_b32 s0, s7, 8 +; 
GFX10-NEXT: s_lshr_b32 s2, s6, 8 +; GFX10-NEXT: s_lshr_b32 s6, s5, 8 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: s_lshr_b32 s1, s7, 24 +; GFX10-NEXT: s_lshr_b32 s5, s5, 24 ; GFX10-NEXT: v_mov_b32_e32 v8, s3 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: v_mov_b32_e32 v9, s6 -; GFX10-NEXT: s_lshr_b32 s0, s5, 8 -; GFX10-NEXT: v_mov_b32_e32 v4, s5 +; GFX10-NEXT: s_lshr_b32 s0, s4, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: v_mov_b32_e32 v6, s1 ; GFX10-NEXT: v_mov_b32_e32 v7, s2 -; GFX10-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 -; GFX10-NEXT: ds_write_b8 v0, v2 offset:12 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:14 -; GFX10-NEXT: ds_write_b8 v0, v3 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:2 -; GFX10-NEXT: ds_write_b8 v0, v4 offset:4 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v4 offset:6 -; GFX10-NEXT: ds_write_b8 v0, v5 offset:9 -; GFX10-NEXT: ds_write_b8 v0, v6 offset:11 -; GFX10-NEXT: ds_write_b8 v0, v7 offset:13 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshr_b32 s1, s5, 24 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:12 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:14 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:8 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:10 +; GFX10-NEXT: ds_write_b8 v0, v3 offset:4 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:6 +; GFX10-NEXT: ds_write_b8 v0, v4 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v4 offset:2 +; GFX10-NEXT: ds_write_b8 v0, v5 offset:13 +; GFX10-NEXT: ds_write_b8 v0, v6 offset:15 +; GFX10-NEXT: ds_write_b8 v0, v7 offset:9 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_lshr_b32 s1, s4, 24 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: ds_write_b8 v0, v8 offset:15 -; GFX10-NEXT: ds_write_b8 v0, v9 offset:1 -; GFX10-NEXT: ds_write_b8 v0, v1 offset:3 -; GFX10-NEXT: ds_write_b8 v0, v2 offset:5 -; GFX10-NEXT: ds_write_b8 v0, v3 offset:7 +; GFX10-NEXT: ds_write_b8 v0, v8 offset:11 +; GFX10-NEXT: ds_write_b8 v0, v9 offset:5 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:1 +; GFX10-NEXT: ds_write_b8 v0, v3 offset:3 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32_align1: @@ -278,38 +278,37 @@ ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: s_lshr_b32 s4, s2, 8 -; GFX11-NEXT: s_lshr_b32 s5, s3, 8 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_lshr_b32 s4, s3, 8 ; GFX11-NEXT: s_lshr_b32 s3, s3, 24 -; GFX11-NEXT: s_lshr_b32 s6, s0, 8 -; GFX11-NEXT: s_lshr_b32 s0, s0, 24 -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s4 +; GFX11-NEXT: s_lshr_b32 s5, s2, 8 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: s_lshr_b32 s2, s2, 24 -; GFX11-NEXT: s_lshr_b32 s7, s1, 8 -; GFX11-NEXT: v_dual_mov_b32 v8, s3 :: v_dual_mov_b32 v9, s6 -; GFX11-NEXT: v_mov_b32_e32 v10, s0 -; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s5 -; GFX11-NEXT: ds_store_b8 v0, v1 offset:8 -; GFX11-NEXT: ds_store_b8 v0, v3 -; GFX11-NEXT: ds_store_b8_d16_hi v0, v1 offset:10 -; GFX11-NEXT: ds_store_b8 v0, v5 offset:9 -; GFX11-NEXT: ds_store_b8 v0, v2 offset:12 -; GFX11-NEXT: ds_store_b8 v0, v6 offset:11 -; GFX11-NEXT: ds_store_b8 v0, v7 offset:13 -; GFX11-NEXT: ds_store_b8_d16_hi v0, v2 offset:14 -; GFX11-NEXT: ds_store_b8 v0, v8 offset:15 -; 
GFX11-NEXT: v_mov_b32_e32 v1, s7 -; GFX11-NEXT: s_lshr_b32 s0, s1, 24 -; GFX11-NEXT: ds_store_b8_d16_hi v0, v3 offset:2 -; GFX11-NEXT: ds_store_b8 v0, v9 offset:1 -; GFX11-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-NEXT: ds_store_b8 v0, v4 offset:4 -; GFX11-NEXT: ds_store_b8 v0, v10 offset:3 -; GFX11-NEXT: ds_store_b8 v0, v1 offset:5 -; GFX11-NEXT: ds_store_b8_d16_hi v0, v4 offset:6 -; GFX11-NEXT: ds_store_b8 v0, v2 offset:7 +; GFX11-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s5 +; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s6 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s4 +; GFX11-NEXT: s_lshr_b32 s1, s1, 24 +; GFX11-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: ds_store_b8 v0, v2 offset:8 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v2 offset:10 +; GFX11-NEXT: ds_store_b8 v0, v1 offset:12 +; GFX11-NEXT: ds_store_b8 v0, v4 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v4 offset:2 +; GFX11-NEXT: ds_store_b8 v0, v3 offset:4 +; GFX11-NEXT: ds_store_b8 v0, v5 offset:13 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v1 offset:14 +; GFX11-NEXT: ds_store_b8 v0, v6 offset:15 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v10, s7 +; GFX11-NEXT: v_mov_b32_e32 v11, s0 +; GFX11-NEXT: ds_store_b8 v0, v7 offset:9 +; GFX11-NEXT: ds_store_b8 v0, v8 offset:11 +; GFX11-NEXT: ds_store_b8 v0, v9 offset:5 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v3 offset:6 +; GFX11-NEXT: ds_store_b8 v0, v1 offset:7 +; GFX11-NEXT: ds_store_b8 v0, v10 offset:1 +; GFX11-NEXT: ds_store_b8 v0, v11 offset:3 ; GFX11-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1 ret void @@ -322,18 +321,18 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 -; GFX9-NEXT: ds_write_b16 v0, v2 offset:12 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:14 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: ds_write_b16 v0, v1 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:12 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:14 +; GFX9-NEXT: ds_write_b16 v0, v2 offset:8 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:10 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align2: @@ -343,26 +342,26 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s3 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX7-NEXT: ds_write_b16 v0, v2 offset:12 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:12 +; GFX7-NEXT: ds_write_b16 v0, v2 offset:8 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s3, s3, 16 ; GFX7-NEXT: ds_write_b16 v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_lshr_b32 s2, s2, 16 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:14 ; 
GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s3, 16 +; GFX7-NEXT: s_lshr_b32 s1, s1, 16 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:10 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:14 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 16 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:2 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v4i32_align2: @@ -372,26 +371,26 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX6-NEXT: ds_write_b16 v0, v2 offset:12 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:12 +; GFX6-NEXT: ds_write_b16 v0, v2 offset:8 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s3, s3, 16 ; GFX6-NEXT: ds_write_b16 v0, v1 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:14 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s3, 16 +; GFX6-NEXT: s_lshr_b32 s1, s1, 16 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:10 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:14 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 16 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:2 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v4i32_align2: @@ -401,18 +400,18 @@ ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: v_mov_b32_e32 v2, s7 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: v_mov_b32_e32 v4, s5 -; GFX10-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 -; GFX10-NEXT: ds_write_b16 v0, v2 offset:12 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:14 -; GFX10-NEXT: ds_write_b16 v0, v3 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:2 -; GFX10-NEXT: ds_write_b16 v0, v4 offset:4 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v4 offset:6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: ds_write_b16 v0, v1 offset:12 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:14 +; GFX10-NEXT: ds_write_b16 v0, v2 offset:8 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:10 +; GFX10-NEXT: ds_write_b16 v0, v3 offset:4 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:6 +; GFX10-NEXT: ds_write_b16 v0, v4 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v4 offset:2 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32_align2: @@ -421,17 +420,17 @@ ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_mov_b32_e32 v4, s1 -; GFX11-NEXT: 
ds_store_b16_d16_hi v0, v1 offset:10 -; GFX11-NEXT: ds_store_b16 v0, v2 offset:12 -; GFX11-NEXT: ds_store_b16_d16_hi v0, v2 offset:14 -; GFX11-NEXT: ds_store_b16 v0, v3 -; GFX11-NEXT: ds_store_b16_d16_hi v0, v3 offset:2 -; GFX11-NEXT: ds_store_b16 v0, v4 offset:4 -; GFX11-NEXT: ds_store_b16 v0, v1 offset:8 -; GFX11-NEXT: ds_store_b16_d16_hi v0, v4 offset:6 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_mov_b32_e32 v4, s2 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v1 offset:14 +; GFX11-NEXT: ds_store_b16 v0, v2 +; GFX11-NEXT: ds_store_b16 v0, v3 offset:4 +; GFX11-NEXT: ds_store_b16 v0, v4 offset:8 +; GFX11-NEXT: ds_store_b16 v0, v1 offset:12 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v4 offset:10 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v3 offset:6 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v2 offset:2 ; GFX11-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2 ret void diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -80,32 +80,32 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 -; GFX9-NEXT: ds_write_b8 v0, v2 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_lshr_b32 s0, s6, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:4 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 +; GFX9-NEXT: ds_write_b8 v0, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s6, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 24 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s5, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s5, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 24 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align1: @@ -116,12 +116,12 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX7-NEXT: ds_write_b8 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_lshr_b32 s3, s2, 8 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:4 +; GFX7-NEXT: ds_write_b8 v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_lshr_b32 s3, s2, 24 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:9 @@ -129,25 +129,25 @@ ; GFX7-NEXT: s_lshr_b32 s2, s2, 16 ; GFX7-NEXT: 
ds_write_b8 v0, v1 offset:11 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s0, 8 +; GFX7-NEXT: s_lshr_b32 s2, s1, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:10 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s0, 24 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX7-NEXT: s_lshr_b32 s2, s1, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_lshr_b32 s1, s1, 16 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshr_b32 s1, s0, 8 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:6 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshr_b32 s1, s0, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:2 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 24 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 16 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:6 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v3i32_align1: @@ -158,12 +158,12 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX6-NEXT: ds_write_b8 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_lshr_b32 s3, s2, 8 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:4 +; GFX6-NEXT: ds_write_b8 v0, v1 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_lshr_b32 s3, s2, 24 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:9 @@ -171,25 +171,25 @@ ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s0, 8 +; GFX6-NEXT: s_lshr_b32 s2, s1, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:10 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s0, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX6-NEXT: s_lshr_b32 s2, s1, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_lshr_b32 s1, s1, 16 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s1, s0, 8 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:6 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s1, s0, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:2 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 16 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:6 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v3i32_align1: @@ -200,32 +200,32 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_lshr_b32 s0, s6, 8 ; 
GFX10-NEXT: s_lshr_b32 s1, s6, 24 -; GFX10-NEXT: s_lshr_b32 s2, s4, 8 -; GFX10-NEXT: s_lshr_b32 s3, s4, 24 -; GFX10-NEXT: s_lshr_b32 s4, s5, 8 -; GFX10-NEXT: s_lshr_b32 s5, s5, 24 +; GFX10-NEXT: s_lshr_b32 s2, s5, 8 +; GFX10-NEXT: s_lshr_b32 s3, s5, 24 +; GFX10-NEXT: s_lshr_b32 s5, s4, 8 +; GFX10-NEXT: s_lshr_b32 s4, s4, 24 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-NEXT: v_mov_b32_e32 v7, s3 -; GFX10-NEXT: v_mov_b32_e32 v8, s4 -; GFX10-NEXT: v_mov_b32_e32 v9, s5 +; GFX10-NEXT: v_mov_b32_e32 v8, s5 +; GFX10-NEXT: v_mov_b32_e32 v9, s4 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:8 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 -; GFX10-NEXT: ds_write_b8 v0, v2 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 -; GFX10-NEXT: ds_write_b8 v0, v3 offset:4 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:6 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 +; GFX10-NEXT: ds_write_b8 v0, v3 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:2 ; GFX10-NEXT: ds_write_b8 v0, v4 offset:9 ; GFX10-NEXT: ds_write_b8 v0, v5 offset:11 -; GFX10-NEXT: ds_write_b8 v0, v6 offset:1 -; GFX10-NEXT: ds_write_b8 v0, v7 offset:3 -; GFX10-NEXT: ds_write_b8 v0, v8 offset:5 -; GFX10-NEXT: ds_write_b8 v0, v9 offset:7 +; GFX10-NEXT: ds_write_b8 v0, v6 offset:5 +; GFX10-NEXT: ds_write_b8 v0, v7 offset:7 +; GFX10-NEXT: ds_write_b8 v0, v8 offset:1 +; GFX10-NEXT: ds_write_b8 v0, v9 offset:3 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align1: @@ -235,29 +235,28 @@ ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s0 ; GFX11-NEXT: s_lshr_b32 s3, s2, 8 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: s_lshr_b32 s2, s2, 24 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s2 -; GFX11-NEXT: s_lshr_b32 s4, s0, 8 -; GFX11-NEXT: s_lshr_b32 s0, s0, 24 -; GFX11-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-NEXT: s_lshr_b32 s4, s1, 8 ; GFX11-NEXT: s_lshr_b32 s1, s1, 24 -; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s0 -; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s1 +; GFX11-NEXT: s_lshr_b32 s5, s0, 8 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s2 +; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s1 +; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s0 ; GFX11-NEXT: ds_store_b8 v0, v1 offset:8 -; GFX11-NEXT: ds_store_b8 v0, v2 +; GFX11-NEXT: ds_store_b8 v0, v3 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v3 offset:2 +; GFX11-NEXT: ds_store_b8 v0, v2 offset:4 ; GFX11-NEXT: ds_store_b8 v0, v4 offset:9 ; GFX11-NEXT: ds_store_b8_d16_hi v0, v1 offset:10 ; GFX11-NEXT: ds_store_b8 v0, v5 offset:11 -; GFX11-NEXT: ds_store_b8_d16_hi v0, v2 offset:2 -; GFX11-NEXT: ds_store_b8 v0, v6 offset:1 -; GFX11-NEXT: ds_store_b8 v0, v3 offset:4 -; GFX11-NEXT: ds_store_b8 v0, v7 offset:3 -; GFX11-NEXT: ds_store_b8 v0, v8 offset:5 -; GFX11-NEXT: ds_store_b8_d16_hi v0, v3 offset:6 -; GFX11-NEXT: ds_store_b8 v0, v9 offset:7 +; GFX11-NEXT: ds_store_b8 v0, v6 offset:5 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v2 offset:6 +; GFX11-NEXT: ds_store_b8 v0, v7 offset:7 +; GFX11-NEXT: ds_store_b8 v0, v8 offset:1 +; GFX11-NEXT: ds_store_b8 v0, v9 offset:3 ; GFX11-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1 ret void @@ 
-271,14 +270,14 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 -; GFX9-NEXT: ds_write_b16 v0, v2 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:6 +; GFX9-NEXT: ds_write_b16 v0, v2 offset:4 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align2: @@ -289,20 +288,20 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX7-NEXT: ds_write_b16 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_write_b16 v0, v2 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_lshr_b32 s2, s2, 16 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX7-NEXT: ds_write_b16 v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 +; GFX7-NEXT: s_lshr_b32 s1, s1, 16 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:10 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshr_b32 s0, s0, 16 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 16 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:2 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v3i32_align2: @@ -313,20 +312,20 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX6-NEXT: ds_write_b16 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_write_b16 v0, v2 offset:4 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX6-NEXT: ds_write_b16 v0, v1 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s0, s0, 16 +; GFX6-NEXT: s_lshr_b32 s1, s1, 16 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:10 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s0, s0, 16 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 16 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:2 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v3i32_align2: @@ -337,14 +336,14 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 -; GFX10-NEXT: ds_write_b16 v0, v2 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:2 -; GFX10-NEXT: ds_write_b16 v0, v3 offset:4 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:6 +; GFX10-NEXT: ds_write_b16 v0, v2 offset:4 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:6 +; GFX10-NEXT: ds_write_b16 v0, v3 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:2 ; GFX10-NEXT: s_endpgm ; ; 
GFX11-LABEL: store_lds_v3i32_align2: @@ -357,10 +356,10 @@ ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: ds_store_b16_d16_hi v0, v1 offset:10 ; GFX11-NEXT: ds_store_b16 v0, v2 -; GFX11-NEXT: ds_store_b16_d16_hi v0, v2 offset:2 ; GFX11-NEXT: ds_store_b16 v0, v3 offset:4 ; GFX11-NEXT: ds_store_b16 v0, v1 offset:8 ; GFX11-NEXT: ds_store_b16_d16_hi v0, v3 offset:6 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v2 offset:2 ; GFX11-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2 ret void diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -234,8 +234,9 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s1, s0, 12 +; VI-NEXT: s_and_b32 s1, s0, 0xffff ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_i32 s1, s1, 12 ; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; VI-NEXT: s_or_b32 s0, s1, 4 ; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD diff --git a/llvm/test/CodeGen/ARM/addsubcarry-promotion.ll b/llvm/test/CodeGen/ARM/addsubcarry-promotion.ll --- a/llvm/test/CodeGen/ARM/addsubcarry-promotion.ll +++ b/llvm/test/CodeGen/ARM/addsubcarry-promotion.ll @@ -14,9 +14,8 @@ ; ARM-NEXT: adds r0, r1, r0 ; ARM-NEXT: movw r1, #65535 ; ARM-NEXT: sxth r2, r2 -; ARM-NEXT: adc r0, r2, #0 -; ARM-NEXT: uxth r0, r0 -; ARM-NEXT: cmp r0, r1 +; ARM-NEXT: adc r0, r2, #1 +; ARM-NEXT: tst r0, r1 ; ARM-NEXT: bxeq lr ; ARM-NEXT: .LBB0_1: @ %for.cond ; ARM-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -26,33 +25,25 @@ ; THUMBV6M: @ %bb.0: @ %entry ; THUMBV6M-NEXT: rsbs r2, r2, #0 ; THUMBV6M-NEXT: sxth r2, r2 -; THUMBV6M-NEXT: movs r3, #0 +; THUMBV6M-NEXT: movs r3, #1 ; THUMBV6M-NEXT: adds r0, r1, r0 ; THUMBV6M-NEXT: adcs r3, r2 -; THUMBV6M-NEXT: uxth r0, r3 -; THUMBV6M-NEXT: ldr r1, .LCPI0_0 -; THUMBV6M-NEXT: cmp r0, r1 +; THUMBV6M-NEXT: lsls r0, r3, #16 ; THUMBV6M-NEXT: beq .LBB0_2 ; THUMBV6M-NEXT: .LBB0_1: @ %for.cond ; THUMBV6M-NEXT: @ =>This Inner Loop Header: Depth=1 ; THUMBV6M-NEXT: b .LBB0_1 ; THUMBV6M-NEXT: .LBB0_2: @ %if.end ; THUMBV6M-NEXT: bx lr -; THUMBV6M-NEXT: .p2align 2 -; THUMBV6M-NEXT: @ %bb.3: -; THUMBV6M-NEXT: .LCPI0_0: -; THUMBV6M-NEXT: .long 65535 @ 0xffff ; ; THUMBV8M-BASE-LABEL: fn1: ; THUMBV8M-BASE: @ %bb.0: @ %entry ; THUMBV8M-BASE-NEXT: rsbs r2, r2, #0 ; THUMBV8M-BASE-NEXT: sxth r2, r2 -; THUMBV8M-BASE-NEXT: movs r3, #0 +; THUMBV8M-BASE-NEXT: movs r3, #1 ; THUMBV8M-BASE-NEXT: adds r0, r1, r0 ; THUMBV8M-BASE-NEXT: adcs r3, r2 -; THUMBV8M-BASE-NEXT: uxth r0, r3 -; THUMBV8M-BASE-NEXT: movw r1, #65535 -; THUMBV8M-BASE-NEXT: cmp r0, r1 +; THUMBV8M-BASE-NEXT: lsls r0, r3, #16 ; THUMBV8M-BASE-NEXT: beq .LBB0_2 ; THUMBV8M-BASE-NEXT: .LBB0_1: @ %for.cond ; THUMBV8M-BASE-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -64,11 +55,9 @@ ; THUMB: @ %bb.0: @ %entry ; THUMB-NEXT: rsbs r2, r2, #0 ; THUMB-NEXT: adds r0, r0, r1 -; THUMB-NEXT: movw r1, #65535 ; THUMB-NEXT: sxth r2, r2 -; THUMB-NEXT: adc r0, r2, #0 -; THUMB-NEXT: uxth r0, r0 -; THUMB-NEXT: cmp r0, r1 +; THUMB-NEXT: adc r0, r2, #1 +; THUMB-NEXT: lsls r0, r0, #16 ; THUMB-NEXT: it eq ; THUMB-NEXT: bxeq lr ; THUMB-NEXT: .LBB0_1: @ %for.cond diff --git a/llvm/test/CodeGen/ARM/icmp-shift-opt.ll b/llvm/test/CodeGen/ARM/icmp-shift-opt.ll --- 
a/llvm/test/CodeGen/ARM/icmp-shift-opt.ll +++ b/llvm/test/CodeGen/ARM/icmp-shift-opt.ll @@ -139,12 +139,11 @@ define i1 @opt_setcc_shl_ne_zero_i128(i128 %a) nounwind { ; CHECK-LABEL: opt_setcc_shl_ne_zero_i128: ; CHECK: @ %bb.0: -; CHECK-NEXT: orr r3, r1, r3 ; CHECK-NEXT: orr r0, r2, r0 -; CHECK-NEXT: orr r2, r0, r3 -; CHECK-NEXT: orr r0, r0, r1 -; CHECK-NEXT: lsr r0, r0, #15 -; CHECK-NEXT: orr r0, r0, r2, lsl #17 +; CHECK-NEXT: orr r0, r1, r0 +; CHECK-NEXT: orr r1, r0, r3 +; CHECK-NEXT: lsl r1, r1, #17 +; CHECK-NEXT: orr r0, r1, r0, lsr #15 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: movwne r0, #1 ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll b/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll --- a/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll +++ b/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll @@ -273,22 +273,22 @@ ; ASM32-LABEL: double_stack_va_arg: ; ASM32: # %bb.0: # %entry ; ASM32-NEXT: fadd 0, 1, 2 -; ASM32-NEXT: addi 4, 1, 128 -; ASM32-NEXT: lwz 3, 132(1) +; ASM32-NEXT: addi 3, 1, 128 +; ASM32-NEXT: lwz 4, 132(1) ; ASM32-NEXT: fadd 0, 0, 3 -; ASM32-NEXT: stw 4, -4(1) +; ASM32-NEXT: stw 3, -4(1) ; ASM32-NEXT: fadd 0, 0, 4 -; ASM32-NEXT: lwz 4, 128(1) +; ASM32-NEXT: lwz 3, 128(1) ; ASM32-NEXT: fadd 0, 0, 5 -; ASM32-NEXT: stw 3, -12(1) +; ASM32-NEXT: stw 3, -16(1) ; ASM32-NEXT: fadd 0, 0, 6 -; ASM32-NEXT: stw 4, -16(1) +; ASM32-NEXT: stw 4, -12(1) ; ASM32-NEXT: fadd 0, 0, 7 ; ASM32-NEXT: lfd 1, -16(1) ; ASM32-NEXT: fadd 0, 0, 8 -; ASM32-NEXT: stw 3, -20(1) +; ASM32-NEXT: stw 3, -24(1) ; ASM32-NEXT: fadd 0, 0, 9 -; ASM32-NEXT: stw 4, -24(1) +; ASM32-NEXT: stw 4, -20(1) ; ASM32-NEXT: fadd 0, 0, 10 ; ASM32-NEXT: fadd 0, 0, 11 ; ASM32-NEXT: fadd 0, 0, 12 @@ -386,3 +386,5 @@ ; 32BIT-DAG: STW renamable $r4, 0, %stack.2 :: (store (s32) into %stack.2, align 8) ; 32BIT-DAG: renamable $f1 = nofpexcept FADD killed renamable $f2, renamable $f2, implicit $rm ; 32BIT-DAG: BLR implicit $lr, implicit $rm, implicit $f1 +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; 32BIT: {{.*}} diff --git a/llvm/test/CodeGen/PowerPC/combine-fneg.ll b/llvm/test/CodeGen/PowerPC/combine-fneg.ll --- a/llvm/test/CodeGen/PowerPC/combine-fneg.ll +++ b/llvm/test/CodeGen/PowerPC/combine-fneg.ll @@ -13,10 +13,10 @@ ; CHECK-NEXT: xvredp 2, 0 ; CHECK-NEXT: xxswapd 1, 1 ; CHECK-NEXT: xxlor 3, 1, 1 -; CHECK-NEXT: xvnmsubadp 3, 0, 2 -; CHECK-NEXT: xvmaddadp 2, 2, 3 -; CHECK-NEXT: xvnmsubadp 1, 0, 2 -; CHECK-NEXT: xvnmaddadp 2, 2, 1 +; CHECK-NEXT: xvmaddadp 3, 0, 2 +; CHECK-NEXT: xvnmsubadp 2, 2, 3 +; CHECK-NEXT: xvmaddadp 1, 0, 2 +; CHECK-NEXT: xvmsubadp 2, 2, 1 ; CHECK-NEXT: xvmuldp 34, 34, 2 ; CHECK-NEXT: xvmuldp 35, 35, 2 ; CHECK-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/prefer-dqform.ll b/llvm/test/CodeGen/PowerPC/prefer-dqform.ll --- a/llvm/test/CodeGen/PowerPC/prefer-dqform.ll +++ b/llvm/test/CodeGen/PowerPC/prefer-dqform.ll @@ -14,14 +14,12 @@ ; CHECK-P9-LABEL: test: ; CHECK-P9: # %bb.0: # %test_entry ; CHECK-P9-NEXT: andi. 
r3, r6, 15 +; CHECK-P9-NEXT: li r3, 2 +; CHECK-P9-NEXT: li r10, 1 ; CHECK-P9-NEXT: lwz r4, 0(r4) ; CHECK-P9-NEXT: lwz r5, 0(r5) -; CHECK-P9-NEXT: li r11, 1 -; CHECK-P9-NEXT: addic r3, r3, -1 -; CHECK-P9-NEXT: subfe r10, r3, r3 -; CHECK-P9-NEXT: li r3, 2 -; CHECK-P9-NEXT: not r10, r10 -; CHECK-P9-NEXT: iseleq r3, r11, r3 +; CHECK-P9-NEXT: iseleq r3, r10, r3 +; CHECK-P9-NEXT: subfic r10, r3, 1 ; CHECK-P9-NEXT: add r4, r10, r4 ; CHECK-P9-NEXT: srawi r4, r4, 4 ; CHECK-P9-NEXT: addze r4, r4 @@ -68,14 +66,13 @@ ; ; CHECK-P10-LABEL: test: ; CHECK-P10: # %bb.0: # %test_entry -; CHECK-P10-NEXT: lwz r4, 0(r4) ; CHECK-P10-NEXT: andi. r3, r6, 15 ; CHECK-P10-NEXT: li r3, 2 ; CHECK-P10-NEXT: li r10, 1 +; CHECK-P10-NEXT: lwz r4, 0(r4) ; CHECK-P10-NEXT: lwz r5, 0(r5) ; CHECK-P10-NEXT: iseleq r3, r10, r3 -; CHECK-P10-NEXT: setnbc r10, eq -; CHECK-P10-NEXT: not r10, r10 +; CHECK-P10-NEXT: subfic r10, r3, 1 ; CHECK-P10-NEXT: add r4, r10, r4 ; CHECK-P10-NEXT: srawi r4, r4, 4 ; CHECK-P10-NEXT: addze r4, r4 diff --git a/llvm/test/CodeGen/PowerPC/select_const.ll b/llvm/test/CodeGen/PowerPC/select_const.ll --- a/llvm/test/CodeGen/PowerPC/select_const.ll +++ b/llvm/test/CodeGen/PowerPC/select_const.ll @@ -610,24 +610,13 @@ } define i8 @shl_constant_sel_constants(i1 %cond) { -; ISEL-LABEL: shl_constant_sel_constants: -; ISEL: # %bb.0: -; ISEL-NEXT: andi. 3, 3, 1 -; ISEL-NEXT: li 4, 4 -; ISEL-NEXT: li 3, 8 -; ISEL-NEXT: iselgt 3, 4, 3 -; ISEL-NEXT: blr -; -; NO_ISEL-LABEL: shl_constant_sel_constants: -; NO_ISEL: # %bb.0: -; NO_ISEL-NEXT: andi. 3, 3, 1 -; NO_ISEL-NEXT: li 4, 4 -; NO_ISEL-NEXT: li 3, 8 -; NO_ISEL-NEXT: bc 12, 1, .LBB37_1 -; NO_ISEL-NEXT: blr -; NO_ISEL-NEXT: .LBB37_1: -; NO_ISEL-NEXT: addi 3, 4, 0 -; NO_ISEL-NEXT: blr +; ALL-LABEL: shl_constant_sel_constants: +; ALL: # %bb.0: +; ALL-NEXT: clrlwi 3, 3, 31 +; ALL-NEXT: li 4, 1 +; ALL-NEXT: xori 3, 3, 3 +; ALL-NEXT: slw 3, 4, 3 +; ALL-NEXT: blr %sel = select i1 %cond, i8 2, i8 3 %bo = shl i8 1, %sel ret i8 %bo @@ -658,24 +647,13 @@ } define i8 @lshr_constant_sel_constants(i1 %cond) { -; ISEL-LABEL: lshr_constant_sel_constants: -; ISEL: # %bb.0: -; ISEL-NEXT: andi. 3, 3, 1 -; ISEL-NEXT: li 4, 16 -; ISEL-NEXT: li 3, 8 -; ISEL-NEXT: iselgt 3, 4, 3 -; ISEL-NEXT: blr -; -; NO_ISEL-LABEL: lshr_constant_sel_constants: -; NO_ISEL: # %bb.0: -; NO_ISEL-NEXT: andi. 3, 3, 1 -; NO_ISEL-NEXT: li 4, 16 -; NO_ISEL-NEXT: li 3, 8 -; NO_ISEL-NEXT: bc 12, 1, .LBB39_1 -; NO_ISEL-NEXT: blr -; NO_ISEL-NEXT: .LBB39_1: -; NO_ISEL-NEXT: addi 3, 4, 0 -; NO_ISEL-NEXT: blr +; ALL-LABEL: lshr_constant_sel_constants: +; ALL: # %bb.0: +; ALL-NEXT: clrlwi 3, 3, 31 +; ALL-NEXT: li 4, 64 +; ALL-NEXT: xori 3, 3, 3 +; ALL-NEXT: srw 3, 4, 3 +; ALL-NEXT: blr %sel = select i1 %cond, i8 2, i8 3 %bo = lshr i8 64, %sel ret i8 %bo @@ -685,7 +663,7 @@ define i8 @sel_constants_ashr_constant(i1 %cond) { ; ALL-LABEL: sel_constants_ashr_constant: ; ALL: # %bb.0: -; ALL-NEXT: clrldi 3, 3, 63 +; ALL-NEXT: clrlwi 3, 3, 31 ; ALL-NEXT: neg 3, 3 ; ALL-NEXT: blr %sel = select i1 %cond, i8 -4, i8 23 @@ -694,24 +672,13 @@ } define i8 @ashr_constant_sel_constants(i1 %cond) { -; ISEL-LABEL: ashr_constant_sel_constants: -; ISEL: # %bb.0: -; ISEL-NEXT: andi. 3, 3, 1 -; ISEL-NEXT: li 4, -32 -; ISEL-NEXT: li 3, -16 -; ISEL-NEXT: iselgt 3, 4, 3 -; ISEL-NEXT: blr -; -; NO_ISEL-LABEL: ashr_constant_sel_constants: -; NO_ISEL: # %bb.0: -; NO_ISEL-NEXT: andi. 
3, 3, 1 -; NO_ISEL-NEXT: li 4, -32 -; NO_ISEL-NEXT: li 3, -16 -; NO_ISEL-NEXT: bc 12, 1, .LBB41_1 -; NO_ISEL-NEXT: blr -; NO_ISEL-NEXT: .LBB41_1: -; NO_ISEL-NEXT: addi 3, 4, 0 -; NO_ISEL-NEXT: blr +; ALL-LABEL: ashr_constant_sel_constants: +; ALL: # %bb.0: +; ALL-NEXT: clrlwi 3, 3, 31 +; ALL-NEXT: li 4, -128 +; ALL-NEXT: xori 3, 3, 3 +; ALL-NEXT: sraw 3, 4, 3 +; ALL-NEXT: blr %sel = select i1 %cond, i8 2, i8 3 %bo = ashr i8 128, %sel ret i8 %bo diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll --- a/llvm/test/CodeGen/RISCV/mul.ll +++ b/llvm/test/CodeGen/RISCV/mul.ll @@ -1299,9 +1299,8 @@ ; RV32IM-NEXT: li a5, -63 ; RV32IM-NEXT: mulhu a6, a3, a5 ; RV32IM-NEXT: slli a7, a4, 6 -; RV32IM-NEXT: sub a7, a7, a4 -; RV32IM-NEXT: sub a6, a6, a7 -; RV32IM-NEXT: neg a7, a7 +; RV32IM-NEXT: sub a7, a4, a7 +; RV32IM-NEXT: add a6, a7, a6 ; RV32IM-NEXT: sltu a7, a6, a7 ; RV32IM-NEXT: mulhu t0, a4, a5 ; RV32IM-NEXT: add a7, t0, a7 @@ -1314,9 +1313,9 @@ ; RV32IM-NEXT: add t1, a7, t1 ; RV32IM-NEXT: sub t4, t1, a4 ; RV32IM-NEXT: slli t5, a1, 6 -; RV32IM-NEXT: sub t5, t5, a1 -; RV32IM-NEXT: add t5, t5, a3 -; RV32IM-NEXT: sub t6, t4, t5 +; RV32IM-NEXT: sub t5, a1, t5 +; RV32IM-NEXT: sub t5, t5, a3 +; RV32IM-NEXT: add t6, t4, t5 ; RV32IM-NEXT: sltu s0, t6, t4 ; RV32IM-NEXT: neg s1, a4 ; RV32IM-NEXT: sltu t4, t4, s1 @@ -1324,6 +1323,7 @@ ; RV32IM-NEXT: mulhu t1, a4, t2 ; RV32IM-NEXT: add a7, t1, a7 ; RV32IM-NEXT: add a7, a7, t4 +; RV32IM-NEXT: sltu t0, t5, t0 ; RV32IM-NEXT: slli t1, a2, 6 ; RV32IM-NEXT: sub a2, a2, t1 ; RV32IM-NEXT: mulhu a5, a1, a5 @@ -1332,9 +1332,7 @@ ; RV32IM-NEXT: sub a2, t3, a3 ; RV32IM-NEXT: sub a2, a2, a4 ; RV32IM-NEXT: add a1, a2, a1 -; RV32IM-NEXT: neg a2, t5 -; RV32IM-NEXT: sltu a2, a2, t0 -; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: add a1, a1, t0 ; RV32IM-NEXT: add a1, a7, a1 ; RV32IM-NEXT: add a1, a1, s0 ; RV32IM-NEXT: slli a2, a3, 6 diff --git a/llvm/test/CodeGen/SystemZ/pr36164.ll b/llvm/test/CodeGen/SystemZ/pr36164.ll --- a/llvm/test/CodeGen/SystemZ/pr36164.ll +++ b/llvm/test/CodeGen/SystemZ/pr36164.ll @@ -17,18 +17,18 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: lhi %r0, 1 ; CHECK-NEXT: larl %r1, g_938 -; CHECK-NEXT: lhi %r2, 0 +; CHECK-NEXT: lhi %r2, 3 ; CHECK-NEXT: lhi %r3, 4 ; CHECK-NEXT: larl %r4, g_11 ; CHECK-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: strl %r0, g_73 -; CHECK-NEXT: strl %r2, g_69 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 +; CHECK-NEXT: strl %r2, g_69 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -8,15 +8,17 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrd r12, r3, [r0] -; CHECK-NEXT: ldrd lr, r2, [r0, #8] +; CHECK-NEXT: ldrd lr, r12, [r0] +; CHECK-NEXT: ldrd r3, r2, [r0, #8] ; CHECK-NEXT: ldrd r4, r0, [r0, #16] -; CHECK-NEXT: vmov.32 q1[1], r3 -; CHECK-NEXT: vmov q1[2], q1[0], r12, lr -; CHECK-NEXT: strd r2, r0, [r1, #16] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: vmov q1[2], q1[0], lr, r3 +; CHECK-NEXT: str r2, [r1, #16] +; CHECK-NEXT: vmov.32 q0[0], r4 +; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 +; CHECK-NEXT: vmov.32 q0[1], r0 ; CHECK-NEXT: vmov.f32 s8, s4 ; CHECK-NEXT: vmov.f32 s9, s6 +; CHECK-NEXT: str 
r0, [r1, #20] ; CHECK-NEXT: vmov.f32 s10, s0 ; CHECK-NEXT: vmov.f32 s11, s5 ; CHECK-NEXT: vstrw.32 q2, [r1] diff --git a/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll b/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll --- a/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll +++ b/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll @@ -17,12 +17,12 @@ define dso_local i32 @main() nounwind uwtable { ; CHECK-LABEL: main: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq i(%rip), %rsi -; CHECK-NEXT: movq j(%rip), %rax -; CHECK-NEXT: movq %rsi, %rdx -; CHECK-NEXT: shrq $8, %rdx +; CHECK-NEXT: movl i(%rip), %esi +; CHECK-NEXT: movl j(%rip), %eax +; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: shrl $8, %edx ; CHECK-NEXT: movsbl %al, %ecx -; CHECK-NEXT: shrq $8, %rax +; CHECK-NEXT: shrl $8, %eax ; CHECK-NEXT: cbtw ; CHECK-NEXT: idivb %dl ; CHECK-NEXT: movl %eax, %edx diff --git a/llvm/test/CodeGen/X86/2012-08-07-CmpISelBug.ll b/llvm/test/CodeGen/X86/2012-08-07-CmpISelBug.ll --- a/llvm/test/CodeGen/X86/2012-08-07-CmpISelBug.ll +++ b/llvm/test/CodeGen/X86/2012-08-07-CmpISelBug.ll @@ -8,13 +8,11 @@ define void @foo(i8 %arg4, i32 %arg5, ptr %arg14) nounwind { ; CHECK-LABEL: foo: ; CHECK: ## %bb.0: ## %bb -; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi ; CHECK-NEXT: andl $32, %edi -; CHECK-NEXT: leal 13(%rdi), %eax -; CHECK-NEXT: xorb $-14, %al -; CHECK-NEXT: addb $82, %al -; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: movb $81, %al +; CHECK-NEXT: subb %dil, %al ; CHECK-NEXT: testl %esi, %edi +; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: movl $1, %ecx ; CHECK-NEXT: cmovnel %eax, %ecx ; CHECK-NEXT: xorb $81, %cl diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll --- a/llvm/test/CodeGen/X86/addcarry.ll +++ b/llvm/test/CodeGen/X86/addcarry.ll @@ -743,38 +743,34 @@ define i32 @add_U320_without_i128_add(ptr nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) nounwind { ; CHECK-LABEL: add_U320_without_i128_add: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movq 16(%rdi), %rax -; CHECK-NEXT: leaq (%rax,%rcx), %r10 +; CHECK-NEXT: movq 24(%rdi), %r10 +; CHECK-NEXT: movq 32(%rdi), %r11 ; CHECK-NEXT: addq %rsi, (%rdi) ; CHECK-NEXT: adcq %rdx, 8(%rdi) ; CHECK-NEXT: movq %rax, %rdx ; CHECK-NEXT: adcq %rcx, %rdx -; CHECK-NEXT: movq 24(%rdi), %r11 -; CHECK-NEXT: leaq (%r8,%r11), %r14 -; CHECK-NEXT: xorl %ebx, %ebx -; CHECK-NEXT: cmpq %r10, %rdx -; CHECK-NEXT: setb %bl ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: adcq %r14, %rbx -; CHECK-NEXT: movq 32(%rdi), %r10 -; CHECK-NEXT: leaq (%r9,%r10), %rcx -; CHECK-NEXT: xorl %esi, %esi -; CHECK-NEXT: cmpq %r14, %rbx -; CHECK-NEXT: setb %sil -; CHECK-NEXT: addq %r11, %r8 -; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: movq %r10, %rcx +; CHECK-NEXT: adcq %r8, %rcx +; CHECK-NEXT: cmpq %rax, %rdx +; CHECK-NEXT: adcq $0, %rcx +; CHECK-NEXT: leaq (%r11,%r9), %rbx +; CHECK-NEXT: addq %r8, %r10 +; CHECK-NEXT: movq %r11, %rsi +; CHECK-NEXT: adcq %r9, %rsi +; CHECK-NEXT: cmpq %r10, %rcx +; CHECK-NEXT: adcq $0, %rsi ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpq %rcx, %rsi +; CHECK-NEXT: cmpq %rbx, %rsi ; CHECK-NEXT: setb %al -; CHECK-NEXT: addq %r10, %r9 +; CHECK-NEXT: addq %r9, %r11 ; CHECK-NEXT: movq %rdx, 16(%rdi) -; CHECK-NEXT: movq %rbx, 24(%rdi) +; CHECK-NEXT: movq %rcx, 24(%rdi) ; CHECK-NEXT: movq %rsi, 32(%rdi) ; CHECK-NEXT: adcl $0, %eax ; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r14 ; CHECK-NEXT: retq %7 = load i64, ptr %0, align 8 %8 = getelementptr inbounds %struct.U320, ptr %0, 
i64 0, i32 0, i64 1 diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -74,8 +74,7 @@ ; X86-LABEL: mask8_zext: ; X86: ## %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: notb %al -; X86-NEXT: movzbl %al, %eax +; X86-NEXT: xorl $255, %eax ; X86-NEXT: retl %m0 = bitcast i8 %x to <8 x i1> %m1 = xor <8 x i1> %m0, diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -1967,16 +1967,16 @@ ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] ; X86-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1] -; X86-NEXT: kmovd %k2, %esi # encoding: [0xc5,0xfb,0x93,0xf2] -; X86-NEXT: adcl %edx, %esi # encoding: [0x11,0xd6] +; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] +; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6] +; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] ; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] -; X86-NEXT: adcl %esi, %edx # encoding: [0x11,0xf2] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] +; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x08] ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x0c] ; X86-NEXT: popl %esi # encoding: [0x5e] @@ -2140,16 +2140,16 @@ ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] ; X86-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1] -; X86-NEXT: kmovd %k2, %esi # encoding: [0xc5,0xfb,0x93,0xf2] -; X86-NEXT: adcl %edx, %esi # encoding: [0x11,0xd6] +; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] +; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6] +; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] ; X86-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x06] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] -; X86-NEXT: adcl %esi, %edx # encoding: [0x11,0xf2] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] +; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x08] ; X86-NEXT: adcl 
{{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x0c] ; X86-NEXT: popl %esi # encoding: [0x5e] diff --git a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll --- a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -2659,7 +2659,8 @@ ; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask: @@ -2667,9 +2668,8 @@ ; NoVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2685,7 +2685,8 @@ ; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem: @@ -2693,9 +2694,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2713,7 +2713,8 @@ ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask: @@ -2722,9 +2723,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2744,7 +2744,8 @@ ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem: @@ -2753,9 +2754,8 @@ ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2776,16 +2776,16 @@ ; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2804,7 +2804,8 @@ ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} -; 
VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b: @@ -2812,9 +2813,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7463,7 +7463,8 @@ ; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask: @@ -7471,9 +7472,8 @@ ; NoVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7489,7 +7489,8 @@ ; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem: @@ -7497,9 +7498,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7517,7 +7517,8 @@ ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask: @@ -7526,9 +7527,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7548,7 +7548,8 @@ ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem: @@ -7557,9 +7558,8 @@ ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7580,16 +7580,16 @@ ; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, 
%eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7608,7 +7608,8 @@ ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b: @@ -7616,9 +7617,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12327,7 +12327,8 @@ ; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpnltq %xmm1, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask: @@ -12335,9 +12336,8 @@ ; NoVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpnltq %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12353,7 +12353,8 @@ ; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem: @@ -12361,9 +12362,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpnltq %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12381,7 +12381,8 @@ ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpnltq %xmm1, %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask: @@ -12390,9 +12391,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpnltq %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12412,7 +12412,8 @@ ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem: @@ -12421,9 +12422,8 @@ ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpnltq %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12444,16 +12444,16 @@ ; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpnltq (%rdi){1to2}, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # 
kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpnltq (%rdi){1to8}, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12472,7 +12472,8 @@ ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpnltq (%rsi){1to2}, %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b: @@ -12480,9 +12481,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpnltq (%rsi){1to8}, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17211,7 +17211,8 @@ ; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask: @@ -17219,9 +17220,8 @@ ; NoVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17237,7 +17237,8 @@ ; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0 -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem: @@ -17245,9 +17246,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17265,7 +17265,8 @@ ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask: @@ -17274,9 +17275,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17296,7 +17296,8 @@ ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem: @@ -17305,9 +17306,8 @@ ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17328,16 +17328,16 @@ ; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vpcmpltuq 
(%rdi){1to2}, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17356,7 +17356,8 @@ ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b: @@ -17364,9 +17365,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21042,7 +21042,8 @@ ; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask: @@ -21050,9 +21051,8 @@ ; NoVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21068,7 +21068,8 @@ ; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem: @@ -21076,9 +21077,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vmovapd (%rdi), %xmm1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21095,16 +21095,16 @@ ; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b: ; VLX: # %bb.0: # %entry ; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21123,7 +21123,8 @@ ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask: @@ -21132,9 +21133,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; 
NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21153,7 +21153,8 @@ ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem: @@ -21162,9 +21163,8 @@ ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %xmm1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21184,7 +21184,8 @@ ; VLX: # %bb.0: # %entry ; VLX-NEXT: kmovd %edi, %k1 ; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1} -; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: andl $3, %eax ; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b: @@ -21192,9 +21193,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll --- a/llvm/test/CodeGen/X86/bitreverse.ll +++ b/llvm/test/CodeGen/X86/bitreverse.ll @@ -14,6 +14,7 @@ ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: rolw $8, %ax +; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: movl %eax, %edx ; X86-NEXT: andl $3855, %edx # imm = 0xF0F ; X86-NEXT: shll $4, %edx @@ -31,6 +32,7 @@ ; X86-NEXT: andl $21845, %eax # imm = 0x5555 ; X86-NEXT: leal (%eax,%edx,2), %eax ; X86-NEXT: rolw $8, %cx +; X86-NEXT: movzwl %cx, %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: andl $3855, %edx # imm = 0xF0F ; X86-NEXT: shll $4, %edx @@ -290,6 +292,7 @@ ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: rolw $8, %ax +; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $3855, %ecx # imm = 0xF0F ; X86-NEXT: shll $4, %ecx @@ -311,19 +314,19 @@ ; ; X64-LABEL: test_bitreverse_i16: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: rolw $8, %di -; X64-NEXT: movl %edi, %eax +; X64-NEXT: movzwl %di, %eax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: andl $3855, %ecx # imm = 0xF0F +; X64-NEXT: shll $4, %ecx +; X64-NEXT: shrl $4, %eax ; X64-NEXT: andl $3855, %eax # imm = 0xF0F -; X64-NEXT: shll $4, %eax -; X64-NEXT: shrl $4, %edi -; X64-NEXT: andl $3855, %edi # imm = 0xF0F -; X64-NEXT: orl %eax, %edi -; X64-NEXT: movl %edi, %eax +; X64-NEXT: orl %ecx, %eax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: andl $13107, %ecx # imm = 0x3333 +; X64-NEXT: shrl $2, %eax ; X64-NEXT: andl $13107, %eax # imm = 0x3333 -; X64-NEXT: shrl $2, %edi -; X64-NEXT: andl $13107, %edi # imm = 0x3333 -; X64-NEXT: leal (%rdi,%rax,4), %eax +; X64-NEXT: leal (%rax,%rcx,4), %eax ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $21845, %ecx # imm = 0x5555 ; X64-NEXT: shrl %eax diff --git a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll --- a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll @@ -574,7 +574,7 @@ define i16 @test_i16_2032_mask_lshr_4(i16 %a0) { ; X86-LABEL: 
test_i16_2032_mask_lshr_4: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $4, %eax ; X86-NEXT: andl $127, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -594,7 +594,7 @@ define i16 @test_i16_2032_mask_lshr_5(i16 %a0) { ; X86-LABEL: test_i16_2032_mask_lshr_5: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $5, %eax ; X86-NEXT: andl $63, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -614,7 +614,7 @@ define i16 @test_i16_2032_mask_lshr_6(i16 %a0) { ; X86-LABEL: test_i16_2032_mask_lshr_6: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $6, %eax ; X86-NEXT: andl $31, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -755,7 +755,7 @@ define i16 @test_i16_2032_mask_ashr_4(i16 %a0) { ; X86-LABEL: test_i16_2032_mask_ashr_4: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $4, %eax ; X86-NEXT: andl $127, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -775,7 +775,7 @@ define i16 @test_i16_2032_mask_ashr_5(i16 %a0) { ; X86-LABEL: test_i16_2032_mask_ashr_5: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $5, %eax ; X86-NEXT: andl $63, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -795,7 +795,7 @@ define i16 @test_i16_2032_mask_ashr_6(i16 %a0) { ; X86-LABEL: test_i16_2032_mask_ashr_6: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $6, %eax ; X86-NEXT: andl $31, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll --- a/llvm/test/CodeGen/X86/dagcombine-cse.ll +++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll @@ -50,6 +50,7 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %edi, %eax @@ -72,37 +73,31 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi -; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %esi ; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %al -; X86-NEXT: movl %edx, %edi -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: adcl $0, %eax +; X86-NEXT: setb %dl ; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: adcl %ecx, %esi -; X86-NEXT: setb %cl -; X86-NEXT: adcl %edx, %edi -; X86-NEXT: addb $255, %cl -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: movzbl %dl, %ecx +; X86-NEXT: adcl %eax, %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: setb %ah -; X86-NEXT: addb $255, %al -; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: movzbl %ah, %ebx -; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %eax -; X86-NEXT: addl %eax, %edi -; X86-NEXT: adcl %edx, %ebx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %edi, %edx -; X86-NEXT: 
movl %ebx, %ecx +; X86-NEXT: addl %eax, %esi +; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %esi, %edx +; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/dagcombine-select.ll b/llvm/test/CodeGen/X86/dagcombine-select.ll --- a/llvm/test/CodeGen/X86/dagcombine-select.ll +++ b/llvm/test/CodeGen/X86/dagcombine-select.ll @@ -194,10 +194,12 @@ define i32 @shl_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: shl_constant_sel_constants: ; CHECK: # %bb.0: -; CHECK-NEXT: notb %dil -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: leal 4(,%rax,4), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shll %cl, %eax ; CHECK-NEXT: retq %sel = select i1 %cond, i32 2, i32 3 %bo = shl i32 1, %sel @@ -207,9 +209,12 @@ define i32 @lshr_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: lshr_constant_sel_constants: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: leal 8(,%rdi,8), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $64, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %eax ; CHECK-NEXT: retq %sel = select i1 %cond, i32 2, i32 3 %bo = lshr i32 64, %sel @@ -219,10 +224,12 @@ define i32 @ashr_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: ashr_constant_sel_constants: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: shll $4, %edi -; CHECK-NEXT: leal 16(%rdi), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $128, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %eax ; CHECK-NEXT: retq %sel = select i1 %cond, i32 2, i32 3 %bo = ashr i32 128, %sel diff --git a/llvm/test/CodeGen/X86/field-extract-use-trunc.ll b/llvm/test/CodeGen/X86/field-extract-use-trunc.ll --- a/llvm/test/CodeGen/X86/field-extract-use-trunc.ll +++ b/llvm/test/CodeGen/X86/field-extract-use-trunc.ll @@ -73,7 +73,7 @@ define i16 @test5(i16 %f12) nounwind { ; i686-LABEL: test5: ; i686: # %bb.0: -; i686-NEXT: movl {{[0-9]+}}(%esp), %eax +; i686-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; i686-NEXT: shrl $6, %eax ; i686-NEXT: movsbl %al, %eax ; i686-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/freeze-unary.ll b/llvm/test/CodeGen/X86/freeze-unary.ll --- a/llvm/test/CodeGen/X86/freeze-unary.ll +++ b/llvm/test/CodeGen/X86/freeze-unary.ll @@ -11,6 +11,7 @@ ; X64-LABEL: freeze_sext: ; X64: # %bb.0: ; X64-NEXT: movsbl %dil, %eax +; X64-NEXT: cwtl ; X64-NEXT: retq %x = sext i8 %a0 to i16 %y = freeze i16 %x diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -32,17 +32,17 @@ ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm0 ; SSSE3-FAST-NEXT: retq ; -; AVX1-SLOW-LABEL: pair_sum_v4f32_v4f32: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 -; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1] -; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1] -; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 -; AVX1-SLOW-NEXT: vblendps 
{{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] -; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX1-SLOW-NEXT: retq +; AVX-SLOW-LABEL: pair_sum_v4f32_v4f32: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1] +; AVX-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 +; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] +; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: pair_sum_v4f32_v4f32: ; AVX-FAST: # %bb.0: @@ -50,18 +50,6 @@ ; AVX-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm1 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: retq -; -; AVX2-SLOW-LABEL: pair_sum_v4f32_v4f32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] -; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: retq %5 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> %6 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> %7 = fadd <2 x float> %5, %6 @@ -126,34 +114,28 @@ ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-SLOW-NEXT: retq ; -; AVX1-FAST-LABEL: pair_sum_v4i32_v4i32: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm1 -; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: retq +; AVX-FAST-LABEL: pair_sum_v4i32_v4i32: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm1 +; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq ; ; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm2 +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: pair_sum_v4i32_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: retq %5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> %6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> %7 = add <2 x i32> %5, %6 @@ -191,15 +173,14 @@ ; 
SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: movaps %xmm5, %xmm1 -; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm1 -; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm2 +; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm5 +; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm2 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6 -; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm4 -; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm4 -; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm1 +; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,1] +; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32: @@ -264,43 +245,43 @@ ; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm8 ; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm0 ; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3] -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1] +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm5, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[3,1] +; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2 ; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm8 ; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1 -; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm0 ; AVX2-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3] -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] -; AVX2-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1 -; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1] +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; 
AVX2-FAST-NEXT: vhaddps %xmm4, %xmm5, %xmm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[3,1] +; AVX2-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2 ; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2] ; AVX2-FAST-NEXT: retq %9 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> %10 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> @@ -440,9 +421,11 @@ ; AVX2-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1 ; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4 ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3] -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -461,9 +444,11 @@ ; AVX2-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm1 ; AVX2-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4 ; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3] -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -751,16 +736,16 @@ ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] -; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm5 -; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] ; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] +; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; 
AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: sequential_sum_v4i32_v4i32: @@ -775,15 +760,15 @@ ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm4 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] +; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: retq %5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> %6 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> @@ -1046,6 +1031,9 @@ ret <4 x float> %12 } +; FIXME: This could be a series of phaddd followed by a shufps +; See https://reviews.llvm.org/D127115?id=436232#inline-1223238 + define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) { ; SSSE3-SLOW-LABEL: reduction_sum_v4i32_v4i32: ; SSSE3-SLOW: # %bb.0: diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll --- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll +++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll @@ -222,11 +222,11 @@ ; X86-LABEL: opt_setcc_expanded_shl_correct_shifts: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $17, %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: shldl $17, %eax, %ecx +; X86-NEXT: orl %ecx, %eax ; X86-NEXT: sete %al ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll --- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll +++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll @@ -13,15 +13,17 @@ define <16 x i8> @elt0_v16i8(i8 %x) { ; X86-SSE2-LABEL: elt0_v16i8: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE2-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X86-SSE2-NEXT: andnps %xmm1, %xmm0 ; X86-SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: elt0_v16i8: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movd %edi, %xmm0 -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE2-NEXT: movd %edi, %xmm1 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X64-SSE2-NEXT: pandn %xmm1, %xmm0 ; X64-SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ;
X64-SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll --- a/llvm/test/CodeGen/X86/insertelement-var-index.ll +++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll @@ -2283,18 +2283,34 @@ ; SSE-NEXT: divl %ecx ; SSE-NEXT: retq ; -; AVX-LABEL: PR44139: -; AVX: # %bb.0: -; AVX-NEXT: movl (%rdi), %eax -; AVX-NEXT: leal 2147483647(%rax), %ecx -; AVX-NEXT: testl %eax, %eax -; AVX-NEXT: cmovnsl %eax, %ecx -; AVX-NEXT: andl $-2147483648, %ecx # imm = 0x80000000 -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: # kill: def $eax killed $eax killed $rax -; AVX-NEXT: xorl %edx, %edx -; AVX-NEXT: divl %ecx -; AVX-NEXT: retq +; AVX1OR2-LABEL: PR44139: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: movl (%rdi), %eax +; AVX1OR2-NEXT: leal 2147483647(%rax), %ecx +; AVX1OR2-NEXT: testl %eax, %eax +; AVX1OR2-NEXT: cmovnsl %eax, %ecx +; AVX1OR2-NEXT: andl $-2147483648, %ecx # imm = 0x80000000 +; AVX1OR2-NEXT: addl %eax, %ecx +; AVX1OR2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX1OR2-NEXT: xorl %edx, %edx +; AVX1OR2-NEXT: divl %ecx +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: PR44139: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: leal 2147483647(%rax), %ecx +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: cmovnsl %eax, %ecx +; AVX512-NEXT: andl $-2147483648, %ecx # imm = 0x80000000 +; AVX512-NEXT: addl %eax, %ecx +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: xorl %edx, %edx +; AVX512-NEXT: divl %ecx +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq ; ; X86AVX2-LABEL: PR44139: ; X86AVX2: # %bb.0: diff --git a/llvm/test/CodeGen/X86/is_fpclass-fp80.ll b/llvm/test/CodeGen/X86/is_fpclass-fp80.ll --- a/llvm/test/CodeGen/X86/is_fpclass-fp80.ll +++ b/llvm/test/CodeGen/X86/is_fpclass-fp80.ll @@ -318,9 +318,8 @@ ; ; CHECK-64-LABEL: is_neginf_f80: ; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-64-NEXT: notl %eax -; CHECK-64-NEXT: movzwl %ax, %eax +; CHECK-64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-64-NEXT: xorq $65535, %rax # imm = 0xFFFF ; CHECK-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 ; CHECK-64-NEXT: xorq {{[0-9]+}}(%rsp), %rcx ; CHECK-64-NEXT: orq %rax, %rcx diff --git a/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll b/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll --- a/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll +++ b/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll @@ -5,10 +5,9 @@ ; CHECK-LABEL: csrot_: ; CHECK: # %bb.0: ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: xorps %xmm0, %xmm1 -; CHECK-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],mem[1,2,3] -; CHECK-NEXT: movlps %xmm1, (%rax) +; CHECK-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] +; CHECK-NEXT: movlps %xmm0, (%rax) ; CHECK-NEXT: retq 1: %2 = load float, ptr %0, align 4 diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -51,7 +51,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -60,7 +60,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: 
vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -133,7 +133,7 @@ ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -145,7 +145,7 @@ ; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -429,7 +429,7 @@ ; SSE-NEXT: packsswb %xmm3, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pmovmskb %xmm2, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -441,7 +441,7 @@ ; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1218,7 +1218,7 @@ ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: psllw $7, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -1228,7 +1228,7 @@ ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1326,7 +1326,7 @@ ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: psllw $7, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -1339,7 +1339,7 @@ ; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1596,7 +1596,7 @@ ; SSE-NEXT: packsswb %xmm3, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pmovmskb %xmm2, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -1612,7 +1612,7 @@ ; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2507,7 +2507,7 @@ ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: psllw $5, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -2517,7 +2517,7 @@ ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2615,7 +2615,7 @@ ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: psllw $5, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -2628,7 +2628,7 @@ ; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb 
%xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2885,7 +2885,7 @@ ; SSE-NEXT: packsswb %xmm3, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pmovmskb %xmm2, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -2901,7 +2901,7 @@ ; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/mulvi32.ll b/llvm/test/CodeGen/X86/mulvi32.ll --- a/llvm/test/CodeGen/X86/mulvi32.ll +++ b/llvm/test/CodeGen/X86/mulvi32.ll @@ -134,31 +134,31 @@ define <4 x i64> @_mul4xi32toi64a(<4 x i32>, <4 x i32>) { ; SSE2-LABEL: _mul4xi32toi64a: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,3,3] ; SSE2-NEXT: pmuludq %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,3,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE42-LABEL: _mul4xi32toi64a: ; SSE42: # %bb.0: -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,3,3] ; SSE42-NEXT: pmuludq %xmm3, %xmm2 -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,3,3] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,3,3] -; SSE42-NEXT: pmuludq %xmm3, %xmm1 -; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: pmuludq %xmm1, %xmm0 +; SSE42-NEXT: movdqa %xmm2, %xmm1 ; SSE42-NEXT: retq ; ; AVX1-LABEL: _mul4xi32toi64a: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] -; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/nontemporal-3.ll b/llvm/test/CodeGen/X86/nontemporal-3.ll --- a/llvm/test/CodeGen/X86/nontemporal-3.ll +++ b/llvm/test/CodeGen/X86/nontemporal-3.ll @@ -494,14 +494,14 @@ ; CHECK-LABEL: test_zero_v8f64_align1: ; CHECK: # %bb.0: ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movntiq %rax, 24(%rdi) -; CHECK-NEXT: movntiq %rax, 16(%rdi) ; CHECK-NEXT: movntiq %rax, 8(%rdi) ; CHECK-NEXT: movntiq %rax, (%rdi) -; CHECK-NEXT: movntiq %rax, 56(%rdi) -; CHECK-NEXT: movntiq %rax, 48(%rdi) +; CHECK-NEXT: movntiq %rax, 24(%rdi) +; CHECK-NEXT: movntiq %rax, 16(%rdi) ; CHECK-NEXT: movntiq %rax, 40(%rdi) ; CHECK-NEXT: movntiq %rax, 32(%rdi) +; CHECK-NEXT: movntiq %rax, 56(%rdi) +; CHECK-NEXT: movntiq %rax, 48(%rdi) ; CHECK-NEXT: retq store <8 x 
double> zeroinitializer, ptr %dst, align 1, !nontemporal !1 ret void @@ -511,67 +511,67 @@ ; SSE2-LABEL: test_zero_v16f32_align1: ; SSE2: # %bb.0: ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 8(%rdi) ; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 40(%rdi) ; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) ; SSE2-NEXT: retq ; ; SSE4A-LABEL: test_zero_v16f32_align1: ; SSE4A: # %bb.0: ; SSE4A-NEXT: xorl %eax, %eax -; SSE4A-NEXT: movntiq %rax, 24(%rdi) ; SSE4A-NEXT: movntiq %rax, 8(%rdi) -; SSE4A-NEXT: movntiq %rax, 56(%rdi) +; SSE4A-NEXT: movntiq %rax, 24(%rdi) ; SSE4A-NEXT: movntiq %rax, 40(%rdi) +; SSE4A-NEXT: movntiq %rax, 56(%rdi) ; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) ; SSE4A-NEXT: retq ; ; SSE41-LABEL: test_zero_v16f32_align1: ; SSE41: # %bb.0: ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 8(%rdi) ; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 40(%rdi) ; SSE41-NEXT: movntiq %rax, 32(%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) ; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v16f32_align1: ; AVX: # %bb.0: ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movntiq %rax, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: movntiq %rax, 56(%rdi) -; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16f32_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) ; AVX512-NEXT: retq store <16 x float> zeroinitializer, ptr %dst, align 1, !nontemporal !1 ret void @@ -581,66 +581,66 @@ ; SSE2-LABEL: test_zero_v8i64_align1: ; SSE2: # %bb.0: ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 8(%rdi) ; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 40(%rdi) ; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) ; SSE2-NEXT: retq ; ; SSE4A-LABEL: 
test_zero_v8i64_align1: ; SSE4A: # %bb.0: ; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) ; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) ; SSE4A-NEXT: retq ; ; SSE41-LABEL: test_zero_v8i64_align1: ; SSE41: # %bb.0: ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 8(%rdi) ; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 40(%rdi) ; SSE41-NEXT: movntiq %rax, 32(%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) ; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v8i64_align1: ; AVX: # %bb.0: ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movntiq %rax, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: movntiq %rax, 56(%rdi) -; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8i64_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) ; AVX512-NEXT: retq store <8 x i64> zeroinitializer, ptr %dst, align 1, !nontemporal !1 ret void @@ -650,66 +650,66 @@ ; SSE2-LABEL: test_zero_v16i32_align1: ; SSE2: # %bb.0: ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 8(%rdi) ; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 40(%rdi) ; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) ; SSE2-NEXT: retq ; ; SSE4A-LABEL: test_zero_v16i32_align1: ; SSE4A: # %bb.0: ; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) ; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) ; SSE4A-NEXT: retq ; ; SSE41-LABEL: test_zero_v16i32_align1: ; SSE41: # %bb.0: ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; 
SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 8(%rdi) ; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 40(%rdi) ; SSE41-NEXT: movntiq %rax, 32(%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) ; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v16i32_align1: ; AVX: # %bb.0: ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movntiq %rax, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: movntiq %rax, 56(%rdi) -; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16i32_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) ; AVX512-NEXT: retq store <16 x i32> zeroinitializer, ptr %dst, align 1, !nontemporal !1 ret void @@ -719,66 +719,66 @@ ; SSE2-LABEL: test_zero_v32i16_align1: ; SSE2: # %bb.0: ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 8(%rdi) ; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 40(%rdi) ; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) ; SSE2-NEXT: retq ; ; SSE4A-LABEL: test_zero_v32i16_align1: ; SSE4A: # %bb.0: ; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) ; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) ; SSE4A-NEXT: retq ; ; SSE41-LABEL: test_zero_v32i16_align1: ; SSE41: # %bb.0: ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 8(%rdi) ; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 40(%rdi) ; SSE41-NEXT: movntiq %rax, 32(%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) ; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v32i16_align1: ; AVX: # %bb.0: ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movntiq %rax, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: movntiq %rax, 
56(%rdi) -; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v32i16_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) ; AVX512-NEXT: retq store <32 x i16> zeroinitializer, ptr %dst, align 1, !nontemporal !1 ret void @@ -788,66 +788,66 @@ ; SSE2-LABEL: test_zero_v64i8_align1: ; SSE2: # %bb.0: ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 8(%rdi) ; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 40(%rdi) ; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) ; SSE2-NEXT: retq ; ; SSE4A-LABEL: test_zero_v64i8_align1: ; SSE4A: # %bb.0: ; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) ; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) ; SSE4A-NEXT: retq ; ; SSE41-LABEL: test_zero_v64i8_align1: ; SSE41: # %bb.0: ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 8(%rdi) ; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 40(%rdi) ; SSE41-NEXT: movntiq %rax, 32(%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) ; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v64i8_align1: ; AVX: # %bb.0: ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movntiq %rax, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: movntiq %rax, 56(%rdi) -; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v64i8_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; 
AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) ; AVX512-NEXT: retq store <64 x i8> zeroinitializer, ptr %dst, align 1, !nontemporal !1 ret void diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -322,43 +322,41 @@ define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: and_mulhuw_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [32767,32767,32767,32767] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767] +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: pmulhw %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm8, %xmm7 ; SSE2-NEXT: pand %xmm6, %xmm8 ; SSE2-NEXT: packssdw %xmm7, %xmm8 ; SSE2-NEXT: pmulhw %xmm2, %xmm8 -; SSE2-NEXT: pand %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: packssdw %xmm5, %xmm6 -; SSE2-NEXT: pmulhw %xmm6, %xmm0 ; SSE2-NEXT: movdqa %xmm8, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: and_mulhuw_v16i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm6, %xmm8 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [32767,32767,32767,32767] -; SSE41-NEXT: pand %xmm6, %xmm3 -; SSE41-NEXT: pand %xmm6, %xmm2 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: pand %xmm6, %xmm1 -; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767] +; SSE41-NEXT: pand %xmm8, %xmm1 +; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: pand %xmm6, %xmm7 +; SSE41-NEXT: pand %xmm8, %xmm3 +; SSE41-NEXT: pand %xmm8, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pand %xmm8, %xmm5 +; SSE41-NEXT: pand %xmm8, %xmm4 +; SSE41-NEXT: packusdw %xmm5, %xmm4 +; SSE41-NEXT: pmulhw %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm8, %xmm7 ; SSE41-NEXT: pand %xmm6, %xmm8 ; SSE41-NEXT: packusdw %xmm7, %xmm8 ; SSE41-NEXT: pmulhw %xmm2, %xmm8 -; SSE41-NEXT: pand %xmm6, %xmm5 -; SSE41-NEXT: pand %xmm4, %xmm6 -; SSE41-NEXT: packusdw %xmm5, %xmm6 -; SSE41-NEXT: pmulhw %xmm6, %xmm0 ; SSE41-NEXT: movdqa %xmm8, %xmm1 ; SSE41-NEXT: retq ; @@ -423,13 +421,6 @@ define <16 x i16> @ashr_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: ashr_mulhuw_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: packssdw %xmm5, %xmm4 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: pmulhw %xmm4, %xmm0 ; SSE2-NEXT: psrad $16, %xmm7 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: packssdw %xmm7, %xmm6 @@ -437,25 +428,32 @@ ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: packssdw %xmm3, %xmm2 ; SSE2-NEXT: pmulhw %xmm6, %xmm2 +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pmulhw %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: ashr_mulhuw_v16i16: ; SSE41: # 
%bb.0: -; SSE41-NEXT: psrld $16, %xmm3 -; SSE41-NEXT: psrld $16, %xmm2 -; SSE41-NEXT: packusdw %xmm3, %xmm2 ; SSE41-NEXT: psrld $16, %xmm1 ; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm7 -; SSE41-NEXT: psrld $16, %xmm6 -; SSE41-NEXT: packusdw %xmm7, %xmm6 -; SSE41-NEXT: pmulhw %xmm2, %xmm6 +; SSE41-NEXT: psrld $16, %xmm3 +; SSE41-NEXT: psrld $16, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 ; SSE41-NEXT: psrld $16, %xmm5 ; SSE41-NEXT: psrld $16, %xmm4 ; SSE41-NEXT: packusdw %xmm5, %xmm4 ; SSE41-NEXT: pmulhw %xmm4, %xmm0 +; SSE41-NEXT: psrld $16, %xmm7 +; SSE41-NEXT: psrld $16, %xmm6 +; SSE41-NEXT: packusdw %xmm7, %xmm6 +; SSE41-NEXT: pmulhw %xmm2, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm1 ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll --- a/llvm/test/CodeGen/X86/popcnt.ll +++ b/llvm/test/CodeGen/X86/popcnt.ll @@ -62,7 +62,7 @@ define i16 @cnt16(i16 %x) nounwind readnone { ; X86-LABEL: cnt16: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: shrl %ecx ; X86-NEXT: andl $21845, %ecx # imm = 0x5555 @@ -1525,7 +1525,7 @@ define i32 @popcount_i16_zext(i16 zeroext %x) { ; X86-LABEL: popcount_i16_zext: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: shrl %ecx ; X86-NEXT: andl $21845, %ecx # imm = 0x5555 diff --git a/llvm/test/CodeGen/X86/pr53419.ll b/llvm/test/CodeGen/X86/pr53419.ll --- a/llvm/test/CodeGen/X86/pr53419.ll +++ b/llvm/test/CodeGen/X86/pr53419.ll @@ -60,21 +60,33 @@ } define i1 @intrinsic_v8i8(ptr align 1 %arg, ptr align 1 %arg1) { -; SSE-LABEL: intrinsic_v8i8: -; SSE: # %bb.0: # %bb -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE-NEXT: pmovmskb %xmm1, %eax -; SSE-NEXT: cmpb $-1, %al -; SSE-NEXT: sete %al -; SSE-NEXT: retq +; SSE2-LABEL: intrinsic_v8i8: +; SSE2: # %bb.0: # %bb +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: cmpb $-1, %al +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE42-LABEL: intrinsic_v8i8: +; SSE42: # %bb.0: # %bb +; SSE42-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; SSE42-NEXT: pmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; SSE42-NEXT: pcmpeqw %xmm0, %xmm1 +; SSE42-NEXT: packsswb %xmm1, %xmm1 +; SSE42-NEXT: pmovmskb %xmm1, %eax +; SSE42-NEXT: cmpb $-1, %al +; SSE42-NEXT: sete %al +; SSE42-NEXT: retq ; ; AVX-LABEL: intrinsic_v8i8: ; AVX: # %bb.0: # %bb -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpmovmskb %xmm0, %eax ; AVX-NEXT: cmpb $-1, %al ; AVX-NEXT: sete %al @@ -84,9 +96,10 @@ ; X86: # %bb.0: # %bb ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; X86-NEXT: vpmovmskb %xmm0, %eax ; X86-NEXT: cmpb $-1, %al ; X86-NEXT: sete %al @@ -226,3 +239,5 @@ %all_eq = icmp eq i32 %lhs, %rhs ret i1 %all_eq } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; SSE: {{.*}} diff --git a/llvm/test/CodeGen/X86/promote-vec3.ll b/llvm/test/CodeGen/X86/promote-vec3.ll --- a/llvm/test/CodeGen/X86/promote-vec3.ll +++ b/llvm/test/CodeGen/X86/promote-vec3.ll @@ -42,13 +42,13 @@ ; ; AVX-64-LABEL: zext_i8: ; AVX-64: # %bb.0: -; AVX-64-NEXT: movzbl %sil, %esi +; AVX-64-NEXT: movzbl %dl, %ecx +; AVX-64-NEXT: movzbl %sil, %edx ; AVX-64-NEXT: vmovd %edi, %xmm0 ; AVX-64-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-64-NEXT: movzbl %dl, %ecx ; AVX-64-NEXT: vmovd %xmm0, %eax ; AVX-64-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-64-NEXT: movl %esi, %edx +; AVX-64-NEXT: # kill: def $dx killed $dx killed $edx ; AVX-64-NEXT: # kill: def $cx killed $cx killed $ecx ; AVX-64-NEXT: retq %2 = zext <3 x i8> %0 to <3 x i16> diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -1794,58 +1794,58 @@ ; SSE2OR3-LABEL: psubus_16i32_max: ; SSE2OR3: # %bb.0: # %vector.ph ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] -; SSE2OR3-NEXT: movdqa %xmm3, %xmm8 +; SSE2OR3-NEXT: movdqa %xmm5, %xmm8 ; SSE2OR3-NEXT: pxor %xmm9, %xmm8 -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183,2147549183,2147549183] -; SSE2OR3-NEXT: movdqa %xmm10, %xmm6 +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183,2147549183,2147549183] +; SSE2OR3-NEXT: movdqa %xmm7, %xmm6 ; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm6 ; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8 +; SSE2OR3-NEXT: pand %xmm6, %xmm5 +; SSE2OR3-NEXT: pxor %xmm8, %xmm6 +; SSE2OR3-NEXT: por %xmm5, %xmm6 +; SSE2OR3-NEXT: pslld $16, %xmm6 +; SSE2OR3-NEXT: psrad $16, %xmm6 +; SSE2OR3-NEXT: movdqa %xmm4, %xmm10 +; SSE2OR3-NEXT: pxor %xmm9, %xmm10 +; SSE2OR3-NEXT: movdqa %xmm7, %xmm5 +; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm5 +; SSE2OR3-NEXT: pand %xmm5, %xmm4 +; SSE2OR3-NEXT: pxor %xmm8, %xmm5 +; SSE2OR3-NEXT: por %xmm4, %xmm5 +; SSE2OR3-NEXT: pslld $16, %xmm5 +; SSE2OR3-NEXT: psrad $16, %xmm5 +; SSE2OR3-NEXT: packssdw %xmm6, %xmm5 +; SSE2OR3-NEXT: movdqa %xmm3, %xmm4 +; SSE2OR3-NEXT: pxor %xmm9, %xmm4 +; SSE2OR3-NEXT: movdqa %xmm7, %xmm6 +; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm6 ; SSE2OR3-NEXT: pand %xmm6, %xmm3 ; SSE2OR3-NEXT: pxor %xmm8, %xmm6 ; SSE2OR3-NEXT: por %xmm3, %xmm6 ; SSE2OR3-NEXT: pslld $16, %xmm6 ; SSE2OR3-NEXT: psrad $16, %xmm6 -; SSE2OR3-NEXT: movdqa %xmm2, %xmm3 -; SSE2OR3-NEXT: pxor %xmm9, %xmm3 -; SSE2OR3-NEXT: movdqa %xmm10, %xmm7 -; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm7 -; SSE2OR3-NEXT: pand %xmm7, %xmm2 -; SSE2OR3-NEXT: pxor %xmm8, %xmm7 -; SSE2OR3-NEXT: por %xmm2, %xmm7 +; SSE2OR3-NEXT: pxor %xmm2, %xmm9 +; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm7 
+; SSE2OR3-NEXT: pxor %xmm7, %xmm8 +; SSE2OR3-NEXT: pand %xmm2, %xmm7 +; SSE2OR3-NEXT: por %xmm8, %xmm7 ; SSE2OR3-NEXT: pslld $16, %xmm7 ; SSE2OR3-NEXT: psrad $16, %xmm7 ; SSE2OR3-NEXT: packssdw %xmm6, %xmm7 ; SSE2OR3-NEXT: psubusw %xmm7, %xmm0 -; SSE2OR3-NEXT: movdqa %xmm5, %xmm2 -; SSE2OR3-NEXT: pxor %xmm9, %xmm2 -; SSE2OR3-NEXT: movdqa %xmm10, %xmm3 -; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2OR3-NEXT: pand %xmm3, %xmm5 -; SSE2OR3-NEXT: pxor %xmm8, %xmm3 -; SSE2OR3-NEXT: por %xmm5, %xmm3 -; SSE2OR3-NEXT: pslld $16, %xmm3 -; SSE2OR3-NEXT: psrad $16, %xmm3 -; SSE2OR3-NEXT: pxor %xmm4, %xmm9 -; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10 -; SSE2OR3-NEXT: pxor %xmm10, %xmm8 -; SSE2OR3-NEXT: pand %xmm4, %xmm10 -; SSE2OR3-NEXT: por %xmm8, %xmm10 -; SSE2OR3-NEXT: pslld $16, %xmm10 -; SSE2OR3-NEXT: psrad $16, %xmm10 -; SSE2OR3-NEXT: packssdw %xmm3, %xmm10 -; SSE2OR3-NEXT: psubusw %xmm10, %xmm1 +; SSE2OR3-NEXT: psubusw %xmm5, %xmm1 ; SSE2OR3-NEXT: retq ; ; SSE41-LABEL: psubus_16i32_max: ; SSE41: # %bb.0: # %vector.ph ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535] +; SSE41-NEXT: pminud %xmm6, %xmm5 +; SSE41-NEXT: pminud %xmm6, %xmm4 +; SSE41-NEXT: packusdw %xmm5, %xmm4 ; SSE41-NEXT: pminud %xmm6, %xmm3 ; SSE41-NEXT: pminud %xmm6, %xmm2 ; SSE41-NEXT: packusdw %xmm3, %xmm2 ; SSE41-NEXT: psubusw %xmm2, %xmm0 -; SSE41-NEXT: pminud %xmm6, %xmm5 -; SSE41-NEXT: pminud %xmm6, %xmm4 -; SSE41-NEXT: packusdw %xmm5, %xmm4 ; SSE41-NEXT: psubusw %xmm4, %xmm1 ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/shift-mask.ll b/llvm/test/CodeGen/X86/shift-mask.ll --- a/llvm/test/CodeGen/X86/shift-mask.ll +++ b/llvm/test/CodeGen/X86/shift-mask.ll @@ -142,9 +142,9 @@ define i16 @test_i16_shl_lshr_2(i16 %a0) { ; X86-LABEL: test_i16_shl_lshr_2: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $2, %eax -; X86-NEXT: andl $16376, %eax # imm = 0x3FF8 +; X86-NEXT: andl $-8, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -411,7 +411,7 @@ define i16 @test_i16_lshr_lshr_1(i16 %a0) { ; X86-LABEL: test_i16_lshr_lshr_1: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $2, %eax ; X86-NEXT: andl $2047, %eax # imm = 0x7FF ; X86-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll @@ -463,8 +463,8 @@ ; SSE2-LABEL: shuffle_v16i8_to_v2i8_2: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rsi) @@ -541,8 +541,8 @@ ; SSE2-LABEL: shuffle_v16i8_to_v2i8_4: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rsi) @@ -619,8 +619,8 @@ ; SSE2-LABEL: shuffle_v16i8_to_v2i8_6: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} 
xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rsi) diff --git a/llvm/test/CodeGen/X86/v8i1-masks.ll b/llvm/test/CodeGen/X86/v8i1-masks.ll --- a/llvm/test/CodeGen/X86/v8i1-masks.ll +++ b/llvm/test/CodeGen/X86/v8i1-masks.ll @@ -149,14 +149,18 @@ ; X32-AVX2: ## %bb.0: ; X32-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; X32-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] +; X32-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; X32-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X32-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: and_mask_constant: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] +; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: retq %m = icmp eq <8 x i32> %v0, zeroinitializer %mand = and <8 x i1> %m, diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -99,19 +99,19 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; SSE-LABEL: test_bitreverse_i16: ; SSE: # %bb.0: -; SSE-NEXT: # kill: def $edi killed $edi def $rdi ; SSE-NEXT: rolw $8, %di -; SSE-NEXT: movl %edi, %eax +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: andl $3855, %ecx # imm = 0xF0F +; SSE-NEXT: shll $4, %ecx +; SSE-NEXT: shrl $4, %eax ; SSE-NEXT: andl $3855, %eax # imm = 0xF0F -; SSE-NEXT: shll $4, %eax -; SSE-NEXT: shrl $4, %edi -; SSE-NEXT: andl $3855, %edi # imm = 0xF0F -; SSE-NEXT: orl %eax, %edi -; SSE-NEXT: movl %edi, %eax +; SSE-NEXT: orl %ecx, %eax +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: andl $13107, %ecx # imm = 0x3333 +; SSE-NEXT: shrl $2, %eax ; SSE-NEXT: andl $13107, %eax # imm = 0x3333 -; SSE-NEXT: shrl $2, %edi -; SSE-NEXT: andl $13107, %edi # imm = 0x3333 -; SSE-NEXT: leal (%rdi,%rax,4), %eax +; SSE-NEXT: leal (%rax,%rcx,4), %eax ; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: andl $21845, %ecx # imm = 0x5555 ; SSE-NEXT: shrl %eax @@ -122,19 +122,19 @@ ; ; AVX-LABEL: test_bitreverse_i16: ; AVX: # %bb.0: -; AVX-NEXT: # kill: def $edi killed $edi def $rdi ; AVX-NEXT: rolw $8, %di -; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: andl $3855, %ecx # imm = 0xF0F +; AVX-NEXT: shll $4, %ecx +; AVX-NEXT: shrl $4, %eax ; AVX-NEXT: andl $3855, %eax # imm = 0xF0F -; AVX-NEXT: shll $4, %eax -; AVX-NEXT: shrl $4, %edi -; AVX-NEXT: andl $3855, %edi # imm = 0xF0F -; AVX-NEXT: orl %eax, %edi -; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: orl %ecx, %eax +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: andl $13107, %ecx # imm = 0x3333 +; AVX-NEXT: shrl $2, %eax ; AVX-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NEXT: shrl $2, %edi -; AVX-NEXT: andl $13107, %edi # imm = 0x3333 -; AVX-NEXT: leal (%rdi,%rax,4), %eax +; AVX-NEXT: leal (%rax,%rcx,4), %eax ; AVX-NEXT: movl %eax, %ecx ; AVX-NEXT: andl $21845, %ecx # imm = 0x5555 ; AVX-NEXT: shrl %eax @@ -153,19 +153,19 @@ ; ; GFNISSE-LABEL: 
test_bitreverse_i16: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: # kill: def $edi killed $edi def $rdi ; GFNISSE-NEXT: rolw $8, %di -; GFNISSE-NEXT: movl %edi, %eax +; GFNISSE-NEXT: movzwl %di, %eax +; GFNISSE-NEXT: movl %eax, %ecx +; GFNISSE-NEXT: andl $3855, %ecx # imm = 0xF0F +; GFNISSE-NEXT: shll $4, %ecx +; GFNISSE-NEXT: shrl $4, %eax ; GFNISSE-NEXT: andl $3855, %eax # imm = 0xF0F -; GFNISSE-NEXT: shll $4, %eax -; GFNISSE-NEXT: shrl $4, %edi -; GFNISSE-NEXT: andl $3855, %edi # imm = 0xF0F -; GFNISSE-NEXT: orl %eax, %edi -; GFNISSE-NEXT: movl %edi, %eax +; GFNISSE-NEXT: orl %ecx, %eax +; GFNISSE-NEXT: movl %eax, %ecx +; GFNISSE-NEXT: andl $13107, %ecx # imm = 0x3333 +; GFNISSE-NEXT: shrl $2, %eax ; GFNISSE-NEXT: andl $13107, %eax # imm = 0x3333 -; GFNISSE-NEXT: shrl $2, %edi -; GFNISSE-NEXT: andl $13107, %edi # imm = 0x3333 -; GFNISSE-NEXT: leal (%rdi,%rax,4), %eax +; GFNISSE-NEXT: leal (%rax,%rcx,4), %eax ; GFNISSE-NEXT: movl %eax, %ecx ; GFNISSE-NEXT: andl $21845, %ecx # imm = 0x5555 ; GFNISSE-NEXT: shrl %eax @@ -176,19 +176,19 @@ ; ; GFNIAVX-LABEL: test_bitreverse_i16: ; GFNIAVX: # %bb.0: -; GFNIAVX-NEXT: # kill: def $edi killed $edi def $rdi ; GFNIAVX-NEXT: rolw $8, %di -; GFNIAVX-NEXT: movl %edi, %eax +; GFNIAVX-NEXT: movzwl %di, %eax +; GFNIAVX-NEXT: movl %eax, %ecx +; GFNIAVX-NEXT: andl $3855, %ecx # imm = 0xF0F +; GFNIAVX-NEXT: shll $4, %ecx +; GFNIAVX-NEXT: shrl $4, %eax ; GFNIAVX-NEXT: andl $3855, %eax # imm = 0xF0F -; GFNIAVX-NEXT: shll $4, %eax -; GFNIAVX-NEXT: shrl $4, %edi -; GFNIAVX-NEXT: andl $3855, %edi # imm = 0xF0F -; GFNIAVX-NEXT: orl %eax, %edi -; GFNIAVX-NEXT: movl %edi, %eax +; GFNIAVX-NEXT: orl %ecx, %eax +; GFNIAVX-NEXT: movl %eax, %ecx +; GFNIAVX-NEXT: andl $13107, %ecx # imm = 0x3333 +; GFNIAVX-NEXT: shrl $2, %eax ; GFNIAVX-NEXT: andl $13107, %eax # imm = 0x3333 -; GFNIAVX-NEXT: shrl $2, %edi -; GFNIAVX-NEXT: andl $13107, %edi # imm = 0x3333 -; GFNIAVX-NEXT: leal (%rdi,%rax,4), %eax +; GFNIAVX-NEXT: leal (%rax,%rcx,4), %eax ; GFNIAVX-NEXT: movl %eax, %ecx ; GFNIAVX-NEXT: andl $21845, %ecx # imm = 0x5555 ; GFNIAVX-NEXT: shrl %eax diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -999,7 +999,7 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 @@ -1010,7 +1010,7 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 @@ -1021,7 +1021,7 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 @@ -1032,7 +1032,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 @@ -1052,7 
+1052,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 @@ -1087,7 +1087,7 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll --- a/llvm/test/CodeGen/X86/vector-fshl-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -580,7 +580,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 @@ -597,7 +597,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -767,7 +767,7 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX2-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -778,7 +778,7 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -789,7 +789,7 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -800,7 +800,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -811,7 +811,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -330,7 +330,7 @@ ; ; AVX512BW-LABEL: 
splatvar_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3 @@ -341,7 +341,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -1032,7 +1032,7 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1043,7 +1043,7 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1054,7 +1054,7 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1065,7 +1065,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1085,7 +1085,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1121,7 +1121,7 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -582,7 +582,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -600,7 +600,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -801,7 +801,7 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -812,7 +812,7 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -823,7 +823,7 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -834,7 +834,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -845,7 +845,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -328,7 +328,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -339,7 +339,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -650,100 +650,102 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw 
%xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3],ymm5[4],ymm0[5,6,7],ymm5[8],ymm0[9,10,11],ymm5[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm9 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; 
AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0,1,2,3,4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = 
xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi) ; AVX2-SLOW-NEXT: vmovdqa %ymm13, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r8) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -834,84 +836,86 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; 
AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3],ymm5[4],ymm0[5,6,7],ymm5[8],ymm0[9,10,11],ymm5[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] -; 
AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] @@ -919,7 +923,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, (%rdx) ; AVX2-FAST-PERLANE-NEXT: 
vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r8) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -1441,7 +1445,7 @@ ; ; AVX2-SLOW-LABEL: vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $200, %rsp +; AVX2-SLOW-NEXT: subq $216, %rsp ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -1450,12 +1454,16 @@ ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 @@ -1470,12 +1478,18 @@ ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 @@ -1483,55 +1497,51 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-SLOW-NEXT: 
vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; 
AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -1543,67 +1553,67 @@ ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; 
AVX2-SLOW-NEXT: # xmm14 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm11 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm10 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm10 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm13 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm6 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm6 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload @@ -1616,10 +1626,10 @@ ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1633,19 +1643,19 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -1666,12 +1676,12 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm11, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm14, 32(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-SLOW-NEXT: addq $200, %rsp +; AVX2-SLOW-NEXT: addq $216, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1842,7 +1852,7 @@ ; ; 
AVX2-FAST-PERLANE-LABEL: vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $184, %rsp +; AVX2-FAST-PERLANE-NEXT: subq $200, %rsp ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -1851,12 +1861,18 @@ ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 @@ -1871,12 +1887,18 @@ ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 @@ -1884,158 +1906,152 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm11, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm5 
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm15, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 
%xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = 
xmm10[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm14[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} 
xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7] @@ -2051,12 +2067,12 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-PERLANE-NEXT: addq $184, %rsp +; AVX2-FAST-PERLANE-NEXT: addq $200, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -1322,32 +1322,33 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $408, %rsp # imm = 0x198 -; SSE-NEXT: movdqa 304(%rdi), %xmm9 -; SSE-NEXT: movdqa 240(%rdi), %xmm8 -; SSE-NEXT: movdqa 256(%rdi), %xmm12 -; SSE-NEXT: movdqa 288(%rdi), %xmm14 +; SSE-NEXT: subq $424, %rsp # imm = 0x1A8 +; SSE-NEXT: movdqa 304(%rdi), %xmm10 +; SSE-NEXT: movdqa 240(%rdi), %xmm12 +; SSE-NEXT: movdqa 
256(%rdi), %xmm9 +; SSE-NEXT: movdqa 288(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 272(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm13 -; SSE-NEXT: movdqa 96(%rdi), %xmm10 +; SSE-NEXT: movdqa 80(%rdi), %xmm8 +; SSE-NEXT: movdqa 96(%rdi), %xmm7 ; SSE-NEXT: movdqa 128(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,1,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] @@ -1360,40 +1361,44 @@ ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,0,3] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,0,1] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm15 ; SSE-NEXT: andps %xmm7, %xmm3 ; SSE-NEXT: orps %xmm3, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, 
%xmm1 +; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa (%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] @@ -1417,9 +1422,9 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 160(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] @@ -1431,17 +1436,17 @@ ; SSE-NEXT: andps %xmm7, %xmm2 ; SSE-NEXT: orps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -1449,26 +1454,23 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: psllq $48, %xmm2 ; SSE-NEXT: movaps %xmm7, %xmm3 ; SSE-NEXT: andnps %xmm2, %xmm3 ; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: orps %xmm1, %xmm3 ; 
SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,3,2,3] +; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,3,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $232, (%rsp), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -1476,24 +1478,24 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 ; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pand %xmm7, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,3,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -1501,8 +1503,8 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 ; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 @@ -1512,11 +1514,12 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] ; 
SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,3,2,3] ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -1527,111 +1530,112 @@ ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm15[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm15[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,0,1,3] -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: andnps %xmm13, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm15[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm15[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0,1,3] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: andnps 
%xmm8, %xmm1 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0,1,3] -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: andnps %xmm5, %xmm1 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pand 
%xmm5, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm7[3,0] -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: andnps %xmm7, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm9[3,0] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: andnps %xmm9, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,7,4,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm6[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] @@ -1639,143 +1643,145 @@ ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm9 +; SSE-NEXT: 
por %xmm1, %xmm9 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[3,0] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm15, %xmm11 +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: andnps %xmm15, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm15 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,2] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm5[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2] -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm12[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[0,2] -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm13[3,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[2,0],xmm2[3,0] +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm6[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm0[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm13 +; SSE-NEXT: por %xmm6, %xmm13 +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2] +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm14[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm12[0,2] +; SSE-NEXT: movaps %xmm6, %xmm12 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm3[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm10[0,2] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm8[3,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm5[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm7[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm13[0,1,2,3,7,4,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm7[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm4[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,2,2,2,4,5,6,7] -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,4,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,0] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,0] -; SSE-NEXT: por %xmm11, %xmm0 -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm6[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm8, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,4,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm7[2,0] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = 
xmm7[3,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,0] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,0] +; SSE-NEXT: por %xmm11, %xmm3 +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm2[2,0] -; SSE-NEXT: por %xmm5, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -1800,16 +1806,16 @@ ; SSE-NEXT: movaps %xmm1, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movaps %xmm3, 32(%r8) +; SSE-NEXT: movaps %xmm13, 32(%r8) ; SSE-NEXT: movaps %xmm15, (%r8) -; SSE-NEXT: movaps %xmm8, 48(%r8) +; SSE-NEXT: movaps %xmm9, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r8) -; SSE-NEXT: movaps %xmm10, (%r9) -; SSE-NEXT: movaps %xmm12, 32(%r9) +; SSE-NEXT: movaps %xmm5, 32(%r9) +; SSE-NEXT: movaps %xmm3, (%r9) ; SSE-NEXT: movaps %xmm0, 48(%r9) -; SSE-NEXT: movaps %xmm14, 16(%r9) -; SSE-NEXT: addq $408, %rsp # imm = 0x198 +; SSE-NEXT: movaps %xmm6, 16(%r9) +; SSE-NEXT: addq $424, %rsp # imm = 0x1A8 ; SSE-NEXT: retq ; ; AVX1-LABEL: vf32: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -1638,48 +1638,46 @@ ; SSE-NEXT: movdqa 224(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa 80(%rdi), %xmm12 ; SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm13 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: 
movdqa 48(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm7[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm12[2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm7 +; SSE-NEXT: pslld $16, %xmm12 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,3,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,1,0,2,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 192(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1687,177 +1685,178 @@ ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: movdqa 256(%rdi), %xmm4 ; SSE-NEXT: movdqa 272(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 
%xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[3,0] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa 112(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa 112(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa 160(%rdi), %xmm5 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[3,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[3,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[2,3] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm3 +; SSE-NEXT: pslld $16, %xmm2 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: movdqa 144(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: movdqa 144(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa 304(%rdi), %xmm15 -; SSE-NEXT: movdqa 288(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa 352(%rdi), %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa 304(%rdi), %xmm12 +; SSE-NEXT: movdqa 288(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa 352(%rdi), %xmm0 ; SSE-NEXT: movdqa 368(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[3,0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,3] ; SSE-NEXT: movaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm9 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm11[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm11[2,3] ; SSE-NEXT: pslld $16, %xmm11 ; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] ; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,1,0,2,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm7[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm7 ; SSE-NEXT: psrld $16, %xmm7 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: pandn %xmm9, %xmm7 -; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: pand %xmm8, %xmm11 ; SSE-NEXT: por %xmm7, %xmm11 -; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm7[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm4[2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[0,1,1,3,4,5,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm7[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm6[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm7 ; SSE-NEXT: psrld $16, %xmm7 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps 
{{.*#+}} xmm2 = xmm2[3,1],xmm7[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm7[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm4, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm3[2,0] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm13 -; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm6[2,0] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pandn %xmm14, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm8[2,0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm7, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; 
SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm13[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm13[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] @@ -1865,157 +1864,157 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm11[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm11[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm5 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm5[0] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm8, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm14[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm14[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $132, (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps $132, (%rsp), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,6,6,7] 
+; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm15[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm15[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm10[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,2,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm11, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,1,1] +; 
SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[0,1],mem[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm15[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0,1],mem[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm7 ; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm4[0] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm7, %xmm4 +; SSE-NEXT: psrlq $48, %xmm12 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm12[0] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm7, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm5 
-; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm5 ; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm8[0] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: psrlq $48, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm9[0] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm15[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm4, %xmm11 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm14 ; SSE-NEXT: psrlq $48, %xmm3 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,2,3,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm2 @@ -2024,48 +2023,47 @@ ; SSE-NEXT: pshufhw $231, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,4,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,0,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: andps %xmm11, %xmm1 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm12[1] ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: andps %xmm10, %xmm3 -; SSE-NEXT: por %xmm3, %xmm9 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 
= xmm2[1],mem[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: andps %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: andps %xmm11, %xmm3 +; SSE-NEXT: por %xmm3, %xmm7 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -2082,10 +2080,10 @@ ; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: andps %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: andps %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -2095,69 +2093,69 @@ ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1],mem[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm14[1] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: andps %xmm10, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: andps %xmm11, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: psrld $16, %xmm3 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm6[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm6[0],xmm2[1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: 
andps %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: andps %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm7 +; SSE-NEXT: psrld $16, %xmm6 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm7[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm15[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: andps %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: pandn %xmm6, %xmm14 +; SSE-NEXT: andps %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm6 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm7[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm6[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; 
SSE-NEXT: pandn %xmm8, %xmm7 -; SSE-NEXT: andps %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: andps %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -2170,12 +2168,13 @@ ; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm6[1] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm8[0],xmm2[1,2,3] -; SSE-NEXT: andps %xmm10, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: andps %xmm11, %xmm2 +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] -; SSE-NEXT: pandn %xmm6, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm11 +; SSE-NEXT: por %xmm2, %xmm11 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -2186,34 +2185,35 @@ ; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movaps %xmm13, 16(%rdx) +; SSE-NEXT: movaps %xmm15, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rcx) ; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm14, 48(%r8) -; SSE-NEXT: movdqa %xmm11, 16(%r8) +; SSE-NEXT: movdqa %xmm10, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movdqa %xmm3, 48(%r9) -; SSE-NEXT: movdqa %xmm4, 16(%r9) -; SSE-NEXT: movdqa %xmm5, 32(%r9) -; SSE-NEXT: movdqa %xmm9, (%r9) +; SSE-NEXT: movdqa %xmm4, 48(%r9) +; SSE-NEXT: movdqa %xmm5, 16(%r9) +; SSE-NEXT: movdqa %xmm7, (%r9) +; SSE-NEXT: movdqa %xmm9, 32(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm10, 48(%rax) -; SSE-NEXT: movdqa %xmm7, 16(%rax) -; SSE-NEXT: movdqa %xmm15, 32(%rax) -; SSE-NEXT: movdqa %xmm1, (%rax) +; SSE-NEXT: movdqa %xmm11, 48(%rax) +; SSE-NEXT: movdqa %xmm1, 16(%rax) +; SSE-NEXT: movdqa %xmm14, 32(%rax) +; SSE-NEXT: movdqa %xmm3, (%rax) ; SSE-NEXT: addq $456, %rsp # imm 
= 0x1C8 ; SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -107,23 +107,23 @@ ; ; AVX1-LABEL: load_i32_stride4_vf4: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-NEXT: vmovaps (%rdi), %xmm3 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm4 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,0] +; AVX1-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX1-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-NEXT: vmovaps 48(%rdi), %xmm3 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX1-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm3[1],xmm4[1],zero,zero -; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm0[2],xmm1[2] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm4[3,0],xmm3[3,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[2,3] -; AVX1-NEXT: vmovaps %xmm2, (%rsi) +; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero +; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm2[2],xmm3[2] +; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; AVX1-NEXT: vmovaps %xmm4, (%rsi) ; AVX1-NEXT: vmovaps %xmm5, (%rdx) ; AVX1-NEXT: vmovaps %xmm6, (%rcx) ; AVX1-NEXT: vmovaps %xmm0, (%r8) @@ -247,33 +247,33 @@ ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1] ; AVX1-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[4],ymm0[4],ymm4[5],ymm0[5] ; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,1],ymm3[2,0],ymm5[4,5],ymm3[6,4] -; AVX1-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX1-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm6[0],xmm5[0] -; AVX1-NEXT: vmovaps (%rdi), %xmm3 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm7 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; AVX1-NEXT: vmovaps (%rdi), %xmm5 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm6 +; AVX1-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX1-NEXT: vmovaps 48(%rdi), %xmm3 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm3[0],xmm7[0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,0] ; AVX1-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX1-NEXT: vunpcklps {{.*#+}} ymm8 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[4],ymm1[4],ymm10[5],ymm1[5] ; AVX1-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,0],ymm4[1,0],ymm0[5,4],ymm4[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm9[2,0],ymm8[2,3],ymm9[6,4],ymm8[6,7] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm9 = 
xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[1],xmm7[1],zero,zero -; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm9 = xmm5[1],xmm6[1],zero,zero +; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] ; AVX1-NEXT: vunpckhps {{.*#+}} ymm9 = ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[6],ymm0[6],ymm4[7],ymm0[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,0],ymm9[4,5],ymm8[6,4] -; AVX1-NEXT: vinsertps {{.*#+}} xmm9 = zero,zero,xmm5[2],xmm6[2] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm9 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm7[2],xmm3[2] +; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX1-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[6],ymm1[6],ymm10[7],ymm1[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm4[3,0],ymm0[7,4],ymm4[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,3],ymm0[6,4],ymm1[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm7[3,0],xmm3[3,0] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm6[3,0],xmm5[3,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,0],xmm1[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovaps %ymm11, (%rsi) @@ -479,115 +479,110 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: subq $312, %rsp # imm = 0x138 ; AVX1-NEXT: vmovaps 64(%rdi), %ymm5 -; AVX1-NEXT: vmovaps 96(%rdi), %ymm8 -; AVX1-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX1-NEXT: vmovaps 224(%rdi), %ymm15 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm15[2,3,0,1] -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] -; AVX1-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[4],ymm2[4],ymm7[5],ymm2[5] -; AVX1-NEXT: vmovaps %ymm2, %ymm4 -; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovaps 96(%rdi), %ymm9 +; AVX1-NEXT: vmovaps 192(%rdi), %ymm12 +; AVX1-NEXT: vmovaps 224(%rdi), %ymm11 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm11[2,3,0,1] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[2],ymm13[2] +; AVX1-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm12[2,3,0,1] +; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm12[0],ymm7[1],ymm12[1],ymm7[4],ymm12[4],ymm7[5],ymm12[5] ; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 176(%rdi), %xmm0 -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0] -; AVX1-NEXT: vmovaps 144(%rdi), %xmm0 -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 176(%rdi), %xmm8 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = 
xmm8[0],xmm1[0] +; AVX1-NEXT: vmovaps 144(%rdi), %xmm4 ; AVX1-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,0] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm10[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3,0,1] +; AVX1-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,0,1] ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] ; AVX1-NEXT: vshufps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vmovaps %xmm1, %xmm14 -; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps (%rdi), %xmm12 +; AVX1-NEXT: vmovaps (%rdi), %xmm1 ; AVX1-NEXT: vmovaps 16(%rdi), %xmm6 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm10 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] +; AVX1-NEXT: vmovaps 32(%rdi), %xmm15 +; AVX1-NEXT: vmovaps 48(%rdi), %xmm14 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm15[0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm10 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; AVX1-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm1, %xmm5 +; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,0] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm15[0],ymm11[1],ymm15[1],ymm11[4],ymm15[4],ymm11[5],ymm15[5] -; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm7[1,0],ymm4[5,4],ymm7[5,4] -; AVX1-NEXT: vmovaps %ymm7, %ymm11 +; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5] +; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,0],ymm7[1,0],ymm12[5,4],ymm7[5,4] +; AVX1-NEXT: vmovaps %ymm7, %ymm9 ; AVX1-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm4[1],zero,zero -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vunpcklps {{.*#+}} xmm13 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm13 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] 
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm3[0],ymm8[0],ymm3[1],ymm8[1],ymm3[4],ymm8[4],ymm3[5],ymm8[5] -; AVX1-NEXT: vmovaps %ymm5, %ymm2 +; AVX1-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX1-NEXT: # ymm0 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm1[1,0],ymm5[5,4],ymm1[5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,0],ymm1[1,0],ymm2[5,4],ymm1[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] -; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm12[1],xmm6[1],zero,zero -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[1],xmm6[1],zero,zero +; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm14[2],ymm11[3],ymm14[3],ymm11[6],ymm14[6],ymm11[7],ymm14[7] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] +; AVX1-NEXT: vunpckhps {{.*#+}} ymm6 = ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[6],ymm12[6],ymm9[7],ymm12[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,0],ymm6[4,5],ymm0[6,4] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; AVX1-NEXT: vmovaps %xmm9, %xmm10 +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX1-NEXT: vmovaps %xmm8, %xmm10 ; AVX1-NEXT: vmovaps %xmm7, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm7[2],xmm9[2] +; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm7[2],xmm8[2] ; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm3[1],ymm8[3],ymm3[3] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] ; AVX1-NEXT: vunpckhps {{.*#+}} ymm7 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX1-NEXT: vmovaps %ymm2, %ymm9 ; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm1[2],xmm5[2] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm3[2],xmm12[3],xmm3[3] -; AVX1-NEXT: vblendps 
{{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm15[2],xmm14[2] +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} ymm6 = ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[6],ymm15[6],ymm13[7],ymm15[7] -; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload -; AVX1-NEXT: # ymm2 = ymm14[3,0],mem[3,0],ymm14[7,4],mem[7,4] +; AVX1-NEXT: vunpckhps {{.*#+}} ymm6 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] +; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX1-NEXT: # ymm2 = ymm12[3,0],mem[3,0],ymm12[7,4],mem[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm6[2,3],ymm2[6,4],ymm6[6,7] ; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,0],xmm11[3,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,0],xmm9[3,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,0],xmm4[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7] -; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload -; AVX1-NEXT: # ymm6 = ymm9[3,0],mem[3,0],ymm9[7,4],mem[7,4] +; AVX1-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm5[3,0],ymm1[3,0],ymm5[7,4],ymm1[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,0],ymm4[2,3],ymm6[6,4],ymm4[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm5 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,0],xmm12[3,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,3] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,0],xmm8[3,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,0],xmm1[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm3, 32(%rsi) @@ -1054,11 +1049,10 @@ ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] ; AVX1-NEXT: vmovaps %ymm1, %ymm6 ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps %ymm3, %ymm5 +; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-NEXT: vmovaps 288(%rdi), %xmm2 ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1073,24 +1067,23 @@ ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX1-NEXT: 
vmovaps 192(%rdi), %ymm9 ; AVX1-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1] ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX1-NEXT: vmovaps %ymm3, %ymm13 +; AVX1-NEXT: vmovaps %ymm3, %ymm15 ; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps %ymm1, %ymm12 +; AVX1-NEXT: vmovaps %ymm1, %ymm13 ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3,0,1] -; AVX1-NEXT: vunpcklps {{.*#+}} ymm3 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[2,3,0,1] +; AVX1-NEXT: vunpcklps {{.*#+}} ymm3 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[4],ymm9[4],ymm12[5],ymm9[5] +; AVX1-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps %ymm0, %ymm15 -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4] ; AVX1-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovaps 176(%rdi), %xmm1 -; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] ; AVX1-NEXT: vmovaps 144(%rdi), %xmm4 ; AVX1-NEXT: vmovaps 128(%rdi), %xmm8 @@ -1111,15 +1104,15 @@ ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vunpcklps {{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4] -; AVX1-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0] ; AVX1-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovaps 16(%rdi), %xmm3 ; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 48(%rdi), %xmm5 +; AVX1-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0] ; AVX1-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,0] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -1127,7 +1120,8 @@ ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[4],ymm10[4],ymm6[5],ymm10[5] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm6[1,0],ymm5[5,4],ymm6[5,4] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm6[1,0],ymm1[5,4],ymm6[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -1138,12 +1132,12 @@ ; AVX1-NEXT: vblendps 
{{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,0],ymm9[1,0],ymm15[5,4],ymm9[5,4] +; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[4],ymm13[4],ymm15[5],ymm13[5] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,0],ymm12[1,0],ymm9[5,4],ymm12[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[1],xmm4[1],zero,zero -; AVX1-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload -; AVX1-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vunpcklps (%rsp), %xmm2, %xmm7 # 16-byte Folded Reload ; AVX1-NEXT: # xmm7 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -1172,12 +1166,12 @@ ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm13[1,0],ymm1[5,4],ymm13[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; AVX1-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: # xmm1 = mem[0],xmm15[1],zero,zero ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-NEXT: # xmm7 = mem[0],xmm7[1],zero,zero -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX1-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload @@ -1202,8 +1196,8 @@ ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload +; AVX1-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm5[2],xmm6[2] ; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -1225,11 +1219,11 @@ ; AVX1-NEXT: # ymm1 = ymm13[2],mem[2],ymm13[3],mem[3],ymm13[6],mem[6],ymm13[7],mem[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm13[2],xmm15[2] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm8[2],xmm9[2] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -1275,8 +1269,8 @@ ; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-NEXT: # ymm3 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,3],ymm3[6,4],ymm2[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,0],xmm8[3,0] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm15[3,0],xmm13[3,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm5[2,0],xmm3[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -298,13 +298,15 @@ ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512-NEXT: vpextrd $2, %xmm5, %eax ; AVX512-NEXT: vpinsrd $3, %eax, %xmm4, %xmm8 +; AVX512-NEXT: vpextrd $3, %xmm0, %eax +; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,1,1] +; AVX512-NEXT: vpinsrd $1, %eax, %xmm6, %xmm6 ; AVX512-NEXT: vpextrd $1, %xmm3, %eax -; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm0[2,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] ; AVX512-NEXT: vpinsrd $2, %eax, %xmm6, %xmm6 ; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm5[3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm1[0,1],xmm2[2,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,0,2,3] +; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm7 +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vpinsrd $1, %eax, %xmm7, %xmm7 ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm3[2],xmm7[3] ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX512-NEXT: vmovd %xmm4, %eax @@ -321,9 +323,10 @@ ; AVX512-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 ; AVX512-NEXT: vpextrd $2, %xmm4, %eax ; AVX512-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 +; AVX512-NEXT: vpextrd $3, %xmm1, %eax +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX512-NEXT: vpextrd $1, %xmm5, %eax -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3] ; AVX512-NEXT: vmovdqa %xmm8, (%rsi) @@ -879,32 +882,29 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i32_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $376, %rsp # imm = 0x178 -; SSE-NEXT: movdqa 144(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm8 -; SSE-NEXT: movdqa 96(%rdi), %xmm12 +; SSE-NEXT: subq $360, %rsp # imm = 0x168 +; SSE-NEXT: movdqa 144(%rdi), %xmm12 ; 
SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm10 -; SSE-NEXT: movdqa 240(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm7 -; SSE-NEXT: movdqa 192(%rdi), %xmm0 +; SSE-NEXT: movdqa 160(%rdi), %xmm9 +; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm1 -; SSE-NEXT: movdqa 64(%rdi), %xmm6 -; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa 112(%rdi), %xmm11 +; SSE-NEXT: movdqa 240(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm14 +; SSE-NEXT: movdqa 256(%rdi), %xmm7 +; SSE-NEXT: movdqa 192(%rdi), %xmm14 +; SSE-NEXT: movdqa 208(%rdi), %xmm1 +; SSE-NEXT: movdqa 64(%rdi), %xmm8 +; SSE-NEXT: movdqa (%rdi), %xmm15 +; SSE-NEXT: movdqa 16(%rdi), %xmm10 ; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa %xmm14, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] @@ -912,30 +912,28 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa %xmm1, %xmm13 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = 
xmm3[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 288(%rdi), %xmm9 -; SSE-NEXT: movdqa 304(%rdi), %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: movdqa 336(%rdi), %xmm12 @@ -946,18 +944,16 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa (%rsp), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] @@ -971,155 +967,152 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] ; SSE-NEXT: movdqa 80(%rdi), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm8, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,3,2,3] ; SSE-NEXT: movdqa 32(%rdi), %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = 
xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: movdqa %xmm4, %xmm5 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm11 ; SSE-NEXT: movdqa 272(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm6 ; SSE-NEXT: movdqa 224(%rdi), %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa 368(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] -; SSE-NEXT: movdqa 320(%rdi), %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] ; SSE-NEXT: movdqa 176(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm13, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa 128(%rdi), %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: movdqa 128(%rdi), %xmm14 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa 368(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa 320(%rdi), %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm5, %xmm13 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm10[2,3,2,3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = 
xmm0[0],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm14 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, %xmm11 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd $85, (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = 
xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -1127,47 +1120,46 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) ; SSE-NEXT: movapd %xmm10, 16(%r8) ; SSE-NEXT: movapd %xmm12, 48(%r8) -; SSE-NEXT: movapd %xmm13, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movapd %xmm11, 32(%r8) +; SSE-NEXT: movapd %xmm15, (%r8) ; 
SSE-NEXT: movapd %xmm3, 48(%r9) ; SSE-NEXT: movapd %xmm5, 16(%r9) -; SSE-NEXT: movapd %xmm6, 32(%r9) -; SSE-NEXT: movapd %xmm8, (%r9) +; SSE-NEXT: movapd %xmm6, (%r9) +; SSE-NEXT: movapd %xmm9, 32(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm14, 16(%rax) -; SSE-NEXT: movapd %xmm7, (%rax) -; SSE-NEXT: movapd %xmm9, 32(%rax) -; SSE-NEXT: movapd %xmm11, 48(%rax) -; SSE-NEXT: addq $376, %rsp # imm = 0x178 +; SSE-NEXT: movapd %xmm13, 16(%rax) +; SSE-NEXT: movapd %xmm14, (%rax) +; SSE-NEXT: movapd %xmm8, 32(%rax) +; SSE-NEXT: movapd %xmm0, 48(%rax) +; SSE-NEXT: addq $360, %rsp # imm = 0x168 ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i32_stride6_vf16: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll @@ -282,16 +282,16 @@ ; AVX1-NEXT: vmovaps 96(%rdi), %xmm4 ; AVX1-NEXT: vmovaps 64(%rdi), %xmm5 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm15 = xmm5[0],xmm4[0] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm0[1] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm6[1] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm3[1],xmm0[1] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm7[1],xmm6[1] ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm1[1] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm4[1] -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] ; AVX1-NEXT: vmovaps 48(%rdi), %xmm5 ; AVX1-NEXT: vmovaps 16(%rdi), %xmm6 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX1-NEXT: vmovaps 176(%rdi), %xmm1 ; AVX1-NEXT: vmovaps 144(%rdi), %xmm0 @@ -309,13 +309,13 @@ ; AVX1-NEXT: vmovaps %xmm1, 32(%rsi) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm1, (%rsi) -; AVX1-NEXT: vmovaps %xmm2, 16(%rdx) ; AVX1-NEXT: vmovaps %xmm3, 48(%rdx) -; AVX1-NEXT: vmovaps %xmm8, (%rdx) +; AVX1-NEXT: vmovaps %xmm4, (%rdx) +; AVX1-NEXT: vmovaps %xmm8, 32(%rdx) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm1, 32(%rdx) +; AVX1-NEXT: vmovaps %xmm1, 16(%rdx) ; AVX1-NEXT: vmovaps %ymm7, 32(%rcx) -; AVX1-NEXT: vmovaps %ymm4, (%rcx) +; AVX1-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-NEXT: vmovaps %ymm0, 32(%r8) ; AVX1-NEXT: vmovaps %ymm5, (%r8) ; AVX1-NEXT: vzeroupper @@ -587,143 +587,143 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: subq $296, %rsp # imm = 0x128 ; AVX1-NEXT: vmovaps 224(%rdi), %xmm8 -; AVX1-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm9[0],xmm8[0] +; AVX1-NEXT: vmovaps 192(%rdi), %xmm11 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm8[0] ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX1-NEXT: vmovaps 64(%rdi), %xmm11 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm10[0] +; AVX1-NEXT: vmovaps 96(%rdi), %xmm13 +; AVX1-NEXT: vmovaps 64(%rdi), %xmm14 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm13[0] ; 
AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 352(%rdi), %xmm4 +; AVX1-NEXT: vmovaps 352(%rdi), %xmm9 ; AVX1-NEXT: vmovaps 320(%rdi), %xmm5 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm4[0] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 160(%rdi), %xmm12 -; AVX1-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm12[0] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-NEXT: vmovaps (%rdi), %xmm3 -; AVX1-NEXT: vmovaps 288(%rdi), %xmm6 -; AVX1-NEXT: vmovaps 256(%rdi), %xmm0 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm6[0] -; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm8[1] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm9[0] ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX1-NEXT: vmovaps (%rdi), %xmm7 ; AVX1-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; AVX1-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX1-NEXT: vmovaps 480(%rdi), %xmm1 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm11[1],xmm10[1] +; AVX1-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX1-NEXT: vmovaps 480(%rdi), %xmm2 +; AVX1-NEXT: vmovaps 448(%rdi), %xmm3 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 448(%rdi), %xmm4 ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm1[0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm6[0] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm13[1] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 288(%rdi), %xmm0 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] ; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX1-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm9[1] +; AVX1-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm8[1] +; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; 
AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 224(%rdi), %ymm13 -; AVX1-NEXT: vmovaps 192(%rdi), %ymm12 -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm13[0],ymm12[2],ymm13[2] -; AVX1-NEXT: vmovaps 176(%rdi), %xmm11 -; AVX1-NEXT: vmovaps 144(%rdi), %xmm10 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm10[0],xmm11[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-NEXT: vmovaps 304(%rdi), %xmm15 ; AVX1-NEXT: vmovaps 272(%rdi), %xmm14 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm14[0],xmm15[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm14[0],xmm15[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps 480(%rdi), %ymm8 -; AVX1-NEXT: vmovaps 448(%rdi), %ymm7 -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX1-NEXT: vmovaps 432(%rdi), %xmm5 -; AVX1-NEXT: vmovaps 400(%rdi), %xmm4 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm4[0],xmm5[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-NEXT: vmovaps 224(%rdi), %ymm12 +; AVX1-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] +; AVX1-NEXT: vmovaps 176(%rdi), %xmm10 +; AVX1-NEXT: vmovaps 144(%rdi), %xmm8 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm10[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX1-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm0[0],xmm1[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm11[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: vmovaps 96(%rdi), %ymm7 +; AVX1-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX1-NEXT: vmovaps 48(%rdi), %xmm5 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm4 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm13 = xmm4[0],xmm5[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovaps 480(%rdi), %ymm3 +; AVX1-NEXT: vmovaps 448(%rdi), %ymm2 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-NEXT: vmovaps 432(%rdi), %xmm1 +; AVX1-NEXT: vmovaps 400(%rdi), %xmm0 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm1[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; 
AVX1-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm10[1] +; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm5[1] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm15[1] ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 112(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 96(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 64(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, (%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 32(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 80(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 16(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 48(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 32(%rdx) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 48(%rdx) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 64(%rdx) +; AVX1-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 80(%rdx) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, (%rdx) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 16(%rdx) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 96(%rdx) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 112(%rdx) +; AVX1-NEXT: vmovaps %ymm9, 96(%rcx) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm14[1],xmm15[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 112(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 96(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 64(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, (%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 32(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 80(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 16(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 48(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 96(%rdx) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 112(%rdx) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, (%rdx) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 16(%rdx) -; AVX1-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 32(%rdx) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 48(%rdx) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 64(%rdx) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 80(%rdx) -; AVX1-NEXT: vmovaps %ymm6, (%rcx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-NEXT: vmovaps %ymm2, 64(%r8) -; AVX1-NEXT: vmovaps %ymm1, 96(%r8) -; AVX1-NEXT: vmovaps %ymm0, (%r8) -; AVX1-NEXT: vmovaps %ymm10, 32(%r8) +; AVX1-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX1-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-NEXT: vmovaps %ymm0, 96(%r8) +; AVX1-NEXT: vmovaps %ymm4, (%r8) +; AVX1-NEXT: vmovaps %ymm8, 32(%r8) ; AVX1-NEXT: addq $296, %rsp # imm = 0x128 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -731,97 +731,97 @@ ; AVX2-LABEL: load_i64_stride4_vf16: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $296, %rsp # imm = 0x128 -; AVX2-NEXT: vmovaps 224(%rdi), %xmm11 +; AVX2-NEXT: vmovaps 224(%rdi), %xmm10 ; AVX2-NEXT: vmovaps 192(%rdi), %xmm15 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm15[0],xmm11[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm15[0],xmm10[0] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX2-NEXT: vmovaps (%rdi), %xmm9 +; AVX2-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX2-NEXT: vmovaps (%rdi), %xmm14 ; AVX2-NEXT: vmovaps 64(%rdi), %xmm4 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm7[0] -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 352(%rdi), %xmm8 -; AVX2-NEXT: vmovaps 320(%rdi), %xmm6 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm8[0] -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 160(%rdi), %xmm3 -; AVX2-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm3[0] -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-NEXT: vmovaps 256(%rdi), %xmm5 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0] -; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte 
Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm8[1] -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm1[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm0[0] ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm15[1],xmm11[1] -; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 480(%rdi), %xmm11 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX2-NEXT: vmovaps 352(%rdi), %xmm7 +; AVX2-NEXT: vmovaps 320(%rdi), %xmm1 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm7[0] ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 448(%rdi), %xmm2 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm7[1] +; AVX2-NEXT: vmovaps 160(%rdi), %xmm6 +; AVX2-NEXT: vmovaps 416(%rdi), %xmm13 +; AVX2-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX2-NEXT: vmovaps 480(%rdi), %xmm12 +; AVX2-NEXT: vmovaps 448(%rdi), %xmm5 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm5[0],xmm12[0] ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm11[0] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm12[1] ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm13[0] +; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm13[1] ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0] +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm10[1] +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm6[0] +; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm0[0] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm0[0] ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm0[1] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 160(%rdi), %ymm9 -; AVX2-NEXT: vmovaps 128(%rdi), %ymm8 -; AVX2-NEXT: vmovaps 224(%rdi), %ymm5 -; AVX2-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = 
ymm8[0],ymm9[0],ymm8[2],ymm9[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 256(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 352(%rdi), %ymm14 -; AVX2-NEXT: vmovaps 320(%rdi), %ymm13 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm11[2,3] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 416(%rdi), %ymm10 -; AVX2-NEXT: vmovaps 384(%rdi), %ymm7 -; AVX2-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-NEXT: vmovaps 448(%rdi), %ymm3 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm15[2,3] +; AVX2-NEXT: vmovaps 352(%rdi), %ymm13 +; AVX2-NEXT: vmovaps 320(%rdi), %ymm11 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm13[0],ymm11[2],ymm13[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm12[2,3] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm11[0],ymm2[2],ymm11[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm4[2,3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm11[1],ymm2[3],ymm11[3] +; AVX2-NEXT: vmovaps 160(%rdi), %ymm15 +; AVX2-NEXT: vmovaps 128(%rdi), %ymm7 +; AVX2-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX2-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm7[0],ymm15[0],ymm7[2],ymm15[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm12[2,3] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 32(%rdi), %ymm9 +; AVX2-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX2-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3],ymm14[2,3] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 416(%rdi), %ymm12 +; AVX2-NEXT: vmovaps 384(%rdi), %ymm6 +; AVX2-NEXT: vmovaps 480(%rdi), %ymm4 +; AVX2-NEXT: vmovaps 448(%rdi), %ymm2 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm6[0],ymm12[0],ymm6[2],ymm12[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm14[2,3],ymm10[2,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = 
ymm7[1],ymm15[1],ymm7[3],ymm15[3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm3[2,3],ymm0[2,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm12[1],ymm6[3],ymm12[3] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] @@ -830,7 +830,7 @@ ; AVX2-NEXT: vmovaps %xmm3, 112(%rsi) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vmovaps %xmm3, 96(%rsi) -; AVX2-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vmovaps %xmm3, 64(%rsi) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vmovaps %xmm3, (%rsi) @@ -843,32 +843,32 @@ ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vmovaps %xmm3, 48(%rsi) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm3, 96(%rdx) +; AVX2-NEXT: vmovaps %xmm3, 32(%rdx) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm3, 112(%rdx) +; AVX2-NEXT: vmovaps %xmm3, 48(%rdx) +; AVX2-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload +; AVX2-NEXT: vmovaps %xmm3, 64(%rdx) +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-NEXT: vmovaps %xmm3, 80(%rdx) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vmovaps %xmm3, (%rdx) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vmovaps %xmm3, 16(%rdx) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm3, 32(%rdx) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm3, 48(%rdx) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm3, 64(%rdx) +; AVX2-NEXT: vmovaps %xmm3, 96(%rdx) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm3, 80(%rdx) -; AVX2-NEXT: vmovaps %ymm12, (%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-NEXT: vmovaps %xmm3, 112(%rdx) +; AVX2-NEXT: vmovaps %ymm10, 96(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 64(%rcx) ; AVX2-NEXT: vmovaps %ymm2, 64(%r8) ; AVX2-NEXT: vmovaps %ymm1, 96(%r8) ; AVX2-NEXT: vmovaps %ymm0, (%r8) -; AVX2-NEXT: vmovaps %ymm5, 32(%r8) +; AVX2-NEXT: vmovaps %ymm15, 32(%r8) ; AVX2-NEXT: addq $296, %rsp # imm = 
0x128 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll @@ -298,12 +298,12 @@ ; SSE-LABEL: load_i64_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: subq $40, %rsp -; SSE-NEXT: movaps 160(%rdi), %xmm8 +; SSE-NEXT: movaps 160(%rdi), %xmm9 ; SSE-NEXT: movaps 112(%rdi), %xmm0 ; SSE-NEXT: movaps 352(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm12 -; SSE-NEXT: movaps 208(%rdi), %xmm9 +; SSE-NEXT: movaps 208(%rdi), %xmm8 ; SSE-NEXT: movaps 64(%rdi), %xmm7 ; SSE-NEXT: movaps (%rdi), %xmm11 ; SSE-NEXT: movaps 16(%rdi), %xmm10 @@ -336,34 +336,34 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: movaps %xmm8, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm12[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm0, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 304(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: movaps 304(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm8 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps 32(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] -; SSE-NEXT: movaps 272(%rdi), %xmm1 -; SSE-NEXT: movaps 224(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: movaps 176(%rdi), %xmm1 -; SSE-NEXT: movaps 128(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps 80(%rdi), %xmm0 +; SSE-NEXT: movaps 32(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: movaps 272(%rdi), %xmm0 +; SSE-NEXT: movaps 224(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 176(%rdi), %xmm0 +; SSE-NEXT: movaps 128(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] ; SSE-NEXT: movaps 368(%rdi), %xmm1 ; SSE-NEXT: movaps 320(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm2 @@ -383,11 +383,11 @@ ; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rdx) ; SSE-NEXT: movaps %xmm12, 16(%rcx) -; SSE-NEXT: movaps %xmm9, 48(%rcx) +; SSE-NEXT: movaps %xmm8, 48(%rcx) ; SSE-NEXT: movaps %xmm11, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps %xmm8, 48(%r8) +; SSE-NEXT: movaps %xmm7, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -395,14 +395,14 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r8) ; SSE-NEXT: movaps %xmm2, 48(%r9) -; SSE-NEXT: movaps %xmm5, 16(%r9) -; SSE-NEXT: movaps %xmm6, 32(%r9) +; SSE-NEXT: movaps %xmm4, 16(%r9) ; SSE-NEXT: movaps %xmm10, (%r9) +; SSE-NEXT: movaps %xmm6, 32(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps %xmm4, 16(%rax) -; SSE-NEXT: movaps %xmm3, 32(%rax) -; SSE-NEXT: movaps %xmm7, (%rax) +; SSE-NEXT: movaps %xmm3, 16(%rax) +; SSE-NEXT: movaps %xmm5, 32(%rax) +; SSE-NEXT: movaps %xmm9, (%rax) ; SSE-NEXT: addq $40, %rsp ; SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -417,13 +417,13 @@ ; ; AVX1-LABEL: store_i64_stride6_vf8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovapd (%rdi), %ymm15 -; AVX1-NEXT: vmovapd 32(%rdi), %ymm12 -; AVX1-NEXT: vmovapd (%rsi), %ymm9 +; AVX1-NEXT: vmovapd (%rdi), %ymm6 +; AVX1-NEXT: vmovapd 32(%rdi), %ymm10 +; AVX1-NEXT: vmovapd (%rsi), %ymm7 ; AVX1-NEXT: vmovapd 32(%rsi), %ymm13 -; AVX1-NEXT: vmovapd (%r8), %ymm10 +; AVX1-NEXT: vmovapd (%r8), %ymm8 ; AVX1-NEXT: vmovapd 32(%r8), %ymm14 -; AVX1-NEXT: vmovapd 32(%r9), %ymm2 +; AVX1-NEXT: vmovapd 32(%r9), %ymm12 ; AVX1-NEXT: vmovaps 48(%rsi), %xmm0 ; AVX1-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] @@ -431,76 +431,78 @@ ; AVX1-NEXT: vbroadcastsd 48(%rcx), %ymm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-NEXT: vmovaps (%rsi), %xmm3 -; AVX1-NEXT: vmovaps 16(%rsi), %xmm5 -; AVX1-NEXT: vmovaps 32(%rsi), %xmm6 -; AVX1-NEXT: vmovaps (%rdi), %xmm4 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm11 -; AVX1-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm0[1],xmm6[1] -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-NEXT: vblendpd {{.*#+}} ymm7 = ymm14[0],ymm7[1,2,3] -; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3] -; AVX1-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] -; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] -; AVX1-NEXT: vbroadcastsd 16(%rcx), %ymm7 -; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm7[6,7] -; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1] +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vmovaps 32(%rsi), %xmm4 +; AVX1-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 
-; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3] -; AVX1-NEXT: vblendpd {{.*#+}} ymm11 = ymm1[0],ymm7[1],ymm1[2,3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm13[2,3] -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm14[2,3],ymm7[2,3] -; AVX1-NEXT: vshufpd {{.*#+}} ymm12 = ymm7[0],ymm1[0],ymm7[2],ymm1[3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0],ymm1[1,2,3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] +; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovaps 16(%rsi), %xmm0 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] +; AVX1-NEXT: vbroadcastsd 16(%rcx), %ymm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX1-NEXT: vmovaps (%rsi), %xmm0 +; AVX1-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm1[1],xmm0[1] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0],ymm3[1,2,3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm15 = ymm3[0],ymm9[1],ymm3[2,3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm12[2,3],ymm13[2,3] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm13[1],ymm10[3],ymm13[3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm14[2,3],ymm10[2,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[0],ymm3[0],ymm10[2],ymm3[3] ; AVX1-NEXT: vmovaps 32(%rcx), %xmm14 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-NEXT: vbroadcastsd 40(%r8), %ymm7 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5],ymm1[6,7] -; AVX1-NEXT: vinsertf128 $1, 32(%r9), %ymm14, %ymm7 -; AVX1-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1],ymm7[2,3],ymm1[4,5],ymm7[6,7] -; AVX1-NEXT: vmovapd (%r9), %ymm1 -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm15[1],ymm9[1],ymm15[3],ymm9[3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm1[2,3],ymm9[2,3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3] -; AVX1-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[2],ymm9[3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm9 = mem[2,3,2,3] -; AVX1-NEXT: vbroadcastsd 8(%r8), %ymm10 -; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] -; AVX1-NEXT: vmovaps (%rcx), %xmm10 -; AVX1-NEXT: vinsertf128 $1, (%r9), %ymm10, %ymm15 -; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm15[2,3],ymm9[4,5],ymm15[6,7] -; AVX1-NEXT: vmovapd 48(%rdx), %xmm5 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1] -; AVX1-NEXT: vbroadcastsd 56(%r8), %ymm15 -; AVX1-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3] -; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3] -; AVX1-NEXT: vmovapd 16(%rdx), %xmm5 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1] -; AVX1-NEXT: vbroadcastsd 24(%r8), %ymm15 -; AVX1-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3] -; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] +; AVX1-NEXT: vbroadcastsd 40(%r8), %ymm13 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5],ymm3[6,7] +; AVX1-NEXT: vinsertf128 $1, 32(%r9), %ymm14, %ymm13 +; AVX1-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1],ymm13[2,3],ymm3[4,5],ymm13[6,7] +; AVX1-NEXT: vmovapd (%r9), %ymm3 +; AVX1-NEXT: vunpckhpd {{.*#+}} 
ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3],ymm7[2,3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm8[2,3],ymm6[2,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm9 = ymm6[0],ymm7[0],ymm6[2],ymm7[3] +; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = mem[2,3,2,3] +; AVX1-NEXT: vbroadcastsd 8(%r8), %ymm8 +; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] +; AVX1-NEXT: vmovaps (%rcx), %xmm8 +; AVX1-NEXT: vinsertf128 $1, (%r9), %ymm8, %ymm11 +; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2,3],ymm7[4,5],ymm11[6,7] +; AVX1-NEXT: vmovapd 48(%rdx), %xmm2 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],mem[1] +; AVX1-NEXT: vbroadcastsd 56(%r8), %ymm11 +; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm12[3] +; AVX1-NEXT: vmovapd 16(%rdx), %xmm6 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],mem[1] +; AVX1-NEXT: vbroadcastsd 24(%r8), %ymm11 +; AVX1-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm11[2],ymm6[3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0] ; AVX1-NEXT: vmovaps 32(%rdx), %xmm5 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm14[0] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-NEXT: vmovaps (%rdx), %xmm4 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm10[0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovaps (%rdx), %xmm1 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-NEXT: vmovaps %xmm4, 16(%rax) -; AVX1-NEXT: vmovaps %xmm3, (%rax) +; AVX1-NEXT: vmovaps %xmm1, 16(%rax) +; AVX1-NEXT: vmovaps %xmm0, (%rax) ; AVX1-NEXT: vmovaps %xmm5, 208(%rax) -; AVX1-NEXT: vmovaps %xmm0, 192(%rax) -; AVX1-NEXT: vmovaps %ymm9, 64(%rax) -; AVX1-NEXT: vmovapd %ymm7, 128(%rax) +; AVX1-NEXT: vmovaps %xmm4, 192(%rax) +; AVX1-NEXT: vmovaps %ymm7, 64(%rax) +; AVX1-NEXT: vmovapd %ymm9, 128(%rax) ; AVX1-NEXT: vmovaps %ymm13, 256(%rax) -; AVX1-NEXT: vmovapd %ymm12, 320(%rax) -; AVX1-NEXT: vmovapd %ymm11, 32(%rax) -; AVX1-NEXT: vmovaps %ymm8, 96(%rax) -; AVX1-NEXT: vmovapd %ymm1, 160(%rax) +; AVX1-NEXT: vmovapd %ymm10, 320(%rax) +; AVX1-NEXT: vmovapd %ymm15, 32(%rax) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-NEXT: vmovapd %ymm3, 160(%rax) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll @@ -572,7 +572,7 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: setne %al @@ -588,7 +588,7 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: setne %al @@ -604,7 +604,7 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; 
AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: testl %eax, %eax ; AVX512-NEXT: setne %al @@ -643,7 +643,7 @@ ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al @@ -660,7 +660,7 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al @@ -674,11 +674,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: testl %eax, %eax ; AVX512-NEXT: sete %al @@ -719,11 +719,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: setne %al @@ -738,11 +738,11 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: setne %al @@ -757,11 +757,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: testl %eax, %eax ; AVX512-NEXT: setne %al diff --git a/llvm/test/CodeGen/X86/vector-reduce-and.ll b/llvm/test/CodeGen/X86/vector-reduce-and.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and.ll @@ -489,11 +489,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpand %xmm1, 
%xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT: vzeroupper
@@ -504,11 +504,11 @@
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vmovd %xmm0, %eax
 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT: vzeroupper
@@ -519,11 +519,11 @@
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vmovd %xmm0, %eax
 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT: vzeroupper
@@ -559,7 +559,7 @@
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT: vzeroupper
@@ -575,7 +575,7 @@
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vmovd %xmm0, %eax
 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT: vzeroupper
@@ -588,11 +588,11 @@
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vmovd %xmm0, %eax
 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT: vzeroupper
@@ -630,11 +630,11 @@
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT: vzeroupper
@@ -648,11 +648,11 @@
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vmovd %xmm0, %eax
 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT: vzeroupper
@@ -666,11 +666,11 @@
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vmovd %xmm0, %eax
 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or.ll b/llvm/test/CodeGen/X86/vector-reduce-or.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-or.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or.ll
@@ -489,11 +489,11 @@
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT: vzeroupper
@@ -504,11 +504,11 @@
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vmovd %xmm0, %eax
 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT: vzeroupper
@@ -519,11 +519,11 @@
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vmovd %xmm0, %eax
 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT: vzeroupper
@@ -559,7 +559,7 @@
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT: vzeroupper
@@ -575,7 +575,7 @@
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vmovd %xmm0, %eax
 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT: vzeroupper
@@ -588,11 +588,11 @@
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vmovd %xmm0, %eax
 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT: vzeroupper
@@ -630,11 +630,11 @@
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vorps %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vorps %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT: vzeroupper
@@ -648,11 +648,11 @@
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vmovd %xmm0, %eax
 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT: vzeroupper
@@ -666,11 +666,11 @@
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vmovd %xmm0, %eax
 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor.ll b/llvm/test/CodeGen/X86/vector-reduce-xor.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-xor.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor.ll
@@ -489,11 +489,11 @@
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT: vzeroupper
@@ -504,11 +504,11 @@
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vmovd %xmm0, %eax
 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT: vzeroupper
@@ -519,11 +519,11 @@
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vmovd %xmm0, %eax
 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT: vzeroupper
@@ -559,7 +559,7 @@
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT: vzeroupper
@@ -575,7 +575,7 @@
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vmovd %xmm0, %eax
 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT: vzeroupper
@@ -588,11 +588,11 @@
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vmovd %xmm0, %eax
 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT: vzeroupper
@@ -630,11 +630,11 @@
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vxorps %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vxorps %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vmovd %xmm0, %eax
 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT: vzeroupper
@@ -648,11 +648,11 @@
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vmovd %xmm0, %eax
 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT: vzeroupper
@@ -666,11 +666,11 @@
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vmovd %xmm0, %eax
 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -658,7 +658,7 @@
 ;
 ; AVX2-LABEL: splatvar_rotate_v16i16:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15]
 ; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm3
 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm4
 ; AVX2-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
@@ -669,7 +669,7 @@
 ;
 ; AVX512F-LABEL: splatvar_rotate_v16i16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15]
 ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3
 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4
 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
@@ -680,7 +680,7 @@
 ;
 ; AVX512VL-LABEL: splatvar_rotate_v16i16:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15]
 ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3
 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4
 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
@@ -691,7 +691,7 @@
 ;
 ; AVX512BW-LABEL: splatvar_rotate_v16i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15]
 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3
 ; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm4
 ; AVX512BW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
@@ -702,7 +702,7 @@
 ;
 ; AVX512VLBW-LABEL: splatvar_rotate_v16i16:
 ; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15]
 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3
 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm4
 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll
--- a/llvm/test/CodeGen/X86/vector-rotate-512.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll
@@ -343,7 +343,7 @@
 ;
 ; AVX512BW-LABEL: splatvar_rotate_v32i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15]
 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3
 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm4
 ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3
@@ -354,7 +354,7 @@
 ;
 ; AVX512VLBW-LABEL: splatvar_rotate_v32i16:
 ; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15]
 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3
 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm4
 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2982,40 +2982,56 @@
 define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i16> %b) {
 ; SSE2-LABEL: shuffle_extract_concat_insert:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: pextrw $2, %xmm1, %ecx
+; SSE2-NEXT: pextrw $5, %xmm2, %edx
+; SSE2-NEXT: pextrw $7, %xmm2, %esi
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
+; SSE2-NEXT: pinsrw $4, %ecx, %xmm0
+; SSE2-NEXT: pinsrw $5, %edx, %xmm0
+; SSE2-NEXT: pinsrw $6, %eax, %xmm0
+; SSE2-NEXT: pinsrw $7, %esi, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: shuffle_extract_concat_insert:
 ; SSSE3: # %bb.0:
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: pextrw $2, %xmm1, %eax
+; SSSE3-NEXT: pextrw $5, %xmm2, %ecx
+; SSSE3-NEXT: movd %xmm1, %edx
+; SSSE3-NEXT: pextrw $7, %xmm2, %esi
 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,0,1,14,15,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: pinsrw $4, %eax, %xmm0
+; SSSE3-NEXT: pinsrw $5, %ecx, %xmm0
+; SSSE3-NEXT: pinsrw $6, %edx, %xmm0
+; SSSE3-NEXT: pinsrw $7, %esi, %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: shuffle_extract_concat_insert:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: pextrw $2, %xmm1, %eax
 ; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,0,1,14,15,u,u,u,u,12,13,14,15]
+; SSE41-NEXT: movd %xmm1, %ecx
+; SSE41-NEXT: pinsrw $4, %eax, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5],xmm0[6,7]
+; SSE41-NEXT: pinsrw $6, %ecx, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: shuffle_extract_concat_insert:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vpextrw $2, %xmm1, %eax
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,0,1,14,15,u,u,u,u,12,13,14,15]
+; AVX-NEXT: vmovd %xmm1, %ecx
+; AVX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5],xmm0[6,7]
+; AVX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
 ; AVX-NEXT: retq
 %a = shufflevector <4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i32>
 %a0 = extractelement <8 x i16> %a, i32 0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
@@ -364,8 +364,8 @@
 ; AMD10H: # %bb.0:
 ; AMD10H-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AMD10H-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AMD10H-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; AMD10H-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; AMD10H-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; AMD10H-NEXT: packuswb %xmm0, %xmm0
 ; AMD10H-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -2571,10 +2571,9 @@
 ;
 ; SSE41-LABEL: splatshuf_zext_v4i64:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
-; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: movdqa %xmm0, %xmm1
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: splatshuf_zext_v4i64:
@@ -2711,11 +2710,8 @@
 ;
 ; SSE41-LABEL: splatshuf_zext_v16i16:
 ; SSE41: # %bb.0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero
 ; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: splatshuf_zext_v16i16:
diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll
--- a/llvm/test/CodeGen/X86/xor.ll
+++ b/llvm/test/CodeGen/X86/xor.ll
@@ -409,8 +409,8 @@
 ;
 ; X64-WIN-LABEL: PR17487:
 ; X64-WIN: # %bb.0:
+; X64-WIN-NEXT: andb $1, %cl
 ; X64-WIN-NEXT: movzbl %cl, %eax
-; X64-WIN-NEXT: andl $1, %eax
 ; X64-WIN-NEXT: retq
 %tmp = insertelement <2 x i1> undef, i1 %tobool, i32 1
 %tmp1 = zext <2 x i1> %tmp to <2 x i64>