diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1619,11 +1619,11 @@
     // Add any operands of the new node which have not yet been combined to the
     // worklist as well. Because the worklist uniques things already, this
     // won't repeatedly process the same operand.
-    CombinedNodes.insert(N);
     for (const SDValue &ChildN : N->op_values())
       if (!CombinedNodes.count(ChildN.getNode()))
         AddToWorklist(ChildN.getNode());
+    CombinedNodes.insert(N);
 
     SDValue RV = combine(N);
 
     if (!RV.getNode())
@@ -1657,10 +1657,8 @@
     // out), because re-visiting the EntryToken and its users will not uncover
     // any additional opportunities, but there may be a large number of such
     // users, potentially causing compile time explosion.
-    if (RV.getOpcode() != ISD::EntryToken) {
-      AddToWorklist(RV.getNode());
-      AddUsersToWorklist(RV.getNode());
-    }
+    if (RV.getOpcode() != ISD::EntryToken)
+      AddToWorklistWithUsers(RV.getNode());
 
     // Finally, if the node is now dead, remove it from the graph. The node
     // may not be dead if the replacement process recursively simplified to
diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
--- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -268,10 +268,9 @@
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64
 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; GCN-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v4, vcc
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
 ; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
 ; GCN-NEXT: s_endpgm
@@ -282,12 +281,11 @@
 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: global_load_dword v3, v2, s[2:3]
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3
-; GFX9-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX9-NEXT: v_subbrev_co_u32_e32 v0, vcc, 0, v3, vcc
+; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
 ; GFX9-NEXT: v_add_u32_e32 v0, 0x64, v0
 ; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
 ; GFX9-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll
@@ -23,8 +23,8 @@
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x24
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 1.0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 4.0, 2.0, s[0:1]
+; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 1.0
+; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1]
 ; GCN-NEXT: flat_store_dword v[0:1], v0
 ; GCN-NEXT: s_endpgm
 %c1 = fcmp olt float %x, 1.0
@@ -40,8 +40,8 @@
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x24
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, 1.0
-; GCN-NEXT: v_cndmask_b32_e64 v0, 4.0, 2.0, s[0:1]
+; GCN-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 1.0
+; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, 4.0, s[0:1]
 ; GCN-NEXT: flat_store_dword v[0:1], v0
 ; GCN-NEXT: s_endpgm
%c1 = fcmp olt float %x, 1.0 diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll --- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll @@ -224,12 +224,12 @@ ; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v6 offset:5 ; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v1 ; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v2 offset:1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v8 offset:6 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v0 offset:7 ; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v3 offset:2 ; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v4 offset:3 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v8 offset:6 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v0 offset:7 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds8align1: @@ -296,17 +296,17 @@ ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:4 ; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:6 -; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:2 +; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 +; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:6 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) ; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v1 offset:4 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) ; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v2 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v3 offset:6 +; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v3 offset:2 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v0 offset:2 +; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v0 offset:6 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds8align2: @@ -395,24 +395,22 @@ ; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10 ; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:11 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v12, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v5 offset:4 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v6 offset:5 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v9 offset:8 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v10 offset:9 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v11 offset:10 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v0 offset:11 +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v5 offset:4 +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v6 offset:5 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v1 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v2 offset:1 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v3 offset:2 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v4 offset:3 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v7 offset:6 ; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v8 offset:7 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v11 offset:10 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) +; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v0 offset:11 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds12align1: @@ -494,23 +492,23 @@ ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 -; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 
offset:2 -; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:4 -; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:8 -; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:10 -; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:6 +; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:8 +; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 +; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 +; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4 +; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v6, s1 +; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:10 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1 offset:8 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v3 offset:4 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v4 offset:8 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v5 offset:10 -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1 -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v2 offset:2 +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v4 offset:4 +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v2 +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v3 offset:2 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) -; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v0 offset:6 +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v5 offset:6 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v0 offset:10 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds12align2: @@ -695,30 +693,25 @@ ; ALIGNED-SDAG-NEXT: ds_read_u8 v15, v0 offset:14 ; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:15 ; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v16, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v3 offset:2 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v4 offset:3 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v13 offset:12 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v14 offset:13 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v1 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v2 offset:1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v7 offset:6 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v8 offset:7 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v5 offset:4 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v6 offset:5 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v11 offset:10 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v12 offset:11 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v9 offset:8 ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v10 offset:9 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v3 offset:2 +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v4 offset:3 +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v7 offset:6 +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v8 offset:7 +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v11 offset:10 +; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v12 offset:11 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(14) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v15 offset:14 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) ; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v0 offset:15 -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v13 offset:12 -; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v14 offset:13 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds16align1: @@ -816,27 +809,29 @@ ; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-SDAG-NEXT: 
v_mov_b32_e32 v0, s0 +; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:12 ; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 ; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 ; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4 ; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6 ; ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:8 ; ALIGNED-SDAG-NEXT: ds_read_u16 v7, v0 offset:10 -; ALIGNED-SDAG-NEXT: ds_read_u16 v8, v0 offset:12 +; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v8, s1 ; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:14 -; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v3 offset:2 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v2 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v5 offset:6 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v4 offset:4 -; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v7 offset:10 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v6 offset:8 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v1 offset:12 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v2 ; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v0 offset:14 -; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v8 offset:12 +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v4 offset:4 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v6 offset:8 +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v3 offset:2 +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v5 offset:6 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v7 offset:10 +; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) +; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v0 offset:14 ; ALIGNED-SDAG-NEXT: s_endpgm ; ; ALIGNED-GISEL-LABEL: ds16align2: diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -657,20 +657,20 @@ ; CI-NEXT: ds_write_b8 v0, v1 offset:5 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; CI-NEXT: ds_write_b8 v0, v2 offset:13 ; CI-NEXT: ds_write_b8 v0, v1 offset:9 +; CI-NEXT: ds_write_b8 v0, v2 offset:13 ; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 ; CI-NEXT: ds_write_b8 v0, v3 offset:8 ; CI-NEXT: ds_write_b8 v0, v4 offset:7 ; CI-NEXT: ds_write_b8 v0, v5 offset:6 -; CI-NEXT: ds_write_b8 v0, v1 offset:16 -; CI-NEXT: ds_write_b8 v0, v6 offset:15 -; CI-NEXT: ds_write_b8 v0, v2 offset:14 ; CI-NEXT: ds_write_b8 v0, v3 offset:12 ; CI-NEXT: ds_write_b8 v0, v4 offset:11 ; CI-NEXT: ds_write_b8 v0, v5 offset:10 +; CI-NEXT: ds_write_b8 v0, v1 offset:16 +; CI-NEXT: ds_write_b8 v0, v6 offset:15 +; CI-NEXT: ds_write_b8 v0, v2 offset:14 ; CI-NEXT: s_endpgm ; ; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: @@ -686,18 +686,18 @@ ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:5 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v1 offset:15 -; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:13 ; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:11 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:9 +; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v1 offset:15 +; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:13 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v0, 24, v1 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX9-ALIGNED-NEXT: 
ds_write_b8 v2, v3 offset:8 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:6 -; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:16 -; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:14 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v3 offset:12 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v4 offset:10 +; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:16 +; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:14 ; GFX9-ALIGNED-NEXT: s_endpgm ; ; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64: diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -1851,14 +1851,14 @@ ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_and_b32_e32 v3, 0xff00, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v6, 0xff00, v0 -; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 +; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_alignbit_b32 v3, s10, v3, 16 @@ -1866,8 +1866,8 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v5, v8, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v7, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v5, v8, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2818,50 +2818,50 @@ ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_i32 v8, v2, 0, 4 -; GFX7-NEXT: v_ashrrev_i32_e32 v3, 28, v2 +; GFX7-NEXT: v_bfe_i32 v7, v2, 0, 4 +; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v15, v0, 0, 4 -; GFX7-NEXT: v_bfe_i32 v4, v2, 24, 4 -; GFX7-NEXT: v_bfe_i32 v5, v2, 20, 4 -; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4 -; GFX7-NEXT: v_bfe_i32 v7, v2, 8, 4 +; GFX7-NEXT: v_bfe_i32 v14, v0, 0, 4 +; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4 +; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4 +; GFX7-NEXT: v_bfe_i32 v6, v2, 8, 4 +; GFX7-NEXT: v_ashrrev_i32_e32 v8, 28, v2 ; GFX7-NEXT: v_bfe_i32 v9, v2, 12, 4 ; GFX7-NEXT: v_bfe_i32 v2, v2, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX7-NEXT: v_ashrrev_i32_e32 v10, 28, v0 -; GFX7-NEXT: v_bfe_i32 v11, v0, 24, 4 -; GFX7-NEXT: v_bfe_i32 v12, v0, 20, 4 -; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4 -; GFX7-NEXT: v_bfe_i32 v14, v0, 8, 4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4 +; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4 +; GFX7-NEXT: v_bfe_i32 v12, v0, 16, 4 +; GFX7-NEXT: v_bfe_i32 v13, v0, 8, 4 +; GFX7-NEXT: v_ashrrev_i32_e32 v15, 28, v0 ; GFX7-NEXT: v_bfe_i32 v16, v0, 12, 4 ; GFX7-NEXT: v_bfe_i32 v0, v0, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 -; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX7-NEXT: 
v_mad_u32_u24 v1, v7, v14, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 24, v16 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: v_alignbit_b32 v9, 0, v9, 24 ; GFX7-NEXT: v_alignbit_b32 v16, 0, v16, 24 -; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 ; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 ; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -2445,35 +2445,35 @@ ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_and_b32_e32 v8, 15, v2 -; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4 +; GFX7-NEXT: v_bfe_u32 v7, v2, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v15, 15, v0 -; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4 -; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4 -; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 4 -; GFX7-NEXT: v_bfe_u32 v7, v2, 4, 4 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 12, v2 -; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4 -; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4 -; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4 -; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 12, v2 ; GFX7-NEXT: v_bfe_u32 v14, v0, 4, 4 -; GFX7-NEXT: v_lshrrev_b32_e32 v16, 28, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 12, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 12, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xf000000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xf000000, v0 +; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 4 +; GFX7-NEXT: v_and_b32_e32 v9, 0xf000000, v9 +; GFX7-NEXT: v_and_b32_e32 v16, 0xf000000, v16 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 -; GFX7-NEXT: v_alignbit_b32 v2, s10, v2, 24 -; GFX7-NEXT: v_alignbit_b32 v0, 0, v0, 24 +; GFX7-NEXT: v_alignbit_b32 v9, s10, v9, 24 +; GFX7-NEXT: v_alignbit_b32 v8, 0, v16, 24 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 +; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4 +; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v9, v8, v1 +; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4 +; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 +; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4 +; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4 +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 28, v0 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, 
v10, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll --- a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll @@ -30,22 +30,22 @@ define amdgpu_vs void @test_2(<4 x i32> inreg %arg1, i32 %arg2, i32 inreg %arg3, <8 x float> addrspace(3)* %arg4) { ; CHECK-LABEL: test_2: ; CHECK: ; %bb.0: -; CHECK-NEXT: v_add_i32_e32 v5, vcc, 28, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 24, v1 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, 20, v1 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, 16, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 16, v1 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 28, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 24, v1 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, 12, v1 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, 8, v1 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, 4, v1 ; CHECK-NEXT: s_mov_b32 m0, -1 -; CHECK-NEXT: ds_read_b32 v4, v2 -; CHECK-NEXT: ds_read_b32 v3, v3 -; CHECK-NEXT: ds_read_b32 v2, v6 +; CHECK-NEXT: ds_read_b32 v2, v2 +; CHECK-NEXT: ds_read_b32 v5, v4 +; CHECK-NEXT: ds_read_b32 v4, v6 ; CHECK-NEXT: ds_read_b32 v9, v7 ; CHECK-NEXT: ds_read_b32 v8, v8 ; CHECK-NEXT: ds_read_b32 v7, v10 ; CHECK-NEXT: ds_read_b32 v6, v1 -; CHECK-NEXT: ds_read_b32 v5, v5 +; CHECK-NEXT: ds_read_b32 v3, v3 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) ; CHECK-NEXT: tbuffer_store_format_xyzw v[6:9], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen glc slc ; CHECK-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -83,42 +83,42 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 -; GFX9-NEXT: ds_write_b8 v0, v2 offset:12 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:14 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: ds_write_b8 v0, v1 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:12 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:14 +; GFX9-NEXT: ds_write_b8 v0, v2 offset:8 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:10 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_lshr_b32 s0, s6, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:4 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s6, 24 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_lshr_b32 s0, s7, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 +; GFX9-NEXT: ds_write_b8 v0, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s7, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:13 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 8 +; GFX9-NEXT: s_lshr_b32 s0, s6, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:15 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 24 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX9-NEXT: s_lshr_b32 s0, s6, 24 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 ; 
GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s5, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s5, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 24 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align1: @@ -128,50 +128,50 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s3 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX7-NEXT: ds_write_b8 v0, v2 offset:12 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_write_b8 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:12 +; GFX7-NEXT: ds_write_b8 v0, v2 offset:8 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_lshr_b32 s4, s2, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s4, s3, 8 +; GFX7-NEXT: ds_write_b8 v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_lshr_b32 s4, s2, 24 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX7-NEXT: s_lshr_b32 s4, s3, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:13 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_lshr_b32 s3, s3, 16 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:15 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshr_b32 s3, s2, 8 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:14 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshr_b32 s3, s2, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_lshr_b32 s2, s2, 16 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s3, 8 +; GFX7-NEXT: s_lshr_b32 s2, s1, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:10 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s3, 24 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:13 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s3, 16 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:15 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s0, 8 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:14 +; GFX7-NEXT: s_lshr_b32 s2, s1, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s0, 24 +; GFX7-NEXT: s_lshr_b32 s1, s1, 16 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshr_b32 s1, s0, 8 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:6 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshr_b32 s1, s0, 24 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:1 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:2 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 24 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 16 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:6 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v4i32_align1: @@ -181,50 +181,50 @@ 
; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX6-NEXT: ds_write_b8 v0, v2 offset:12 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_write_b8 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:12 +; GFX6-NEXT: ds_write_b8 v0, v2 offset:8 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_lshr_b32 s4, s2, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:4 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s4, s3, 8 +; GFX6-NEXT: ds_write_b8 v0, v1 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: s_lshr_b32 s4, s2, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX6-NEXT: s_lshr_b32 s4, s3, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:13 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_lshr_b32 s3, s3, 16 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:15 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_lshr_b32 s3, s2, 8 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:14 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_lshr_b32 s3, s2, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s3, 8 +; GFX6-NEXT: s_lshr_b32 s2, s1, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:10 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s3, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:13 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s3, 16 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:15 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s0, 8 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:14 +; GFX6-NEXT: s_lshr_b32 s2, s1, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s0, 24 +; GFX6-NEXT: s_lshr_b32 s1, s1, 16 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s1, s0, 8 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:6 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s1, s0, 24 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:1 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:2 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 16 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:6 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v4i32_align1: @@ -234,42 +234,42 @@ ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: s_lshr_b32 s3, s7, 24 -; GFX10-NEXT: v_mov_b32_e32 v2, s7 -; GFX10-NEXT: s_lshr_b32 s0, s6, 8 -; GFX10-NEXT: s_lshr_b32 s1, s6, 24 -; GFX10-NEXT: s_lshr_b32 s6, s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_lshr_b32 s2, s7, 8 -; GFX10-NEXT: s_lshr_b32 s4, s4, 24 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: s_lshr_b32 s3, s6, 24 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: s_lshr_b32 s0, s7, 8 +; GFX10-NEXT: s_lshr_b32 s2, s6, 8 +; GFX10-NEXT: 
s_lshr_b32 s6, s5, 8 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: s_lshr_b32 s1, s7, 24 +; GFX10-NEXT: s_lshr_b32 s5, s5, 24 ; GFX10-NEXT: v_mov_b32_e32 v8, s3 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: v_mov_b32_e32 v9, s6 -; GFX10-NEXT: s_lshr_b32 s0, s5, 8 -; GFX10-NEXT: v_mov_b32_e32 v4, s5 +; GFX10-NEXT: s_lshr_b32 s0, s4, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: v_mov_b32_e32 v6, s1 ; GFX10-NEXT: v_mov_b32_e32 v7, s2 -; GFX10-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 -; GFX10-NEXT: ds_write_b8 v0, v2 offset:12 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:14 -; GFX10-NEXT: ds_write_b8 v0, v3 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:2 -; GFX10-NEXT: ds_write_b8 v0, v4 offset:4 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v4 offset:6 -; GFX10-NEXT: ds_write_b8 v0, v5 offset:9 -; GFX10-NEXT: ds_write_b8 v0, v6 offset:11 -; GFX10-NEXT: ds_write_b8 v0, v7 offset:13 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: s_lshr_b32 s1, s5, 24 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:12 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:14 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:8 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:10 +; GFX10-NEXT: ds_write_b8 v0, v3 offset:4 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:6 +; GFX10-NEXT: ds_write_b8 v0, v4 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v4 offset:2 +; GFX10-NEXT: ds_write_b8 v0, v5 offset:13 +; GFX10-NEXT: ds_write_b8 v0, v6 offset:15 +; GFX10-NEXT: ds_write_b8 v0, v7 offset:9 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_lshr_b32 s1, s4, 24 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: ds_write_b8 v0, v8 offset:15 -; GFX10-NEXT: ds_write_b8 v0, v9 offset:1 -; GFX10-NEXT: ds_write_b8 v0, v1 offset:3 -; GFX10-NEXT: ds_write_b8 v0, v2 offset:5 -; GFX10-NEXT: ds_write_b8 v0, v3 offset:7 +; GFX10-NEXT: ds_write_b8 v0, v8 offset:11 +; GFX10-NEXT: ds_write_b8 v0, v9 offset:5 +; GFX10-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:1 +; GFX10-NEXT: ds_write_b8 v0, v3 offset:3 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32_align1: @@ -278,38 +278,37 @@ ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: s_lshr_b32 s4, s2, 8 -; GFX11-NEXT: s_lshr_b32 s5, s3, 8 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_lshr_b32 s4, s3, 8 ; GFX11-NEXT: s_lshr_b32 s3, s3, 24 -; GFX11-NEXT: s_lshr_b32 s6, s0, 8 -; GFX11-NEXT: s_lshr_b32 s0, s0, 24 -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s4 +; GFX11-NEXT: s_lshr_b32 s5, s2, 8 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: s_lshr_b32 s2, s2, 24 -; GFX11-NEXT: s_lshr_b32 s7, s1, 8 -; GFX11-NEXT: v_dual_mov_b32 v8, s3 :: v_dual_mov_b32 v9, s6 -; GFX11-NEXT: v_mov_b32_e32 v10, s0 -; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s5 -; GFX11-NEXT: ds_store_b8 v0, v1 offset:8 -; GFX11-NEXT: ds_store_b8 v0, v3 -; GFX11-NEXT: ds_store_b8_d16_hi v0, v1 offset:10 -; GFX11-NEXT: ds_store_b8 v0, v5 offset:9 -; GFX11-NEXT: ds_store_b8 v0, v2 offset:12 -; GFX11-NEXT: ds_store_b8 v0, v6 offset:11 -; GFX11-NEXT: ds_store_b8 v0, v7 offset:13 -; GFX11-NEXT: ds_store_b8_d16_hi v0, v2 offset:14 -; GFX11-NEXT: ds_store_b8 v0, v8 offset:15 -; GFX11-NEXT: v_mov_b32_e32 v1, s7 -; GFX11-NEXT: 
s_lshr_b32 s0, s1, 24 -; GFX11-NEXT: ds_store_b8_d16_hi v0, v3 offset:2 -; GFX11-NEXT: ds_store_b8 v0, v9 offset:1 -; GFX11-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-NEXT: ds_store_b8 v0, v4 offset:4 -; GFX11-NEXT: ds_store_b8 v0, v10 offset:3 -; GFX11-NEXT: ds_store_b8 v0, v1 offset:5 -; GFX11-NEXT: ds_store_b8_d16_hi v0, v4 offset:6 -; GFX11-NEXT: ds_store_b8 v0, v2 offset:7 +; GFX11-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s5 +; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s6 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s4 +; GFX11-NEXT: s_lshr_b32 s1, s1, 24 +; GFX11-NEXT: s_lshr_b32 s7, s0, 8 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: ds_store_b8 v0, v2 offset:8 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v2 offset:10 +; GFX11-NEXT: ds_store_b8 v0, v1 offset:12 +; GFX11-NEXT: ds_store_b8 v0, v4 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v4 offset:2 +; GFX11-NEXT: ds_store_b8 v0, v3 offset:4 +; GFX11-NEXT: ds_store_b8 v0, v5 offset:13 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v1 offset:14 +; GFX11-NEXT: ds_store_b8 v0, v6 offset:15 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v10, s7 +; GFX11-NEXT: v_mov_b32_e32 v11, s0 +; GFX11-NEXT: ds_store_b8 v0, v7 offset:9 +; GFX11-NEXT: ds_store_b8 v0, v8 offset:11 +; GFX11-NEXT: ds_store_b8 v0, v9 offset:5 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v3 offset:6 +; GFX11-NEXT: ds_store_b8 v0, v1 offset:7 +; GFX11-NEXT: ds_store_b8 v0, v10 offset:1 +; GFX11-NEXT: ds_store_b8 v0, v11 offset:3 ; GFX11-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1 ret void @@ -322,18 +321,18 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 -; GFX9-NEXT: ds_write_b16 v0, v2 offset:12 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:14 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: ds_write_b16 v0, v1 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:12 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:14 +; GFX9-NEXT: ds_write_b16 v0, v2 offset:8 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:10 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align2: @@ -343,26 +342,26 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s3 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX7-NEXT: ds_write_b16 v0, v2 offset:12 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:12 +; GFX7-NEXT: ds_write_b16 v0, v2 offset:8 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s3, s3, 16 ; GFX7-NEXT: ds_write_b16 v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_lshr_b32 s2, s2, 16 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:14 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: 
s_lshr_b32 s2, s3, 16 +; GFX7-NEXT: s_lshr_b32 s1, s1, 16 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:10 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:14 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 16 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:2 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v4i32_align2: @@ -372,26 +371,26 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX6-NEXT: ds_write_b16 v0, v2 offset:12 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:12 +; GFX6-NEXT: ds_write_b16 v0, v2 offset:8 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s3, s3, 16 ; GFX6-NEXT: ds_write_b16 v0, v1 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:14 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s3, 16 +; GFX6-NEXT: s_lshr_b32 s1, s1, 16 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:10 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:14 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 16 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:2 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v4i32_align2: @@ -401,18 +400,18 @@ ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: v_mov_b32_e32 v2, s7 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: v_mov_b32_e32 v4, s5 -; GFX10-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 -; GFX10-NEXT: ds_write_b16 v0, v2 offset:12 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:14 -; GFX10-NEXT: ds_write_b16 v0, v3 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:2 -; GFX10-NEXT: ds_write_b16 v0, v4 offset:4 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v4 offset:6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: ds_write_b16 v0, v1 offset:12 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:14 +; GFX10-NEXT: ds_write_b16 v0, v2 offset:8 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:10 +; GFX10-NEXT: ds_write_b16 v0, v3 offset:4 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:6 +; GFX10-NEXT: ds_write_b16 v0, v4 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v4 offset:2 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v4i32_align2: @@ -421,17 +420,17 @@ ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_mov_b32_e32 v4, s1 -; GFX11-NEXT: ds_store_b16_d16_hi v0, v1 offset:10 -; 
GFX11-NEXT: ds_store_b16 v0, v2 offset:12 -; GFX11-NEXT: ds_store_b16_d16_hi v0, v2 offset:14 -; GFX11-NEXT: ds_store_b16 v0, v3 -; GFX11-NEXT: ds_store_b16_d16_hi v0, v3 offset:2 -; GFX11-NEXT: ds_store_b16 v0, v4 offset:4 -; GFX11-NEXT: ds_store_b16 v0, v1 offset:8 -; GFX11-NEXT: ds_store_b16_d16_hi v0, v4 offset:6 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_mov_b32_e32 v4, s2 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v1 offset:14 +; GFX11-NEXT: ds_store_b16 v0, v2 +; GFX11-NEXT: ds_store_b16 v0, v3 offset:4 +; GFX11-NEXT: ds_store_b16 v0, v4 offset:8 +; GFX11-NEXT: ds_store_b16 v0, v1 offset:12 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v4 offset:10 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v3 offset:6 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v2 offset:2 ; GFX11-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2 ret void diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -80,32 +80,32 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 -; GFX9-NEXT: ds_write_b8 v0, v2 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_lshr_b32 s0, s6, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:4 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 +; GFX9-NEXT: ds_write_b8 v0, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s6, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 24 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s5, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s5, 24 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 24 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align1: @@ -116,12 +116,12 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX7-NEXT: ds_write_b8 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_lshr_b32 s3, s2, 8 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:4 +; GFX7-NEXT: ds_write_b8 v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_lshr_b32 s3, s2, 24 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:9 @@ -129,25 +129,25 @@ ; GFX7-NEXT: s_lshr_b32 s2, s2, 16 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX7-NEXT: 
v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s0, 8 +; GFX7-NEXT: s_lshr_b32 s2, s1, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:10 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s0, 24 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX7-NEXT: s_lshr_b32 s2, s1, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_lshr_b32 s1, s1, 16 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshr_b32 s1, s0, 8 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:6 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshr_b32 s1, s0, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:2 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 24 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 16 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:6 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v3i32_align1: @@ -158,12 +158,12 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX6-NEXT: ds_write_b8 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_lshr_b32 s3, s2, 8 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:4 +; GFX6-NEXT: ds_write_b8 v0, v1 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_lshr_b32 s3, s2, 24 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:9 @@ -171,25 +171,25 @@ ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s0, 8 +; GFX6-NEXT: s_lshr_b32 s2, s1, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:10 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s0, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX6-NEXT: s_lshr_b32 s2, s1, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_lshr_b32 s1, s1, 16 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s1, s0, 8 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:6 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s1, s0, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:2 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 16 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:6 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v3i32_align1: @@ -200,32 +200,32 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_lshr_b32 s0, s6, 8 ; GFX10-NEXT: s_lshr_b32 s1, s6, 24 -; 
GFX10-NEXT: s_lshr_b32 s2, s4, 8 -; GFX10-NEXT: s_lshr_b32 s3, s4, 24 -; GFX10-NEXT: s_lshr_b32 s4, s5, 8 -; GFX10-NEXT: s_lshr_b32 s5, s5, 24 +; GFX10-NEXT: s_lshr_b32 s2, s5, 8 +; GFX10-NEXT: s_lshr_b32 s3, s5, 24 +; GFX10-NEXT: s_lshr_b32 s5, s4, 8 +; GFX10-NEXT: s_lshr_b32 s4, s4, 24 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-NEXT: v_mov_b32_e32 v7, s3 -; GFX10-NEXT: v_mov_b32_e32 v8, s4 -; GFX10-NEXT: v_mov_b32_e32 v9, s5 +; GFX10-NEXT: v_mov_b32_e32 v8, s5 +; GFX10-NEXT: v_mov_b32_e32 v9, s4 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:8 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 -; GFX10-NEXT: ds_write_b8 v0, v2 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 -; GFX10-NEXT: ds_write_b8 v0, v3 offset:4 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:6 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 +; GFX10-NEXT: ds_write_b8 v0, v3 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:2 ; GFX10-NEXT: ds_write_b8 v0, v4 offset:9 ; GFX10-NEXT: ds_write_b8 v0, v5 offset:11 -; GFX10-NEXT: ds_write_b8 v0, v6 offset:1 -; GFX10-NEXT: ds_write_b8 v0, v7 offset:3 -; GFX10-NEXT: ds_write_b8 v0, v8 offset:5 -; GFX10-NEXT: ds_write_b8 v0, v9 offset:7 +; GFX10-NEXT: ds_write_b8 v0, v6 offset:5 +; GFX10-NEXT: ds_write_b8 v0, v7 offset:7 +; GFX10-NEXT: ds_write_b8 v0, v8 offset:1 +; GFX10-NEXT: ds_write_b8 v0, v9 offset:3 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align1: @@ -235,29 +235,28 @@ ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s0 ; GFX11-NEXT: s_lshr_b32 s3, s2, 8 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: s_lshr_b32 s2, s2, 24 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s2 -; GFX11-NEXT: s_lshr_b32 s4, s0, 8 -; GFX11-NEXT: s_lshr_b32 s0, s0, 24 -; GFX11-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-NEXT: s_lshr_b32 s4, s1, 8 ; GFX11-NEXT: s_lshr_b32 s1, s1, 24 -; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s0 -; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s1 +; GFX11-NEXT: s_lshr_b32 s5, s0, 8 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s2 +; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s1 +; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s0 ; GFX11-NEXT: ds_store_b8 v0, v1 offset:8 -; GFX11-NEXT: ds_store_b8 v0, v2 +; GFX11-NEXT: ds_store_b8 v0, v3 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v3 offset:2 +; GFX11-NEXT: ds_store_b8 v0, v2 offset:4 ; GFX11-NEXT: ds_store_b8 v0, v4 offset:9 ; GFX11-NEXT: ds_store_b8_d16_hi v0, v1 offset:10 ; GFX11-NEXT: ds_store_b8 v0, v5 offset:11 -; GFX11-NEXT: ds_store_b8_d16_hi v0, v2 offset:2 -; GFX11-NEXT: ds_store_b8 v0, v6 offset:1 -; GFX11-NEXT: ds_store_b8 v0, v3 offset:4 -; GFX11-NEXT: ds_store_b8 v0, v7 offset:3 -; GFX11-NEXT: ds_store_b8 v0, v8 offset:5 -; GFX11-NEXT: ds_store_b8_d16_hi v0, v3 offset:6 -; GFX11-NEXT: ds_store_b8 v0, v9 offset:7 +; GFX11-NEXT: ds_store_b8 v0, v6 offset:5 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v2 offset:6 +; GFX11-NEXT: ds_store_b8 v0, v7 offset:7 +; GFX11-NEXT: ds_store_b8 v0, v8 offset:1 +; GFX11-NEXT: ds_store_b8 v0, v9 offset:3 ; GFX11-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1 ret void @@ -271,14 +270,14 @@ ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 -; GFX9-NEXT: ds_write_b16 v0, v2 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:6 +; GFX9-NEXT: ds_write_b16 v0, v2 offset:4 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align2: @@ -289,20 +288,20 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX7-NEXT: ds_write_b16 v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_write_b16 v0, v2 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: s_lshr_b32 s2, s2, 16 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX7-NEXT: ds_write_b16 v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 +; GFX7-NEXT: s_lshr_b32 s1, s1, 16 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:10 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshr_b32 s0, s0, 16 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 16 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:2 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v3i32_align2: @@ -313,20 +312,20 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX6-NEXT: ds_write_b16 v0, v2 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_write_b16 v0, v2 offset:4 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX6-NEXT: ds_write_b16 v0, v1 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s0, s0, 16 +; GFX6-NEXT: s_lshr_b32 s1, s1, 16 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:10 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s0, s0, 16 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 16 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:2 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v3i32_align2: @@ -337,14 +336,14 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 -; GFX10-NEXT: ds_write_b16 v0, v2 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:2 -; GFX10-NEXT: ds_write_b16 v0, v3 offset:4 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:6 +; GFX10-NEXT: ds_write_b16 v0, v2 offset:4 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:6 +; GFX10-NEXT: ds_write_b16 v0, v3 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:2 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align2: @@ 
-357,10 +356,10 @@ ; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: ds_store_b16_d16_hi v0, v1 offset:10 ; GFX11-NEXT: ds_store_b16 v0, v2 -; GFX11-NEXT: ds_store_b16_d16_hi v0, v2 offset:2 ; GFX11-NEXT: ds_store_b16 v0, v3 offset:4 ; GFX11-NEXT: ds_store_b16 v0, v1 offset:8 ; GFX11-NEXT: ds_store_b16_d16_hi v0, v3 offset:6 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v2 offset:2 ; GFX11-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2 ret void diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -234,8 +234,9 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s1, s0, 12 +; VI-NEXT: s_and_b32 s1, s0, 0xffff ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_i32 s1, s1, 12 ; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; VI-NEXT: s_or_b32 s0, s1, 4 ; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD diff --git a/llvm/test/CodeGen/ARM/addsubcarry-promotion.ll b/llvm/test/CodeGen/ARM/addsubcarry-promotion.ll --- a/llvm/test/CodeGen/ARM/addsubcarry-promotion.ll +++ b/llvm/test/CodeGen/ARM/addsubcarry-promotion.ll @@ -14,9 +14,8 @@ ; ARM-NEXT: adds r0, r1, r0 ; ARM-NEXT: movw r1, #65535 ; ARM-NEXT: sxth r2, r2 -; ARM-NEXT: adc r0, r2, #0 -; ARM-NEXT: uxth r0, r0 -; ARM-NEXT: cmp r0, r1 +; ARM-NEXT: adc r0, r2, #1 +; ARM-NEXT: tst r0, r1 ; ARM-NEXT: bxeq lr ; ARM-NEXT: .LBB0_1: @ %for.cond ; ARM-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -26,33 +25,25 @@ ; THUMBV6M: @ %bb.0: @ %entry ; THUMBV6M-NEXT: rsbs r2, r2, #0 ; THUMBV6M-NEXT: sxth r2, r2 -; THUMBV6M-NEXT: movs r3, #0 +; THUMBV6M-NEXT: movs r3, #1 ; THUMBV6M-NEXT: adds r0, r1, r0 ; THUMBV6M-NEXT: adcs r3, r2 -; THUMBV6M-NEXT: uxth r0, r3 -; THUMBV6M-NEXT: ldr r1, .LCPI0_0 -; THUMBV6M-NEXT: cmp r0, r1 +; THUMBV6M-NEXT: lsls r0, r3, #16 ; THUMBV6M-NEXT: beq .LBB0_2 ; THUMBV6M-NEXT: .LBB0_1: @ %for.cond ; THUMBV6M-NEXT: @ =>This Inner Loop Header: Depth=1 ; THUMBV6M-NEXT: b .LBB0_1 ; THUMBV6M-NEXT: .LBB0_2: @ %if.end ; THUMBV6M-NEXT: bx lr -; THUMBV6M-NEXT: .p2align 2 -; THUMBV6M-NEXT: @ %bb.3: -; THUMBV6M-NEXT: .LCPI0_0: -; THUMBV6M-NEXT: .long 65535 @ 0xffff ; ; THUMBV8M-BASE-LABEL: fn1: ; THUMBV8M-BASE: @ %bb.0: @ %entry ; THUMBV8M-BASE-NEXT: rsbs r2, r2, #0 ; THUMBV8M-BASE-NEXT: sxth r2, r2 -; THUMBV8M-BASE-NEXT: movs r3, #0 +; THUMBV8M-BASE-NEXT: movs r3, #1 ; THUMBV8M-BASE-NEXT: adds r0, r1, r0 ; THUMBV8M-BASE-NEXT: adcs r3, r2 -; THUMBV8M-BASE-NEXT: uxth r0, r3 -; THUMBV8M-BASE-NEXT: movw r1, #65535 -; THUMBV8M-BASE-NEXT: cmp r0, r1 +; THUMBV8M-BASE-NEXT: lsls r0, r3, #16 ; THUMBV8M-BASE-NEXT: beq .LBB0_2 ; THUMBV8M-BASE-NEXT: .LBB0_1: @ %for.cond ; THUMBV8M-BASE-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -64,11 +55,9 @@ ; THUMB: @ %bb.0: @ %entry ; THUMB-NEXT: rsbs r2, r2, #0 ; THUMB-NEXT: adds r0, r0, r1 -; THUMB-NEXT: movw r1, #65535 ; THUMB-NEXT: sxth r2, r2 -; THUMB-NEXT: adc r0, r2, #0 -; THUMB-NEXT: uxth r0, r0 -; THUMB-NEXT: cmp r0, r1 +; THUMB-NEXT: adc r0, r2, #1 +; THUMB-NEXT: lsls r0, r0, #16 ; THUMB-NEXT: it eq ; THUMB-NEXT: bxeq lr ; THUMB-NEXT: .LBB0_1: @ %for.cond diff --git a/llvm/test/CodeGen/ARM/icmp-shift-opt.ll b/llvm/test/CodeGen/ARM/icmp-shift-opt.ll --- a/llvm/test/CodeGen/ARM/icmp-shift-opt.ll +++ 
b/llvm/test/CodeGen/ARM/icmp-shift-opt.ll @@ -136,12 +136,11 @@ define i1 @opt_setcc_shl_ne_zero_i128(i128 %a) nounwind { ; CHECK-LABEL: opt_setcc_shl_ne_zero_i128: ; CHECK: @ %bb.0: -; CHECK-NEXT: orr r3, r1, r3 ; CHECK-NEXT: orr r0, r2, r0 -; CHECK-NEXT: orr r2, r0, r3 -; CHECK-NEXT: orr r0, r0, r1 -; CHECK-NEXT: lsr r0, r0, #15 -; CHECK-NEXT: orrs r0, r0, r2, lsl #17 +; CHECK-NEXT: orr r0, r1, r0 +; CHECK-NEXT: orr r1, r0, r3 +; CHECK-NEXT: lsl r1, r1, #17 +; CHECK-NEXT: orrs r0, r1, r0, lsr #15 ; CHECK-NEXT: movwne r0, #1 ; CHECK-NEXT: bx lr %shl = shl i128 %a, 17 diff --git a/llvm/test/CodeGen/ARM/reg_sequence.ll b/llvm/test/CodeGen/ARM/reg_sequence.ll --- a/llvm/test/CodeGen/ARM/reg_sequence.ll +++ b/llvm/test/CodeGen/ARM/reg_sequence.ll @@ -271,9 +271,8 @@ ; PR7162 define arm_aapcs_vfpcc i32 @t10(float %x) nounwind { ; CHECK-LABEL: t10: -; CHECK: vdup.32 [[Q0:q[0-9]+]], d0[0] ; CHECK: vmov.i32 [[Q9:q[0-9]+]], #0x3f000000 -; CHECK: vmul.f32 [[Q8:q[0-9]+]], [[Q0]], [[Q0]] +; CHECK: vmul.f32 [[Q8:q[0-9]+]], [[Q0:q[0-9]+]], [[Q0]] ; CHECK-NEXT: vadd.f32 [[Q8]], [[Q8]], [[Q8]] ; CHECK-NEXT: vadd.f32 [[Q1:q[0-9]+]], [[Q8]], [[Q8]] ; CHECK-NEXT: vmul.f32 [[Q8]], [[Q9]], d1[0] diff --git a/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll b/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll --- a/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll +++ b/llvm/test/CodeGen/PowerPC/aix32-cc-abi-vaarg.ll @@ -259,36 +259,45 @@ ; 32BIT-LABEL: body: | ; 32BIT-DAG: liveins: $f1, $r5, $r6, $r7, $r8, $r9, $r10 ; 32BIT-DAG: renamable $r3 = ADDI %fixed-stack.0, 0 +; 32BIT-DAG: STW killed renamable $r7, 8, %fixed-stack.0 :: (store (s32), align 8) ; 32BIT-DAG: STW renamable $r5, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 16) ; 32BIT-DAG: STW renamable $r6, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4) -; 32BIT-DAG: STW killed renamable $r7, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8, align 8) ; 32BIT-DAG: STW killed renamable $r8, 12, %fixed-stack.0 :: (store (s32)) ; 32BIT-DAG: STW killed renamable $r9, 16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16, align 16) ; 32BIT-DAG: STW killed renamable $r10, 20, %fixed-stack.0 :: (store (s32)) ; 32BIT-DAG: STW renamable $r3, 0, %stack.0.arg1 :: (store (s32) into %ir.0) ; 32BIT-DAG: STW killed renamable $r3, 0, %stack.1.arg2 :: (store (s32) into %ir.1) +; 32BIT-DAG: STW renamable $r5, 0, %stack.2 :: (store (s32) into %stack.2, align 8) +; 32BIT-DAG: STW renamable $r6, 4, %stack.2 :: (store (s32) into %stack.2 + 4) +; 32BIT-DAG: renamable $f0 = LFD 0, %stack.2 :: (load (s64) from %stack.2) +; 32BIT-DAG: STW killed renamable $r5, 0, %stack.3 :: (store (s32) into %stack.3, align 8) +; 32BIT-DAG: STW killed renamable $r6, 4, %stack.3 :: (store (s32) into %stack.3 + 4) +; 32BIT-DAG: renamable $f2 = LFD 0, %stack.3 :: (load (s64) from %stack.3) +; 32BIT-DAG: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm +; 32BIT-DAG: renamable $f1 = nofpexcept FADD killed renamable $f2, renamable $f2, implicit $rm +; 32BIT-DAG: renamable $f1 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm ; 32BIT-DAG: BLR implicit $lr, implicit $rm, implicit $f1 define double @double_stack_va_arg(double %one, double %two, double %three, double %four, double %five, double %six, double %seven, double %eight, double %nine, double %ten, double %eleven, double %twelve, double %thirteen, ...) 
local_unnamed_addr { ; ASM32-LABEL: double_stack_va_arg: ; ASM32: # %bb.0: # %entry ; ASM32-NEXT: fadd 0, 1, 2 -; ASM32-NEXT: addi 4, 1, 128 -; ASM32-NEXT: lwz 3, 132(1) +; ASM32-NEXT: addi 3, 1, 128 +; ASM32-NEXT: lwz 4, 132(1) ; ASM32-NEXT: fadd 0, 0, 3 -; ASM32-NEXT: stw 4, -4(1) +; ASM32-NEXT: stw 3, -4(1) ; ASM32-NEXT: fadd 0, 0, 4 -; ASM32-NEXT: lwz 4, 128(1) +; ASM32-NEXT: lwz 3, 128(1) ; ASM32-NEXT: fadd 0, 0, 5 -; ASM32-NEXT: stw 3, -12(1) +; ASM32-NEXT: stw 3, -16(1) ; ASM32-NEXT: fadd 0, 0, 6 -; ASM32-NEXT: stw 4, -16(1) +; ASM32-NEXT: stw 4, -12(1) ; ASM32-NEXT: fadd 0, 0, 7 ; ASM32-NEXT: lfd 1, -16(1) ; ASM32-NEXT: fadd 0, 0, 8 -; ASM32-NEXT: stw 3, -20(1) +; ASM32-NEXT: stw 3, -24(1) ; ASM32-NEXT: fadd 0, 0, 9 -; ASM32-NEXT: stw 4, -24(1) +; ASM32-NEXT: stw 4, -20(1) ; ASM32-NEXT: fadd 0, 0, 10 ; ASM32-NEXT: fadd 0, 0, 11 ; ASM32-NEXT: fadd 0, 0, 12 @@ -360,29 +369,32 @@ ; 32BIT-LABEL: body: | ; 32BIT-DAG: liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13 -; 32BIT-DAG: renamable $r4 = ADDI %fixed-stack.0, 0 -; 32BIT-DAG: STW killed renamable $r4, 0, %stack.0.arg1 :: (store (s32) into %ir.0) -; 32BIT-DAG: renamable $r4 = LWZ 0, %fixed-stack.0 :: (load (s32) from %ir.argp.cur142, align 16) -; 32BIT-DAG: renamable $f1 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm +; 32BIT-DAG: renamable $r3 = ADDI %fixed-stack.0, 0 +; 32BIT-DAG: STW killed renamable $r3, 0, %stack.0.arg1 :: (store (s32) into %ir.0) +; 32BIT-DAG: renamable $r3 = LWZ 0, %fixed-stack.0 :: (load (s32) from %ir.argp.cur142, align 16) ; 32BIT-DAG: renamable $f0 = nofpexcept FADD killed renamable $f1, killed renamable $f2, implicit $rm ; 32BIT-DAG: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f3, implicit $rm +; 32BIT-DAG: STW renamable $r3, 0, %stack.2 :: (store (s32) into %stack.2, align 8) ; 32BIT-DAG: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f4, implicit $rm +; 32BIT-DAG: renamable $r4 = LWZ 4, %fixed-stack.0 :: (load (s32) from %ir.argp.cur142 + 4) ; 32BIT-DAG: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f5, implicit $rm ; 32BIT-DAG: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f6, implicit $rm ; 32BIT-DAG: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f7, implicit $rm +; 32BIT-DAG: STW renamable $r4, 4, %stack.2 :: (store (s32) into %stack.2 + 4) ; 32BIT-DAG: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f8, implicit $rm +; 32BIT-DAG: renamable $f1 = LFD 0, %stack.2 :: (load (s64) from %stack.2) ; 32BIT-DAG: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f9, implicit $rm +; 32BIT-DAG: STW killed renamable $r3, 0, %stack.3 :: (store (s32) into %stack.3, align 8) ; 32BIT-DAG: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f10, implicit $rm +; 32BIT-DAG: STW killed renamable $r4, 4, %stack.3 :: (store (s32) into %stack.3 + 4) ; 32BIT-DAG: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f11, implicit $rm +; 32BIT-DAG: renamable $f2 = LFD 0, %stack.3 :: (load (s64) from %stack.3) ; 32BIT-DAG: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f12, implicit $rm ; 32BIT-DAG: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f13, implicit $rm -; 32BIT-DAG: renamable $r3 = LWZ 4, %fixed-stack.0 :: (load (s32) from %ir.argp.cur142 + 4) -; 32BIT-DAG: STW renamable $r3, 4, %stack.2 :: (store 
(s32) into %stack.2 + 4) -; 32BIT-DAG: renamable $f1 = LFD 0, %stack.2 :: (load (s64) from %stack.2) -; 32BIT-DAG: STW killed renamable $r4, 0, %stack.3 :: (store (s32) into %stack.3, align 8) -; 32BIT-DAG: STW killed renamable $r3, 4, %stack.3 :: (store (s32) into %stack.3 + 4) -; 32BIT-DAG: renamable $f2 = LFD 0, %stack.3 :: (load (s64) from %stack.3) ; 32BIT-DAG: renamable $f0 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm -; 32BIT-DAG: STW renamable $r4, 0, %stack.2 :: (store (s32) into %stack.2, align 8) ; 32BIT-DAG: renamable $f1 = nofpexcept FADD killed renamable $f2, renamable $f2, implicit $rm +; 32BIT-DAG: renamable $f1 = nofpexcept FADD killed renamable $f0, killed renamable $f1, implicit $rm ; 32BIT-DAG: BLR implicit $lr, implicit $rm, implicit $f1 + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; 32BIT: {{.*}} diff --git a/llvm/test/CodeGen/PowerPC/combine-fneg.ll b/llvm/test/CodeGen/PowerPC/combine-fneg.ll --- a/llvm/test/CodeGen/PowerPC/combine-fneg.ll +++ b/llvm/test/CodeGen/PowerPC/combine-fneg.ll @@ -13,10 +13,10 @@ ; CHECK-NEXT: xvredp 2, 0 ; CHECK-NEXT: xxswapd 1, 1 ; CHECK-NEXT: xxlor 3, 1, 1 -; CHECK-NEXT: xvnmsubadp 3, 0, 2 -; CHECK-NEXT: xvmaddadp 2, 2, 3 -; CHECK-NEXT: xvnmsubadp 1, 0, 2 -; CHECK-NEXT: xvnmaddadp 2, 2, 1 +; CHECK-NEXT: xvmaddadp 3, 0, 2 +; CHECK-NEXT: xvnmsubadp 2, 2, 3 +; CHECK-NEXT: xvmaddadp 1, 0, 2 +; CHECK-NEXT: xvmsubadp 2, 2, 1 ; CHECK-NEXT: xvmuldp 34, 34, 2 ; CHECK-NEXT: xvmuldp 35, 35, 2 ; CHECK-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/select_const.ll b/llvm/test/CodeGen/PowerPC/select_const.ll --- a/llvm/test/CodeGen/PowerPC/select_const.ll +++ b/llvm/test/CodeGen/PowerPC/select_const.ll @@ -610,24 +610,13 @@ } define i8 @shl_constant_sel_constants(i1 %cond) { -; ISEL-LABEL: shl_constant_sel_constants: -; ISEL: # %bb.0: -; ISEL-NEXT: andi. 3, 3, 1 -; ISEL-NEXT: li 4, 4 -; ISEL-NEXT: li 3, 8 -; ISEL-NEXT: iselgt 3, 4, 3 -; ISEL-NEXT: blr -; -; NO_ISEL-LABEL: shl_constant_sel_constants: -; NO_ISEL: # %bb.0: -; NO_ISEL-NEXT: andi. 3, 3, 1 -; NO_ISEL-NEXT: li 4, 4 -; NO_ISEL-NEXT: li 3, 8 -; NO_ISEL-NEXT: bc 12, 1, .LBB37_1 -; NO_ISEL-NEXT: blr -; NO_ISEL-NEXT: .LBB37_1: -; NO_ISEL-NEXT: addi 3, 4, 0 -; NO_ISEL-NEXT: blr +; ALL-LABEL: shl_constant_sel_constants: +; ALL: # %bb.0: +; ALL-NEXT: clrlwi 3, 3, 31 +; ALL-NEXT: li 4, 1 +; ALL-NEXT: xori 3, 3, 3 +; ALL-NEXT: slw 3, 4, 3 +; ALL-NEXT: blr %sel = select i1 %cond, i8 2, i8 3 %bo = shl i8 1, %sel ret i8 %bo @@ -658,24 +647,13 @@ } define i8 @lshr_constant_sel_constants(i1 %cond) { -; ISEL-LABEL: lshr_constant_sel_constants: -; ISEL: # %bb.0: -; ISEL-NEXT: andi. 3, 3, 1 -; ISEL-NEXT: li 4, 16 -; ISEL-NEXT: li 3, 8 -; ISEL-NEXT: iselgt 3, 4, 3 -; ISEL-NEXT: blr -; -; NO_ISEL-LABEL: lshr_constant_sel_constants: -; NO_ISEL: # %bb.0: -; NO_ISEL-NEXT: andi. 
3, 3, 1 -; NO_ISEL-NEXT: li 4, 16 -; NO_ISEL-NEXT: li 3, 8 -; NO_ISEL-NEXT: bc 12, 1, .LBB39_1 -; NO_ISEL-NEXT: blr -; NO_ISEL-NEXT: .LBB39_1: -; NO_ISEL-NEXT: addi 3, 4, 0 -; NO_ISEL-NEXT: blr +; ALL-LABEL: lshr_constant_sel_constants: +; ALL: # %bb.0: +; ALL-NEXT: clrlwi 3, 3, 31 +; ALL-NEXT: li 4, 64 +; ALL-NEXT: xori 3, 3, 3 +; ALL-NEXT: srw 3, 4, 3 +; ALL-NEXT: blr %sel = select i1 %cond, i8 2, i8 3 %bo = lshr i8 64, %sel ret i8 %bo @@ -685,7 +663,7 @@ define i8 @sel_constants_ashr_constant(i1 %cond) { ; ALL-LABEL: sel_constants_ashr_constant: ; ALL: # %bb.0: -; ALL-NEXT: clrldi 3, 3, 63 +; ALL-NEXT: clrlwi 3, 3, 31 ; ALL-NEXT: neg 3, 3 ; ALL-NEXT: blr %sel = select i1 %cond, i8 -4, i8 23 @@ -694,24 +672,13 @@ } define i8 @ashr_constant_sel_constants(i1 %cond) { -; ISEL-LABEL: ashr_constant_sel_constants: -; ISEL: # %bb.0: -; ISEL-NEXT: andi. 3, 3, 1 -; ISEL-NEXT: li 4, -32 -; ISEL-NEXT: li 3, -16 -; ISEL-NEXT: iselgt 3, 4, 3 -; ISEL-NEXT: blr -; -; NO_ISEL-LABEL: ashr_constant_sel_constants: -; NO_ISEL: # %bb.0: -; NO_ISEL-NEXT: andi. 3, 3, 1 -; NO_ISEL-NEXT: li 4, -32 -; NO_ISEL-NEXT: li 3, -16 -; NO_ISEL-NEXT: bc 12, 1, .LBB41_1 -; NO_ISEL-NEXT: blr -; NO_ISEL-NEXT: .LBB41_1: -; NO_ISEL-NEXT: addi 3, 4, 0 -; NO_ISEL-NEXT: blr +; ALL-LABEL: ashr_constant_sel_constants: +; ALL: # %bb.0: +; ALL-NEXT: clrlwi 3, 3, 31 +; ALL-NEXT: li 4, -128 +; ALL-NEXT: xori 3, 3, 3 +; ALL-NEXT: sraw 3, 4, 3 +; ALL-NEXT: blr %sel = select i1 %cond, i8 2, i8 3 %bo = ashr i8 128, %sel ret i8 %bo diff --git a/llvm/test/CodeGen/PowerPC/store-forward-be64.ll b/llvm/test/CodeGen/PowerPC/store-forward-be64.ll --- a/llvm/test/CodeGen/PowerPC/store-forward-be64.ll +++ b/llvm/test/CodeGen/PowerPC/store-forward-be64.ll @@ -119,8 +119,9 @@ ; CHECK-LABEL: tc41: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mr 4, 3 -; CHECK-NEXT: sradi 3, 3, 56 +; CHECK-NEXT: rldicl 3, 3, 8, 56 ; CHECK-NEXT: std 4, 48(1) +; CHECK-NEXT: extsb 3, 3 ; CHECK-NEXT: blr entry: %0 = load i32, ptr %s, align 8 @@ -133,8 +134,9 @@ ; CHECK-LABEL: tc42: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mr 4, 3 -; CHECK-NEXT: sradi 3, 3, 48 +; CHECK-NEXT: rldicl 3, 3, 16, 48 ; CHECK-NEXT: std 4, 48(1) +; CHECK-NEXT: extsh 3, 3 ; CHECK-NEXT: blr entry: %0 = load i32, ptr %s, align 8 diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll --- a/llvm/test/CodeGen/RISCV/mul.ll +++ b/llvm/test/CodeGen/RISCV/mul.ll @@ -1299,9 +1299,8 @@ ; RV32IM-NEXT: li a5, -63 ; RV32IM-NEXT: mulhu a6, a3, a5 ; RV32IM-NEXT: slli a7, a4, 6 -; RV32IM-NEXT: sub a7, a7, a4 -; RV32IM-NEXT: sub a6, a6, a7 -; RV32IM-NEXT: neg a7, a7 +; RV32IM-NEXT: sub a7, a4, a7 +; RV32IM-NEXT: add a6, a7, a6 ; RV32IM-NEXT: sltu a7, a6, a7 ; RV32IM-NEXT: mulhu t0, a4, a5 ; RV32IM-NEXT: add a7, t0, a7 @@ -1314,9 +1313,9 @@ ; RV32IM-NEXT: add t1, a7, t1 ; RV32IM-NEXT: sub t4, t1, a4 ; RV32IM-NEXT: slli t5, a1, 6 -; RV32IM-NEXT: sub t5, t5, a1 -; RV32IM-NEXT: add t5, t5, a3 -; RV32IM-NEXT: sub t6, t4, t5 +; RV32IM-NEXT: sub t5, a1, t5 +; RV32IM-NEXT: sub t5, t5, a3 +; RV32IM-NEXT: add t6, t4, t5 ; RV32IM-NEXT: sltu s0, t6, t4 ; RV32IM-NEXT: neg s1, a4 ; RV32IM-NEXT: sltu t4, t4, s1 @@ -1324,6 +1323,7 @@ ; RV32IM-NEXT: mulhu t1, a4, t2 ; RV32IM-NEXT: add a7, t1, a7 ; RV32IM-NEXT: add a7, a7, t4 +; RV32IM-NEXT: sltu t0, t5, t0 ; RV32IM-NEXT: slli t1, a2, 6 ; RV32IM-NEXT: sub a2, a2, t1 ; RV32IM-NEXT: mulhu a5, a1, a5 @@ -1332,9 +1332,7 @@ ; RV32IM-NEXT: sub a2, t3, a3 ; RV32IM-NEXT: sub a2, a2, a4 ; RV32IM-NEXT: add a1, a2, a1 -; RV32IM-NEXT: neg a2, t5 -; RV32IM-NEXT: sltu a2, a2, t0 -; 
RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: add a1, a1, t0 ; RV32IM-NEXT: add a1, a7, a1 ; RV32IM-NEXT: add a1, a1, s0 ; RV32IM-NEXT: slli a2, a3, 6 diff --git a/llvm/test/CodeGen/RISCV/pr58511.ll b/llvm/test/CodeGen/RISCV/pr58511.ll --- a/llvm/test/CodeGen/RISCV/pr58511.ll +++ b/llvm/test/CodeGen/RISCV/pr58511.ll @@ -4,14 +4,14 @@ define i32 @f(i1 %0, i32 %1, ptr %2) { ; CHECK-LABEL: f: ; CHECK: # %bb.0: # %BB -; CHECK-NEXT: slliw a3, a1, 11 -; CHECK-NEXT: slliw a1, a1, 12 -; CHECK-NEXT: subw a1, a1, a3 ; CHECK-NEXT: andi a0, a0, 1 ; CHECK-NEXT: neg a0, a0 ; CHECK-NEXT: lui a3, 1 ; CHECK-NEXT: addiw a3, a3, -2048 ; CHECK-NEXT: or a0, a0, a3 +; CHECK-NEXT: slliw a3, a1, 11 +; CHECK-NEXT: slliw a1, a1, 12 +; CHECK-NEXT: subw a1, a1, a3 ; CHECK-NEXT: sw a1, 0(a2) ; CHECK-NEXT: ret BB: @@ -25,14 +25,14 @@ define i32 @g(i1 %0, i32 %1, ptr %2) { ; CHECK-LABEL: g: ; CHECK: # %bb.0: # %BB -; CHECK-NEXT: slliw a3, a1, 11 -; CHECK-NEXT: slliw a1, a1, 12 -; CHECK-NEXT: subw a1, a1, a3 ; CHECK-NEXT: andi a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: lui a3, 1 ; CHECK-NEXT: addiw a3, a3, -2048 ; CHECK-NEXT: or a0, a0, a3 +; CHECK-NEXT: slliw a3, a1, 11 +; CHECK-NEXT: slliw a1, a1, 12 +; CHECK-NEXT: subw a1, a1, a3 ; CHECK-NEXT: sw a1, 0(a2) ; CHECK-NEXT: ret BB: @@ -46,11 +46,11 @@ define i32 @h(i1 %0, i32 %1, ptr %2) { ; CHECK-LABEL: h: ; CHECK: # %bb.0: # %BB +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: slli a0, a0, 11 ; CHECK-NEXT: slliw a3, a1, 11 ; CHECK-NEXT: slliw a1, a1, 12 ; CHECK-NEXT: subw a1, a1, a3 -; CHECK-NEXT: andi a0, a0, 1 -; CHECK-NEXT: slli a0, a0, 11 ; CHECK-NEXT: sw a1, 0(a2) ; CHECK-NEXT: ret BB: diff --git a/llvm/test/CodeGen/SystemZ/pr36164.ll b/llvm/test/CodeGen/SystemZ/pr36164.ll --- a/llvm/test/CodeGen/SystemZ/pr36164.ll +++ b/llvm/test/CodeGen/SystemZ/pr36164.ll @@ -17,18 +17,18 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: lhi %r0, 1 ; CHECK-NEXT: larl %r1, g_938 -; CHECK-NEXT: lhi %r2, 0 +; CHECK-NEXT: lhi %r2, 3 ; CHECK-NEXT: lhi %r3, 4 ; CHECK-NEXT: larl %r4, g_11 ; CHECK-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: strl %r0, g_73 -; CHECK-NEXT: strl %r2, g_69 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 +; CHECK-NEXT: strl %r2, g_69 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 ; CHECK-NEXT: lrl %r5, g_832 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -8,15 +8,17 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrd r12, r3, [r0] -; CHECK-NEXT: ldrd lr, r2, [r0, #8] +; CHECK-NEXT: ldrd lr, r12, [r0] +; CHECK-NEXT: ldrd r3, r2, [r0, #8] ; CHECK-NEXT: ldrd r4, r0, [r0, #16] -; CHECK-NEXT: vmov.32 q1[1], r3 -; CHECK-NEXT: vmov q1[2], q1[0], r12, lr -; CHECK-NEXT: strd r2, r0, [r1, #16] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: vmov q1[2], q1[0], lr, r3 +; CHECK-NEXT: str r2, [r1, #16] +; CHECK-NEXT: vmov.32 q0[0], r4 +; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 +; CHECK-NEXT: vmov.32 q0[1], r0 ; CHECK-NEXT: vmov.f32 s8, s4 ; CHECK-NEXT: vmov.f32 s9, s6 +; CHECK-NEXT: str r0, [r1, #20] ; CHECK-NEXT: vmov.f32 s10, s0 ; CHECK-NEXT: vmov.f32 s11, s5 ; CHECK-NEXT: vstrw.32 q2, [r1] diff --git a/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll b/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll --- 
a/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll +++ b/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll @@ -17,12 +17,12 @@ define dso_local i32 @main() nounwind uwtable { ; CHECK-LABEL: main: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq i(%rip), %rsi -; CHECK-NEXT: movq j(%rip), %rax -; CHECK-NEXT: movq %rsi, %rdx -; CHECK-NEXT: shrq $8, %rdx +; CHECK-NEXT: movl i(%rip), %esi +; CHECK-NEXT: movl j(%rip), %eax +; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: shrl $8, %edx ; CHECK-NEXT: movsbl %al, %ecx -; CHECK-NEXT: shrq $8, %rax +; CHECK-NEXT: shrl $8, %eax ; CHECK-NEXT: cbtw ; CHECK-NEXT: idivb %dl ; CHECK-NEXT: movl %eax, %edx diff --git a/llvm/test/CodeGen/X86/2012-08-07-CmpISelBug.ll b/llvm/test/CodeGen/X86/2012-08-07-CmpISelBug.ll --- a/llvm/test/CodeGen/X86/2012-08-07-CmpISelBug.ll +++ b/llvm/test/CodeGen/X86/2012-08-07-CmpISelBug.ll @@ -8,13 +8,11 @@ define void @foo(i8 %arg4, i32 %arg5, ptr %arg14) nounwind { ; CHECK-LABEL: foo: ; CHECK: ## %bb.0: ## %bb -; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi ; CHECK-NEXT: andl $32, %edi -; CHECK-NEXT: leal 13(%rdi), %eax -; CHECK-NEXT: xorb $-14, %al -; CHECK-NEXT: addb $82, %al -; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: movb $81, %al +; CHECK-NEXT: subb %dil, %al ; CHECK-NEXT: testl %esi, %edi +; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: movl $1, %ecx ; CHECK-NEXT: cmovnel %eax, %ecx ; CHECK-NEXT: xorb $81, %cl diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll --- a/llvm/test/CodeGen/X86/addcarry.ll +++ b/llvm/test/CodeGen/X86/addcarry.ll @@ -743,38 +743,32 @@ define i32 @add_U320_without_i128_add(ptr nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) nounwind { ; CHECK-LABEL: add_U320_without_i128_add: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movq 16(%rdi), %rax -; CHECK-NEXT: leaq (%rax,%rcx), %r10 +; CHECK-NEXT: movq 24(%rdi), %r10 +; CHECK-NEXT: movq 32(%rdi), %r11 ; CHECK-NEXT: addq %rsi, (%rdi) ; CHECK-NEXT: adcq %rdx, 8(%rdi) ; CHECK-NEXT: movq %rax, %rdx ; CHECK-NEXT: adcq %rcx, %rdx -; CHECK-NEXT: movq 24(%rdi), %rsi -; CHECK-NEXT: leaq (%r8,%rsi), %r11 -; CHECK-NEXT: xorl %ebx, %ebx -; CHECK-NEXT: cmpq %r10, %rdx -; CHECK-NEXT: setb %bl ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: adcq %r11, %rbx -; CHECK-NEXT: movq 32(%rdi), %rcx -; CHECK-NEXT: leaq (%r9,%rcx), %r10 -; CHECK-NEXT: xorl %r14d, %r14d -; CHECK-NEXT: cmpq %r11, %rbx -; CHECK-NEXT: setb %r14b -; CHECK-NEXT: addq %rsi, %r8 -; CHECK-NEXT: adcq %r10, %r14 +; CHECK-NEXT: movq %r10, %rcx +; CHECK-NEXT: adcq %r8, %rcx +; CHECK-NEXT: cmpq %rax, %rdx +; CHECK-NEXT: adcq $0, %rcx +; CHECK-NEXT: leaq (%r11,%r9), %rsi +; CHECK-NEXT: addq %r8, %r10 +; CHECK-NEXT: movq %r11, %r8 +; CHECK-NEXT: adcq %r9, %r8 +; CHECK-NEXT: cmpq %r10, %rcx +; CHECK-NEXT: adcq $0, %r8 ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpq %r10, %r14 +; CHECK-NEXT: cmpq %rsi, %r8 ; CHECK-NEXT: setb %al -; CHECK-NEXT: addq %rcx, %r9 +; CHECK-NEXT: addq %r9, %r11 ; CHECK-NEXT: movq %rdx, 16(%rdi) -; CHECK-NEXT: movq %rbx, 24(%rdi) -; CHECK-NEXT: movq %r14, 32(%rdi) +; CHECK-NEXT: movq %rcx, 24(%rdi) +; CHECK-NEXT: movq %r8, 32(%rdi) ; CHECK-NEXT: adcl $0, %eax -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r14 ; CHECK-NEXT: retq %7 = load i64, ptr %0, align 8 %8 = getelementptr inbounds %struct.U320, ptr %0, i64 0, i32 0, i64 1 diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ 
b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -74,8 +74,7 @@ ; X86-LABEL: mask8_zext: ; X86: ## %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: notb %al -; X86-NEXT: movzbl %al, %eax +; X86-NEXT: xorl $255, %eax ; X86-NEXT: retl %m0 = bitcast i8 %x to <8 x i1> %m1 = xor <8 x i1> %m0, diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -1967,16 +1967,16 @@ ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] ; X86-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1] -; X86-NEXT: kmovd %k2, %esi # encoding: [0xc5,0xfb,0x93,0xf2] -; X86-NEXT: adcl %edx, %esi # encoding: [0x11,0xd6] +; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] +; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6] +; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] ; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] -; X86-NEXT: adcl %esi, %edx # encoding: [0x11,0xf2] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] +; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x08] ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x0c] ; X86-NEXT: popl %esi # encoding: [0x5e] @@ -2140,16 +2140,16 @@ ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] ; X86-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1] -; X86-NEXT: kmovd %k2, %esi # encoding: [0xc5,0xfb,0x93,0xf2] -; X86-NEXT: adcl %edx, %esi # encoding: [0x11,0xd6] +; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] +; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6] +; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] ; X86-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x06] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] -; X86-NEXT: adcl %esi, %edx # encoding: [0x11,0xf2] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] +; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x08] ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x0c] ; X86-NEXT: popl %esi # encoding: [0x5e] diff --git a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll 
b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll --- a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -2698,9 +2698,8 @@ ; NoVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2724,9 +2723,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2753,9 +2751,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2784,9 +2781,8 @@ ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2814,9 +2810,8 @@ ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2843,9 +2838,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7502,9 +7496,8 @@ ; NoVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7528,9 +7521,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7557,9 +7549,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7588,9 +7579,8 @@ ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: 
retq entry: @@ -7618,9 +7608,8 @@ ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7647,9 +7636,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12366,9 +12354,8 @@ ; NoVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpnltq %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12392,9 +12379,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpnltq %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12421,9 +12407,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpnltq %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12452,9 +12437,8 @@ ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpnltq %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12482,9 +12466,8 @@ ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpnltq (%rdi){1to8}, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12511,9 +12494,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpnltq (%rsi){1to8}, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17250,9 +17232,8 @@ ; NoVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17276,9 +17257,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17305,9 +17285,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; 
NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17336,9 +17315,8 @@ ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17366,9 +17344,8 @@ ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17395,9 +17372,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21081,9 +21057,8 @@ ; NoVLX-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21107,9 +21082,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vmovapd (%rdi), %xmm1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21133,9 +21107,8 @@ ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21163,9 +21136,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21193,9 +21165,8 @@ ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %xmm1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21223,9 +21194,8 @@ ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $14, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: andl $3, %eax ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll --- a/llvm/test/CodeGen/X86/bitreverse.ll +++ b/llvm/test/CodeGen/X86/bitreverse.ll @@ -15,6 +15,7 @@ ; X86-NEXT: movzwl 
{{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: rolw $8, %ax +; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: movl %eax, %edx ; X86-NEXT: andl $3855, %edx # imm = 0xF0F ; X86-NEXT: shll $4, %edx @@ -32,6 +33,7 @@ ; X86-NEXT: andl $21845, %eax # imm = 0x5555 ; X86-NEXT: leal (%eax,%edx,2), %eax ; X86-NEXT: rolw $8, %cx +; X86-NEXT: movzwl %cx, %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: andl $3855, %edx # imm = 0xF0F ; X86-NEXT: shll $4, %edx @@ -366,6 +368,7 @@ ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: rolw $8, %ax +; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $3855, %ecx # imm = 0xF0F ; X86-NEXT: shll $4, %ecx @@ -387,19 +390,19 @@ ; ; X64-LABEL: test_bitreverse_i16: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: rolw $8, %di -; X64-NEXT: movl %edi, %eax +; X64-NEXT: movzwl %di, %eax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: andl $3855, %ecx # imm = 0xF0F +; X64-NEXT: shll $4, %ecx +; X64-NEXT: shrl $4, %eax ; X64-NEXT: andl $3855, %eax # imm = 0xF0F -; X64-NEXT: shll $4, %eax -; X64-NEXT: shrl $4, %edi -; X64-NEXT: andl $3855, %edi # imm = 0xF0F -; X64-NEXT: orl %eax, %edi -; X64-NEXT: movl %edi, %eax +; X64-NEXT: orl %ecx, %eax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: andl $13107, %ecx # imm = 0x3333 +; X64-NEXT: shrl $2, %eax ; X64-NEXT: andl $13107, %eax # imm = 0x3333 -; X64-NEXT: shrl $2, %edi -; X64-NEXT: andl $13107, %edi # imm = 0x3333 -; X64-NEXT: leal (%rdi,%rax,4), %eax +; X64-NEXT: leal (%rax,%rcx,4), %eax ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $21845, %ecx # imm = 0x5555 ; X64-NEXT: shrl %eax @@ -418,19 +421,19 @@ ; ; GFNI-LABEL: test_bitreverse_i16: ; GFNI: # %bb.0: -; GFNI-NEXT: # kill: def $edi killed $edi def $rdi ; GFNI-NEXT: rolw $8, %di -; GFNI-NEXT: movl %edi, %eax +; GFNI-NEXT: movzwl %di, %eax +; GFNI-NEXT: movl %eax, %ecx +; GFNI-NEXT: andl $3855, %ecx # imm = 0xF0F +; GFNI-NEXT: shll $4, %ecx +; GFNI-NEXT: shrl $4, %eax ; GFNI-NEXT: andl $3855, %eax # imm = 0xF0F -; GFNI-NEXT: shll $4, %eax -; GFNI-NEXT: shrl $4, %edi -; GFNI-NEXT: andl $3855, %edi # imm = 0xF0F -; GFNI-NEXT: orl %eax, %edi -; GFNI-NEXT: movl %edi, %eax +; GFNI-NEXT: orl %ecx, %eax +; GFNI-NEXT: movl %eax, %ecx +; GFNI-NEXT: andl $13107, %ecx # imm = 0x3333 +; GFNI-NEXT: shrl $2, %eax ; GFNI-NEXT: andl $13107, %eax # imm = 0x3333 -; GFNI-NEXT: shrl $2, %edi -; GFNI-NEXT: andl $13107, %edi # imm = 0x3333 -; GFNI-NEXT: leal (%rdi,%rax,4), %eax +; GFNI-NEXT: leal (%rax,%rcx,4), %eax ; GFNI-NEXT: movl %eax, %ecx ; GFNI-NEXT: andl $21845, %ecx # imm = 0x5555 ; GFNI-NEXT: shrl %eax diff --git a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll --- a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll @@ -574,7 +574,7 @@ define i16 @test_i16_2032_mask_lshr_4(i16 %a0) { ; X86-LABEL: test_i16_2032_mask_lshr_4: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $4, %eax ; X86-NEXT: andl $127, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -594,7 +594,7 @@ define i16 @test_i16_2032_mask_lshr_5(i16 %a0) { ; X86-LABEL: test_i16_2032_mask_lshr_5: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $5, %eax ; X86-NEXT: andl $63, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -614,7 +614,7 @@ define i16 
@test_i16_2032_mask_lshr_6(i16 %a0) { ; X86-LABEL: test_i16_2032_mask_lshr_6: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $6, %eax ; X86-NEXT: andl $31, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -755,7 +755,7 @@ define i16 @test_i16_2032_mask_ashr_4(i16 %a0) { ; X86-LABEL: test_i16_2032_mask_ashr_4: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $4, %eax ; X86-NEXT: andl $127, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -775,7 +775,7 @@ define i16 @test_i16_2032_mask_ashr_5(i16 %a0) { ; X86-LABEL: test_i16_2032_mask_ashr_5: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $5, %eax ; X86-NEXT: andl $63, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -795,7 +795,7 @@ define i16 @test_i16_2032_mask_ashr_6(i16 %a0) { ; X86-LABEL: test_i16_2032_mask_ashr_6: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $6, %eax ; X86-NEXT: andl $31, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll --- a/llvm/test/CodeGen/X86/dagcombine-cse.ll +++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll @@ -50,6 +50,7 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %edi, %eax @@ -72,37 +73,31 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi -; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %esi ; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %al -; X86-NEXT: movl %edx, %edi -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: adcl $0, %eax +; X86-NEXT: setb %dl ; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: adcl %ecx, %esi -; X86-NEXT: setb %cl -; X86-NEXT: adcl %edx, %edi -; X86-NEXT: addb $255, %cl -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: movzbl %dl, %ecx +; X86-NEXT: adcl %eax, %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: setb %ah -; X86-NEXT: addb $255, %al -; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: movzbl %ah, %ebx -; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %eax -; X86-NEXT: addl %eax, %edi -; X86-NEXT: adcl %edx, %ebx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %edi, %edx -; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: addl %eax, %esi +; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %esi, %edx +; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/dagcombine-select.ll b/llvm/test/CodeGen/X86/dagcombine-select.ll --- a/llvm/test/CodeGen/X86/dagcombine-select.ll +++ b/llvm/test/CodeGen/X86/dagcombine-select.ll @@ -194,10 +194,12 @@ define i32 @shl_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: shl_constant_sel_constants: ; CHECK: # %bb.0: -; CHECK-NEXT: notb 
%dil -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: leal 4(,%rax,4), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shll %cl, %eax ; CHECK-NEXT: retq %sel = select i1 %cond, i32 2, i32 3 %bo = shl i32 1, %sel @@ -207,9 +209,12 @@ define i32 @lshr_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: lshr_constant_sel_constants: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: leal 8(,%rdi,8), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $64, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %eax ; CHECK-NEXT: retq %sel = select i1 %cond, i32 2, i32 3 %bo = lshr i32 64, %sel @@ -219,10 +224,12 @@ define i32 @ashr_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: ashr_constant_sel_constants: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: shll $4, %edi -; CHECK-NEXT: leal 16(%rdi), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $128, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %eax ; CHECK-NEXT: retq %sel = select i1 %cond, i32 2, i32 3 %bo = ashr i32 128, %sel diff --git a/llvm/test/CodeGen/X86/field-extract-use-trunc.ll b/llvm/test/CodeGen/X86/field-extract-use-trunc.ll --- a/llvm/test/CodeGen/X86/field-extract-use-trunc.ll +++ b/llvm/test/CodeGen/X86/field-extract-use-trunc.ll @@ -73,7 +73,7 @@ define i16 @test5(i16 %f12) nounwind { ; i686-LABEL: test5: ; i686: # %bb.0: -; i686-NEXT: movl {{[0-9]+}}(%esp), %eax +; i686-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; i686-NEXT: shrl $6, %eax ; i686-NEXT: movsbl %al, %eax ; i686-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/freeze-unary.ll b/llvm/test/CodeGen/X86/freeze-unary.ll --- a/llvm/test/CodeGen/X86/freeze-unary.ll +++ b/llvm/test/CodeGen/X86/freeze-unary.ll @@ -11,6 +11,7 @@ ; X64-LABEL: freeze_sext: ; X64: # %bb.0: ; X64-NEXT: movsbl %dil, %eax +; X64-NEXT: cwtl ; X64-NEXT: retq %x = sext i8 %a0 to i16 %y = freeze i16 %x diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -32,17 +32,17 @@ ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm0 ; SSSE3-FAST-NEXT: retq ; -; AVX1-SLOW-LABEL: pair_sum_v4f32_v4f32: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 -; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1] -; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1] -; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 -; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] -; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX1-SLOW-NEXT: retq +; AVX-SLOW-LABEL: pair_sum_v4f32_v4f32: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1] +; AVX-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 +; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX-SLOW-NEXT: 
vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] +; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: pair_sum_v4f32_v4f32: ; AVX-FAST: # %bb.0: @@ -50,18 +50,6 @@ ; AVX-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm1 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: retq -; -; AVX2-SLOW-LABEL: pair_sum_v4f32_v4f32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] -; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: retq %5 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> %6 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> %7 = fadd <2 x float> %5, %6 @@ -126,34 +114,28 @@ ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-SLOW-NEXT: retq ; -; AVX1-FAST-LABEL: pair_sum_v4i32_v4i32: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm1 -; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: retq +; AVX-FAST-LABEL: pair_sum_v4i32_v4i32: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm1 +; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq ; ; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm2 +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: pair_sum_v4i32_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: retq %5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> %6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> %7 = add <2 x i32> %5, %6 @@ -191,15 +173,14 @@ ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: movaps %xmm5, %xmm1 -; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm1 -; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm2 +; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm5 +; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm2 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6 -; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm4 -; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm4 -; SSSE3-SLOW-NEXT: 
movaps %xmm4, %xmm1 +; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,1] +; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32: @@ -266,13 +247,13 @@ ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm8 ; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3] -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1] +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0] +; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1] +; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] @@ -287,13 +268,13 @@ ; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1 -; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm8 ; AVX2-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3] -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] -; AVX2-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1] +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0] +; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm5, %xmm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1] +; AVX2-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] @@ -440,9 +421,11 @@ ; AVX2-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1 ; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4 ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3] -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -461,9 +444,11 @@ ; AVX2-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm1 ; AVX2-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4 ; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3] -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] 
+; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -751,16 +736,16 @@ ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] -; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm5 -; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] ; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] +; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: sequential_sum_v4i32_v4i32: @@ -775,15 +760,15 @@ ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm4 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] +; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: retq %5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> %6 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> @@ -1046,6 +1031,9 @@ ret <4 x float> %12 } +; FIXME: This could be a series of phaddd followed by a shufps +; See https://reviews.llvm.org/D127115?id=436232#inline-1223238 + define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) { ; SSSE3-SLOW-LABEL: reduction_sum_v4i32_v4i32: ; SSSE3-SLOW: # %bb.0: diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll --- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll +++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll @@ -222,11 +222,11 @@ ; X86-LABEL: opt_setcc_expanded_shl_correct_shifts: ; X86:
# %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $17, %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: shldl $17, %eax, %ecx +; X86-NEXT: orl %ecx, %eax ; X86-NEXT: sete %al ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll --- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll +++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll @@ -13,15 +13,17 @@ define <16 x i8> @elt0_v16i8(i8 %x) { ; X86-SSE2-LABEL: elt0_v16i8: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE2-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X86-SSE2-NEXT: andnps %xmm1, %xmm0 ; X86-SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: elt0_v16i8: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movd %edi, %xmm0 -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE2-NEXT: movd %edi, %xmm1 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X64-SSE2-NEXT: pandn %xmm1, %xmm0 ; X64-SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll --- a/llvm/test/CodeGen/X86/insertelement-var-index.ll +++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll @@ -2288,31 +2288,58 @@ ; SSE-NEXT: divl %ecx ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: PR44139: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX1OR2-NEXT: movl (%rdi), %eax -; AVX1OR2-NEXT: vmovaps %ymm0, 64(%rdi) -; AVX1OR2-NEXT: vmovaps %ymm0, 96(%rdi) -; AVX1OR2-NEXT: vmovaps %ymm0, (%rdi) -; AVX1OR2-NEXT: vmovaps %ymm0, 32(%rdi) -; AVX1OR2-NEXT: leal 2147483647(%rax), %ecx -; AVX1OR2-NEXT: testl %eax, %eax -; AVX1OR2-NEXT: cmovnsl %eax, %ecx -; AVX1OR2-NEXT: andl $-2147483648, %ecx # imm = 0x80000000 -; AVX1OR2-NEXT: addl %eax, %ecx -; AVX1OR2-NEXT: # kill: def $eax killed $eax killed $rax -; AVX1OR2-NEXT: xorl %edx, %edx -; AVX1OR2-NEXT: divl %ecx -; AVX1OR2-NEXT: vzeroupper -; AVX1OR2-NEXT: retq +; AVX1-LABEL: PR44139: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX1-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vmovaps %ymm0, 64(%rdi) +; AVX1-NEXT: vmovaps %ymm0, 96(%rdi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rdi) +; AVX1-NEXT: movl (%rdi), %eax +; AVX1-NEXT: vmovaps %ymm1, (%rdi) +; AVX1-NEXT: leal 2147483647(%rax), %ecx +; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: cmovnsl %eax, %ecx +; AVX1-NEXT: andl $-2147483648, %ecx # imm = 0x80000000 +; AVX1-NEXT: addl %eax, %ecx +; AVX1-NEXT: # kill: def $eax killed $eax killed $rax +; AVX1-NEXT: xorl %edx, %edx +; AVX1-NEXT: divl %ecx +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR44139: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa %ymm0, 64(%rdi) +; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi) +; AVX2-NEXT: movl (%rdi), %eax +; 
AVX2-NEXT: vmovdqa %ymm1, (%rdi) +; AVX2-NEXT: leal 2147483647(%rax), %ecx +; AVX2-NEXT: testl %eax, %eax +; AVX2-NEXT: cmovnsl %eax, %ecx +; AVX2-NEXT: andl $-2147483648, %ecx # imm = 0x80000000 +; AVX2-NEXT: addl %eax, %ecx +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: divl %ecx +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512-LABEL: PR44139: ; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastsd (%rdi), %zmm0 -; AVX512-NEXT: movl (%rdi), %eax -; AVX512-NEXT: vmovaps %zmm0, (%rdi) -; AVX512-NEXT: vmovaps %zmm0, 64(%rdi) +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpbroadcastq (%rdi), %zmm1 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpinsrq $1, (%rdi), %xmm1, %xmm2 +; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rdi) +; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi) +; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: leal 2147483647(%rax), %ecx ; AVX512-NEXT: testl %eax, %eax ; AVX512-NEXT: cmovnsl %eax, %ecx @@ -2327,12 +2354,14 @@ ; X86AVX2-LABEL: PR44139: ; X86AVX2: # %bb.0: ; X86AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86AVX2-NEXT: movl (%ecx), %eax ; X86AVX2-NEXT: vbroadcastsd (%ecx), %ymm0 +; X86AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] +; X86AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; X86AVX2-NEXT: vmovaps %ymm0, 64(%ecx) ; X86AVX2-NEXT: vmovaps %ymm0, 96(%ecx) -; X86AVX2-NEXT: vmovaps %ymm0, (%ecx) ; X86AVX2-NEXT: vmovaps %ymm0, 32(%ecx) +; X86AVX2-NEXT: movl (%ecx), %eax +; X86AVX2-NEXT: vmovaps %ymm1, (%ecx) ; X86AVX2-NEXT: leal 2147483647(%eax), %ecx ; X86AVX2-NEXT: testl %eax, %eax ; X86AVX2-NEXT: cmovnsl %eax, %ecx diff --git a/llvm/test/CodeGen/X86/is_fpclass-fp80.ll b/llvm/test/CodeGen/X86/is_fpclass-fp80.ll --- a/llvm/test/CodeGen/X86/is_fpclass-fp80.ll +++ b/llvm/test/CodeGen/X86/is_fpclass-fp80.ll @@ -318,9 +318,8 @@ ; ; CHECK-64-LABEL: is_neginf_f80: ; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-64-NEXT: notl %eax -; CHECK-64-NEXT: movzwl %ax, %eax +; CHECK-64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-64-NEXT: xorq $65535, %rax # imm = 0xFFFF ; CHECK-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 ; CHECK-64-NEXT: xorq {{[0-9]+}}(%rsp), %rcx ; CHECK-64-NEXT: orq %rax, %rcx diff --git a/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll b/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll --- a/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll +++ b/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll @@ -5,10 +5,9 @@ ; CHECK-LABEL: csrot_: ; CHECK: # %bb.0: ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: xorps %xmm0, %xmm1 -; CHECK-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],mem[1,2,3] -; CHECK-NEXT: movlps %xmm1, (%rax) +; CHECK-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] +; CHECK-NEXT: movlps %xmm0, (%rax) ; CHECK-NEXT: retq 1: %2 = load float, ptr %0, align 4 diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -6138,42 +6138,37 @@ ; ; AVX1-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts: ; AVX1: ## %bb.0: -; AVX1-NEXT: vmovaps (%rsi), %ymm1 -; AVX1-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX1-NEXT: vmovdqa (%rsi), %ymm0 +; AVX1-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX1-NEXT: vmovaps 
64(%rsi), %ymm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtd 48(%rdi), %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtd 32(%rdi), %xmm3, %xmm5 +; AVX1-NEXT: vpcmpgtd 80(%rdi), %xmm3, %xmm4 +; AVX1-NEXT: vpcmpgtd 64(%rdi), %xmm3, %xmm5 ; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpacksswb %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpgtd 16(%rdi), %xmm3, %xmm5 ; AVX1-NEXT: vpcmpgtd (%rdi), %xmm3, %xmm6 -; AVX1-NEXT: vpackssdw %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpacksswb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpcmpgtd 80(%rdi), %xmm3, %xmm5 -; AVX1-NEXT: vpcmpgtd 64(%rdi), %xmm3, %xmm6 -; AVX1-NEXT: vpackssdw %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpacksswb %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX1-NEXT: vpslld $31, %xmm6, %xmm6 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpcmpgtd 48(%rdi), %xmm3, %xmm7 +; AVX1-NEXT: vpcmpgtd 32(%rdi), %xmm3, %xmm8 +; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm3[2,3],xmm8[4,5],xmm3[6,7] +; AVX1-NEXT: vpslld $31, %xmm8, %xmm8 +; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm3[2,3],xmm7[4,5],xmm3[6,7] ; AVX1-NEXT: vpslld $31, %xmm7, %xmm7 -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-NEXT: vmaskmovps %ymm1, %ymm6, (%rdx) -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-NEXT: vmaskmovps %ymm1, %ymm7, 32(%rdx) +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7] ; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpslld $31, %xmm5, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 -; AVX1-NEXT: vmaskmovps %ymm2, %ymm1, 64(%rdx) -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7] +; AVX1-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vmaskmovps %ymm0, %ymm1, (%rdx) +; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vmaskmovps %ymm0, %ymm1, 32(%rdx) +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmaskmovps %ymm2, %ymm0, 64(%rdx) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -6183,29 +6178,29 @@ ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX2-NEXT: vmovdqa 64(%rsi), %ymm2 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpcmpgtd 32(%rdi), %ymm3, %ymm4 +; AVX2-NEXT: vpcmpgtd 64(%rdi), %ymm3, %ymm4 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-NEXT: vpackssdw %xmm5, %xmm4, %xmm4 +; AVX2-NEXT: vpacksswb %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpcmpgtd (%rdi), %ymm3, %ymm5 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vpacksswb %xmm4, %xmm5, %xmm4 -; AVX2-NEXT: vpcmpgtd 64(%rdi), 
%ymm3, %ymm3 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-NEXT: vpackssdw %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtd 32(%rdi), %ymm3, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-NEXT: vpackssdw %xmm6, %xmm3, %xmm3 +; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2],xmm6[3],xmm3[4],xmm6[5],xmm3[6],xmm6[7] ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; AVX2-NEXT: vpslld $31, %ymm3, %ymm3 -; AVX2-NEXT: vpmaskmovd %ymm2, %ymm3, 64(%rdx) -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpslld $31, %ymm2, %ymm2 -; AVX2-NEXT: vpmaskmovd %ymm1, %ymm2, 32(%rdx) -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX2-NEXT: vpmaskmovd %ymm1, %ymm3, 32(%rdx) +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm6[1],xmm5[2],xmm6[3],xmm5[4],xmm6[5],xmm5[6],xmm6[7] ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 ; AVX2-NEXT: vpmaskmovd %ymm0, %ymm1, (%rdx) +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpmaskmovd %ymm2, %ymm0, 64(%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -51,7 +51,7 @@ ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -60,7 +60,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -133,7 +133,7 @@ ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -145,7 +145,7 @@ ; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -429,7 +429,7 @@ ; SSE-NEXT: packsswb %xmm3, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pmovmskb %xmm2, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -441,7 +441,7 @@ ; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1218,7 
+1218,7 @@ ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: psllw $7, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -1228,7 +1228,7 @@ ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1326,7 +1326,7 @@ ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: psllw $7, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -1339,7 +1339,7 @@ ; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1596,7 +1596,7 @@ ; SSE-NEXT: packsswb %xmm3, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pmovmskb %xmm2, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -1612,7 +1612,7 @@ ; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2507,7 +2507,7 @@ ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: psllw $5, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -2517,7 +2517,7 @@ ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2615,7 +2615,7 @@ ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: psllw $5, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -2628,7 +2628,7 @@ ; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2885,7 +2885,7 @@ ; SSE-NEXT: packsswb %xmm3, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pmovmskb %xmm2, %eax -; SSE-NEXT: cmpw $-1, %ax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; @@ -2901,7 +2901,7 @@ ; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/mulvi32.ll b/llvm/test/CodeGen/X86/mulvi32.ll --- a/llvm/test/CodeGen/X86/mulvi32.ll +++ b/llvm/test/CodeGen/X86/mulvi32.ll @@ -134,31 +134,31 @@ define <4 x i64> @_mul4xi32toi64a(<4 x i32>, <4 x i32>) { ; SSE2-LABEL: _mul4xi32toi64a: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,3,3] ; SSE2-NEXT: pmuludq %xmm3, %xmm2 -; SSE2-NEXT: pshufd 
{{.*#+}} xmm3 = xmm1[2,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,3,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE42-LABEL: _mul4xi32toi64a: ; SSE42: # %bb.0: -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,3,3] ; SSE42-NEXT: pmuludq %xmm3, %xmm2 -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,3,3] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,3,3] -; SSE42-NEXT: pmuludq %xmm3, %xmm1 -; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: pmuludq %xmm1, %xmm0 +; SSE42-NEXT: movdqa %xmm2, %xmm1 ; SSE42-NEXT: retq ; ; AVX1-LABEL: _mul4xi32toi64a: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] -; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/nontemporal-3.ll b/llvm/test/CodeGen/X86/nontemporal-3.ll --- a/llvm/test/CodeGen/X86/nontemporal-3.ll +++ b/llvm/test/CodeGen/X86/nontemporal-3.ll @@ -494,14 +494,14 @@ ; CHECK-LABEL: test_zero_v8f64_align1: ; CHECK: # %bb.0: ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movntiq %rax, 24(%rdi) -; CHECK-NEXT: movntiq %rax, 16(%rdi) ; CHECK-NEXT: movntiq %rax, 8(%rdi) ; CHECK-NEXT: movntiq %rax, (%rdi) -; CHECK-NEXT: movntiq %rax, 56(%rdi) -; CHECK-NEXT: movntiq %rax, 48(%rdi) +; CHECK-NEXT: movntiq %rax, 24(%rdi) +; CHECK-NEXT: movntiq %rax, 16(%rdi) ; CHECK-NEXT: movntiq %rax, 40(%rdi) ; CHECK-NEXT: movntiq %rax, 32(%rdi) +; CHECK-NEXT: movntiq %rax, 56(%rdi) +; CHECK-NEXT: movntiq %rax, 48(%rdi) ; CHECK-NEXT: retq store <8 x double> zeroinitializer, ptr %dst, align 1, !nontemporal !1 ret void @@ -511,67 +511,67 @@ ; SSE2-LABEL: test_zero_v16f32_align1: ; SSE2: # %bb.0: ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 8(%rdi) ; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 40(%rdi) ; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) ; SSE2-NEXT: retq ; ; SSE4A-LABEL: test_zero_v16f32_align1: ; SSE4A: # %bb.0: ; SSE4A-NEXT: xorl %eax, %eax -; SSE4A-NEXT: movntiq %rax, 24(%rdi) ; SSE4A-NEXT: movntiq %rax, 8(%rdi) -; SSE4A-NEXT: movntiq %rax, 56(%rdi) +; SSE4A-NEXT: movntiq %rax, 24(%rdi) ; SSE4A-NEXT: movntiq %rax, 40(%rdi) +; SSE4A-NEXT: movntiq %rax, 56(%rdi) ; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) 
; SSE4A-NEXT: retq ; ; SSE41-LABEL: test_zero_v16f32_align1: ; SSE41: # %bb.0: ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 8(%rdi) ; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 40(%rdi) ; SSE41-NEXT: movntiq %rax, 32(%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) ; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v16f32_align1: ; AVX: # %bb.0: ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movntiq %rax, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: movntiq %rax, 56(%rdi) -; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16f32_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) ; AVX512-NEXT: retq store <16 x float> zeroinitializer, ptr %dst, align 1, !nontemporal !1 ret void @@ -581,66 +581,66 @@ ; SSE2-LABEL: test_zero_v8i64_align1: ; SSE2: # %bb.0: ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 8(%rdi) ; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 40(%rdi) ; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) ; SSE2-NEXT: retq ; ; SSE4A-LABEL: test_zero_v8i64_align1: ; SSE4A: # %bb.0: ; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) ; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) ; SSE4A-NEXT: retq ; ; SSE41-LABEL: test_zero_v8i64_align1: ; SSE41: # %bb.0: ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 8(%rdi) ; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 40(%rdi) ; SSE41-NEXT: movntiq %rax, 32(%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) ; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v8i64_align1: ; AVX: # %bb.0: ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movntiq 
%rax, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: movntiq %rax, 56(%rdi) -; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8i64_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) ; AVX512-NEXT: retq store <8 x i64> zeroinitializer, ptr %dst, align 1, !nontemporal !1 ret void @@ -650,66 +650,66 @@ ; SSE2-LABEL: test_zero_v16i32_align1: ; SSE2: # %bb.0: ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 8(%rdi) ; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 40(%rdi) ; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) ; SSE2-NEXT: retq ; ; SSE4A-LABEL: test_zero_v16i32_align1: ; SSE4A: # %bb.0: ; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) ; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) ; SSE4A-NEXT: retq ; ; SSE41-LABEL: test_zero_v16i32_align1: ; SSE41: # %bb.0: ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 8(%rdi) ; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 40(%rdi) ; SSE41-NEXT: movntiq %rax, 32(%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) ; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v16i32_align1: ; AVX: # %bb.0: ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movntiq %rax, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: movntiq %rax, 56(%rdi) -; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16i32_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: 
movntiq %rax, 56(%rdi) -; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) ; AVX512-NEXT: retq store <16 x i32> zeroinitializer, ptr %dst, align 1, !nontemporal !1 ret void @@ -719,66 +719,66 @@ ; SSE2-LABEL: test_zero_v32i16_align1: ; SSE2: # %bb.0: ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 8(%rdi) ; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 40(%rdi) ; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) ; SSE2-NEXT: retq ; ; SSE4A-LABEL: test_zero_v32i16_align1: ; SSE4A: # %bb.0: ; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) ; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) ; SSE4A-NEXT: retq ; ; SSE41-LABEL: test_zero_v32i16_align1: ; SSE41: # %bb.0: ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 8(%rdi) ; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 40(%rdi) ; SSE41-NEXT: movntiq %rax, 32(%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) ; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v32i16_align1: ; AVX: # %bb.0: ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movntiq %rax, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: movntiq %rax, 56(%rdi) -; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v32i16_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) ; AVX512-NEXT: retq store <32 x i16> zeroinitializer, ptr %dst, align 1, !nontemporal !1 ret void @@ -788,66 +788,66 @@ ; SSE2-LABEL: test_zero_v64i8_align1: ; SSE2: # %bb.0: ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 8(%rdi) ; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; 
SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) ; SSE2-NEXT: movntiq %rax, 40(%rdi) ; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) ; SSE2-NEXT: retq ; ; SSE4A-LABEL: test_zero_v64i8_align1: ; SSE4A: # %bb.0: ; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) ; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) ; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) ; SSE4A-NEXT: retq ; ; SSE41-LABEL: test_zero_v64i8_align1: ; SSE41: # %bb.0: ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 8(%rdi) ; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) ; SSE41-NEXT: movntiq %rax, 40(%rdi) ; SSE41-NEXT: movntiq %rax, 32(%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) ; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v64i8_align1: ; AVX: # %bb.0: ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movntiq %rax, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: movntiq %rax, 56(%rdi) -; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v64i8_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) ; AVX512-NEXT: retq store <64 x i8> zeroinitializer, ptr %dst, align 1, !nontemporal !1 ret void diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -319,41 +319,41 @@ ; SSE2-LABEL: and_mulhuw_v16i16: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767] -; SSE2-NEXT: pand %xmm8, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm8, %xmm1 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm8, %xmm7 -; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: packssdw %xmm7, %xmm6 -; SSE2-NEXT: pmulhw %xmm2, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm8 -; SSE2-NEXT: packssdw %xmm5, %xmm8 -; SSE2-NEXT: pmulhw %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: pmulhw %xmm4, 
%xmm0 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pand %xmm6, %xmm8 +; SSE2-NEXT: packssdw %xmm7, %xmm8 +; SSE2-NEXT: pmulhw %xmm2, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: and_mulhuw_v16i16: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767] -; SSE41-NEXT: pand %xmm8, %xmm3 -; SSE41-NEXT: pand %xmm8, %xmm2 -; SSE41-NEXT: packusdw %xmm3, %xmm2 ; SSE41-NEXT: pand %xmm8, %xmm1 ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: pand %xmm8, %xmm7 -; SSE41-NEXT: pand %xmm8, %xmm6 -; SSE41-NEXT: packusdw %xmm7, %xmm6 -; SSE41-NEXT: pmulhw %xmm2, %xmm6 +; SSE41-NEXT: pand %xmm8, %xmm3 +; SSE41-NEXT: pand %xmm8, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 ; SSE41-NEXT: pand %xmm8, %xmm5 -; SSE41-NEXT: pand %xmm4, %xmm8 -; SSE41-NEXT: packusdw %xmm5, %xmm8 -; SSE41-NEXT: pmulhw %xmm8, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm1 +; SSE41-NEXT: pand %xmm8, %xmm4 +; SSE41-NEXT: packusdw %xmm5, %xmm4 +; SSE41-NEXT: pmulhw %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm8, %xmm7 +; SSE41-NEXT: pand %xmm6, %xmm8 +; SSE41-NEXT: packusdw %xmm7, %xmm8 +; SSE41-NEXT: pmulhw %xmm2, %xmm8 +; SSE41-NEXT: movdqa %xmm8, %xmm1 ; SSE41-NEXT: retq ; ; AVX2-LABEL: and_mulhuw_v16i16: @@ -417,13 +417,6 @@ define <16 x i16> @ashr_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: ashr_mulhuw_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: packssdw %xmm5, %xmm4 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: pmulhw %xmm4, %xmm0 ; SSE2-NEXT: psrad $16, %xmm7 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: packssdw %xmm7, %xmm6 @@ -431,25 +424,32 @@ ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: packssdw %xmm3, %xmm2 ; SSE2-NEXT: pmulhw %xmm6, %xmm2 +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pmulhw %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: ashr_mulhuw_v16i16: ; SSE41: # %bb.0: -; SSE41-NEXT: psrld $16, %xmm3 -; SSE41-NEXT: psrld $16, %xmm2 -; SSE41-NEXT: packusdw %xmm3, %xmm2 ; SSE41-NEXT: psrld $16, %xmm1 ; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm7 -; SSE41-NEXT: psrld $16, %xmm6 -; SSE41-NEXT: packusdw %xmm7, %xmm6 -; SSE41-NEXT: pmulhw %xmm2, %xmm6 +; SSE41-NEXT: psrld $16, %xmm3 +; SSE41-NEXT: psrld $16, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 ; SSE41-NEXT: psrld $16, %xmm5 ; SSE41-NEXT: psrld $16, %xmm4 ; SSE41-NEXT: packusdw %xmm5, %xmm4 ; SSE41-NEXT: pmulhw %xmm4, %xmm0 +; SSE41-NEXT: psrld $16, %xmm7 +; SSE41-NEXT: psrld $16, %xmm6 +; SSE41-NEXT: packusdw %xmm7, %xmm6 +; SSE41-NEXT: pmulhw %xmm2, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm1 ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll --- a/llvm/test/CodeGen/X86/popcnt.ll +++ b/llvm/test/CodeGen/X86/popcnt.ll @@ -62,7 +62,7 @@ define i16 @cnt16(i16 %x) nounwind readnone { ; X86-LABEL: cnt16: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: shrl %ecx ; X86-NEXT: andl $21845, %ecx # imm = 0x5555 @@ -1513,7 +1513,7 @@ define i32 @popcount_i16_zext(i16 zeroext %x) { ; X86-LABEL: popcount_i16_zext: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl 
{{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: shrl %ecx ; X86-NEXT: andl $21845, %ecx # imm = 0x5555 diff --git a/llvm/test/CodeGen/X86/pr53419.ll b/llvm/test/CodeGen/X86/pr53419.ll --- a/llvm/test/CodeGen/X86/pr53419.ll +++ b/llvm/test/CodeGen/X86/pr53419.ll @@ -60,21 +60,33 @@ } define i1 @intrinsic_v8i8(ptr align 1 %arg, ptr align 1 %arg1) { -; SSE-LABEL: intrinsic_v8i8: -; SSE: # %bb.0: # %bb -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE-NEXT: pmovmskb %xmm1, %eax -; SSE-NEXT: cmpb $-1, %al -; SSE-NEXT: sete %al -; SSE-NEXT: retq +; SSE2-LABEL: intrinsic_v8i8: +; SSE2: # %bb.0: # %bb +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: cmpb $-1, %al +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE42-LABEL: intrinsic_v8i8: +; SSE42: # %bb.0: # %bb +; SSE42-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; SSE42-NEXT: pmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; SSE42-NEXT: pcmpeqw %xmm0, %xmm1 +; SSE42-NEXT: packsswb %xmm1, %xmm1 +; SSE42-NEXT: pmovmskb %xmm1, %eax +; SSE42-NEXT: cmpb $-1, %al +; SSE42-NEXT: sete %al +; SSE42-NEXT: retq ; ; AVX-LABEL: intrinsic_v8i8: ; AVX: # %bb.0: # %bb -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpmovmskb %xmm0, %eax ; AVX-NEXT: cmpb $-1, %al ; AVX-NEXT: sete %al @@ -84,9 +96,10 @@ ; X86: # %bb.0: # %bb ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; X86-NEXT: vpmovmskb %xmm0, %eax ; X86-NEXT: cmpb $-1, %al ; X86-NEXT: sete %al @@ -226,3 +239,5 @@ %all_eq = icmp eq i32 %lhs, %rhs ret i1 %all_eq } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; SSE: {{.*}} diff --git a/llvm/test/CodeGen/X86/promote-vec3.ll b/llvm/test/CodeGen/X86/promote-vec3.ll --- a/llvm/test/CodeGen/X86/promote-vec3.ll +++ b/llvm/test/CodeGen/X86/promote-vec3.ll @@ -42,13 +42,13 @@ ; ; AVX-64-LABEL: zext_i8: ; AVX-64: # %bb.0: -; AVX-64-NEXT: movzbl %sil, %esi +; AVX-64-NEXT: movzbl %dl, %ecx +; AVX-64-NEXT: movzbl %sil, %edx ; AVX-64-NEXT: vmovd %edi, %xmm0 ; AVX-64-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-64-NEXT: movzbl %dl, %ecx ; AVX-64-NEXT: vmovd %xmm0, %eax ; AVX-64-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-64-NEXT: movl %esi, %edx +; AVX-64-NEXT: # kill: def $dx killed $dx killed $edx ; AVX-64-NEXT: # kill: def $cx killed $cx killed $ecx ; AVX-64-NEXT: retq %2 = zext <3 x i8> %0 to <3 x i16> diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -1794,58 +1794,58 @@ ; SSE2OR3-LABEL: psubus_16i32_max: ; SSE2OR3: # %bb.0: # %vector.ph ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] -; SSE2OR3-NEXT: movdqa %xmm3, %xmm8 +; SSE2OR3-NEXT: movdqa %xmm5, %xmm8 ; SSE2OR3-NEXT: pxor %xmm7, %xmm8 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] ; SSE2OR3-NEXT: movdqa %xmm6, %xmm9 ; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9 ; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8 +; SSE2OR3-NEXT: pand %xmm9, %xmm5 +; SSE2OR3-NEXT: pxor %xmm8, %xmm9 +; SSE2OR3-NEXT: por %xmm5, %xmm9 +; SSE2OR3-NEXT: pslld $16, %xmm9 +; SSE2OR3-NEXT: psrad $16, %xmm9 +; SSE2OR3-NEXT: movdqa %xmm4, %xmm10 +; SSE2OR3-NEXT: pxor %xmm7, %xmm10 +; SSE2OR3-NEXT: movdqa %xmm6, %xmm5 +; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm5 +; SSE2OR3-NEXT: pand %xmm5, %xmm4 +; SSE2OR3-NEXT: pxor %xmm8, %xmm5 +; SSE2OR3-NEXT: por %xmm4, %xmm5 +; SSE2OR3-NEXT: pslld $16, %xmm5 +; SSE2OR3-NEXT: psrad $16, %xmm5 +; SSE2OR3-NEXT: packssdw %xmm9, %xmm5 +; SSE2OR3-NEXT: movdqa %xmm3, %xmm4 +; SSE2OR3-NEXT: pxor %xmm7, %xmm4 +; SSE2OR3-NEXT: movdqa %xmm6, %xmm9 +; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm9 ; SSE2OR3-NEXT: pand %xmm9, %xmm3 ; SSE2OR3-NEXT: pxor %xmm8, %xmm9 ; SSE2OR3-NEXT: por %xmm3, %xmm9 ; SSE2OR3-NEXT: pslld $16, %xmm9 ; SSE2OR3-NEXT: psrad $16, %xmm9 -; SSE2OR3-NEXT: movdqa %xmm2, %xmm3 -; SSE2OR3-NEXT: pxor %xmm7, %xmm3 -; SSE2OR3-NEXT: movdqa %xmm6, %xmm10 -; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm10 -; SSE2OR3-NEXT: pand %xmm10, %xmm2 -; SSE2OR3-NEXT: pxor %xmm8, %xmm10 -; SSE2OR3-NEXT: por %xmm2, %xmm10 -; SSE2OR3-NEXT: pslld $16, %xmm10 -; SSE2OR3-NEXT: psrad $16, %xmm10 -; SSE2OR3-NEXT: packssdw %xmm9, %xmm10 -; SSE2OR3-NEXT: psubusw %xmm10, %xmm0 -; SSE2OR3-NEXT: movdqa %xmm5, %xmm2 -; SSE2OR3-NEXT: pxor %xmm7, %xmm2 -; SSE2OR3-NEXT: movdqa %xmm6, %xmm3 -; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2OR3-NEXT: pand %xmm3, %xmm5 -; SSE2OR3-NEXT: pxor %xmm8, %xmm3 -; SSE2OR3-NEXT: por %xmm5, %xmm3 -; SSE2OR3-NEXT: pslld $16, %xmm3 -; SSE2OR3-NEXT: psrad $16, %xmm3 -; SSE2OR3-NEXT: pxor %xmm4, %xmm7 +; SSE2OR3-NEXT: pxor %xmm2, %xmm7 ; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm6 ; SSE2OR3-NEXT: pxor %xmm6, %xmm8 -; SSE2OR3-NEXT: pand %xmm4, %xmm6 +; SSE2OR3-NEXT: pand %xmm2, %xmm6 ; SSE2OR3-NEXT: por %xmm8, %xmm6 ; SSE2OR3-NEXT: pslld $16, %xmm6 ; SSE2OR3-NEXT: psrad $16, %xmm6 -; SSE2OR3-NEXT: packssdw %xmm3, %xmm6 -; SSE2OR3-NEXT: psubusw %xmm6, %xmm1 +; SSE2OR3-NEXT: packssdw %xmm9, %xmm6 +; SSE2OR3-NEXT: psubusw %xmm6, 
%xmm0 +; SSE2OR3-NEXT: psubusw %xmm5, %xmm1 ; SSE2OR3-NEXT: retq ; ; SSE41-LABEL: psubus_16i32_max: ; SSE41: # %bb.0: # %vector.ph ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535] +; SSE41-NEXT: pminud %xmm6, %xmm5 +; SSE41-NEXT: pminud %xmm6, %xmm4 +; SSE41-NEXT: packusdw %xmm5, %xmm4 ; SSE41-NEXT: pminud %xmm6, %xmm3 ; SSE41-NEXT: pminud %xmm6, %xmm2 ; SSE41-NEXT: packusdw %xmm3, %xmm2 ; SSE41-NEXT: psubusw %xmm2, %xmm0 -; SSE41-NEXT: pminud %xmm6, %xmm5 -; SSE41-NEXT: pminud %xmm6, %xmm4 -; SSE41-NEXT: packusdw %xmm5, %xmm4 ; SSE41-NEXT: psubusw %xmm4, %xmm1 ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/shift-mask.ll b/llvm/test/CodeGen/X86/shift-mask.ll --- a/llvm/test/CodeGen/X86/shift-mask.ll +++ b/llvm/test/CodeGen/X86/shift-mask.ll @@ -142,9 +142,9 @@ define i16 @test_i16_shl_lshr_2(i16 %a0) { ; X86-LABEL: test_i16_shl_lshr_2: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $2, %eax -; X86-NEXT: andl $16376, %eax # imm = 0x3FF8 +; X86-NEXT: andl $-8, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -411,7 +411,7 @@ define i16 @test_i16_lshr_lshr_1(i16 %a0) { ; X86-LABEL: test_i16_lshr_lshr_1: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrl $2, %eax ; X86-NEXT: andl $2047, %eax # imm = 0x7FF ; X86-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll @@ -463,8 +463,8 @@ ; SSE2-LABEL: shuffle_v16i8_to_v2i8_2: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rsi) @@ -541,8 +541,8 @@ ; SSE2-LABEL: shuffle_v16i8_to_v2i8_4: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rsi) @@ -619,8 +619,8 @@ ; SSE2-LABEL: shuffle_v16i8_to_v2i8_6: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rsi) diff --git a/llvm/test/CodeGen/X86/smax.ll b/llvm/test/CodeGen/X86/smax.ll --- a/llvm/test/CodeGen/X86/smax.ll +++ b/llvm/test/CodeGen/X86/smax.ll @@ -660,8 +660,8 @@ ; X64: # %bb.0: ; X64-NEXT: movswl %si, %eax ; X64-NEXT: movswl %di, %ecx -; X64-NEXT: sarl $15, %ecx -; X64-NEXT: sarl $8, %eax +; X64-NEXT: shrl $15, %ecx +; X64-NEXT: shrl $8, %eax ; X64-NEXT: cmpw %ax, %cx ; X64-NEXT: cmovgl %ecx, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax @@ -671,7 +671,7 @@ ; X86: # %bb.0: ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sarl $15, %eax +; X86-NEXT: shrl $15, %eax ; X86-NEXT: cmpw 
%cx, %ax ; X86-NEXT: cmovlel %ecx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/smin.ll b/llvm/test/CodeGen/X86/smin.ll --- a/llvm/test/CodeGen/X86/smin.ll +++ b/llvm/test/CodeGen/X86/smin.ll @@ -659,8 +659,8 @@ ; X64: # %bb.0: ; X64-NEXT: movswl %si, %eax ; X64-NEXT: movswl %di, %ecx -; X64-NEXT: sarl $15, %ecx -; X64-NEXT: sarl $8, %eax +; X64-NEXT: shrl $15, %ecx +; X64-NEXT: shrl $8, %eax ; X64-NEXT: cmpw %ax, %cx ; X64-NEXT: cmovll %ecx, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax @@ -670,7 +670,7 @@ ; X86: # %bb.0: ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sarl $15, %eax +; X86-NEXT: shrl $15, %eax ; X86-NEXT: cmpw %cx, %ax ; X86-NEXT: cmovgel %ecx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll --- a/llvm/test/CodeGen/X86/umax.ll +++ b/llvm/test/CodeGen/X86/umax.ll @@ -668,8 +668,8 @@ ; X64: # %bb.0: ; X64-NEXT: movswl %si, %eax ; X64-NEXT: movswl %di, %ecx -; X64-NEXT: sarl $15, %ecx -; X64-NEXT: sarl $8, %eax +; X64-NEXT: shrl $15, %ecx +; X64-NEXT: shrl $8, %eax ; X64-NEXT: cmpw %ax, %cx ; X64-NEXT: cmoval %ecx, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax @@ -679,7 +679,7 @@ ; X86: # %bb.0: ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sarl $15, %eax +; X86-NEXT: shrl $15, %eax ; X86-NEXT: cmpw %cx, %ax ; X86-NEXT: cmovbel %ecx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/umin.ll b/llvm/test/CodeGen/X86/umin.ll --- a/llvm/test/CodeGen/X86/umin.ll +++ b/llvm/test/CodeGen/X86/umin.ll @@ -668,8 +668,8 @@ ; X64: # %bb.0: ; X64-NEXT: movswl %si, %eax ; X64-NEXT: movswl %di, %ecx -; X64-NEXT: sarl $15, %ecx -; X64-NEXT: sarl $8, %eax +; X64-NEXT: shrl $15, %ecx +; X64-NEXT: shrl $8, %eax ; X64-NEXT: cmpw %ax, %cx ; X64-NEXT: cmovbl %ecx, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax @@ -679,7 +679,7 @@ ; X86: # %bb.0: ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sarl $15, %eax +; X86-NEXT: shrl $15, %eax ; X86-NEXT: cmpw %cx, %ax ; X86-NEXT: cmovael %ecx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/v8i1-masks.ll b/llvm/test/CodeGen/X86/v8i1-masks.ll --- a/llvm/test/CodeGen/X86/v8i1-masks.ll +++ b/llvm/test/CodeGen/X86/v8i1-masks.ll @@ -149,14 +149,18 @@ ; X32-AVX2: ## %bb.0: ; X32-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; X32-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] +; X32-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; X32-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X32-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: and_mask_constant: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] +; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: retq %m = icmp eq <8 x i32> %v0, zeroinitializer %mand = and <8 x i1> %m, diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll --- 
a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -99,19 +99,19 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; SSE-LABEL: test_bitreverse_i16: ; SSE: # %bb.0: -; SSE-NEXT: # kill: def $edi killed $edi def $rdi ; SSE-NEXT: rolw $8, %di -; SSE-NEXT: movl %edi, %eax +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: andl $3855, %ecx # imm = 0xF0F +; SSE-NEXT: shll $4, %ecx +; SSE-NEXT: shrl $4, %eax ; SSE-NEXT: andl $3855, %eax # imm = 0xF0F -; SSE-NEXT: shll $4, %eax -; SSE-NEXT: shrl $4, %edi -; SSE-NEXT: andl $3855, %edi # imm = 0xF0F -; SSE-NEXT: orl %eax, %edi -; SSE-NEXT: movl %edi, %eax +; SSE-NEXT: orl %ecx, %eax +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: andl $13107, %ecx # imm = 0x3333 +; SSE-NEXT: shrl $2, %eax ; SSE-NEXT: andl $13107, %eax # imm = 0x3333 -; SSE-NEXT: shrl $2, %edi -; SSE-NEXT: andl $13107, %edi # imm = 0x3333 -; SSE-NEXT: leal (%rdi,%rax,4), %eax +; SSE-NEXT: leal (%rax,%rcx,4), %eax ; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: andl $21845, %ecx # imm = 0x5555 ; SSE-NEXT: shrl %eax @@ -122,19 +122,19 @@ ; ; AVX-LABEL: test_bitreverse_i16: ; AVX: # %bb.0: -; AVX-NEXT: # kill: def $edi killed $edi def $rdi ; AVX-NEXT: rolw $8, %di -; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: andl $3855, %ecx # imm = 0xF0F +; AVX-NEXT: shll $4, %ecx +; AVX-NEXT: shrl $4, %eax ; AVX-NEXT: andl $3855, %eax # imm = 0xF0F -; AVX-NEXT: shll $4, %eax -; AVX-NEXT: shrl $4, %edi -; AVX-NEXT: andl $3855, %edi # imm = 0xF0F -; AVX-NEXT: orl %eax, %edi -; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: orl %ecx, %eax +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: andl $13107, %ecx # imm = 0x3333 +; AVX-NEXT: shrl $2, %eax ; AVX-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NEXT: shrl $2, %edi -; AVX-NEXT: andl $13107, %edi # imm = 0x3333 -; AVX-NEXT: leal (%rdi,%rax,4), %eax +; AVX-NEXT: leal (%rax,%rcx,4), %eax ; AVX-NEXT: movl %eax, %ecx ; AVX-NEXT: andl $21845, %ecx # imm = 0x5555 ; AVX-NEXT: shrl %eax @@ -153,19 +153,19 @@ ; ; GFNISSE-LABEL: test_bitreverse_i16: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: # kill: def $edi killed $edi def $rdi ; GFNISSE-NEXT: rolw $8, %di -; GFNISSE-NEXT: movl %edi, %eax +; GFNISSE-NEXT: movzwl %di, %eax +; GFNISSE-NEXT: movl %eax, %ecx +; GFNISSE-NEXT: andl $3855, %ecx # imm = 0xF0F +; GFNISSE-NEXT: shll $4, %ecx +; GFNISSE-NEXT: shrl $4, %eax ; GFNISSE-NEXT: andl $3855, %eax # imm = 0xF0F -; GFNISSE-NEXT: shll $4, %eax -; GFNISSE-NEXT: shrl $4, %edi -; GFNISSE-NEXT: andl $3855, %edi # imm = 0xF0F -; GFNISSE-NEXT: orl %eax, %edi -; GFNISSE-NEXT: movl %edi, %eax +; GFNISSE-NEXT: orl %ecx, %eax +; GFNISSE-NEXT: movl %eax, %ecx +; GFNISSE-NEXT: andl $13107, %ecx # imm = 0x3333 +; GFNISSE-NEXT: shrl $2, %eax ; GFNISSE-NEXT: andl $13107, %eax # imm = 0x3333 -; GFNISSE-NEXT: shrl $2, %edi -; GFNISSE-NEXT: andl $13107, %edi # imm = 0x3333 -; GFNISSE-NEXT: leal (%rdi,%rax,4), %eax +; GFNISSE-NEXT: leal (%rax,%rcx,4), %eax ; GFNISSE-NEXT: movl %eax, %ecx ; GFNISSE-NEXT: andl $21845, %ecx # imm = 0x5555 ; GFNISSE-NEXT: shrl %eax @@ -176,19 +176,19 @@ ; ; GFNIAVX-LABEL: test_bitreverse_i16: ; GFNIAVX: # %bb.0: -; GFNIAVX-NEXT: # kill: def $edi killed $edi def $rdi ; GFNIAVX-NEXT: rolw $8, %di -; GFNIAVX-NEXT: movl %edi, %eax +; GFNIAVX-NEXT: movzwl %di, %eax +; GFNIAVX-NEXT: movl %eax, %ecx +; GFNIAVX-NEXT: andl $3855, %ecx # imm = 0xF0F +; GFNIAVX-NEXT: shll $4, %ecx +; GFNIAVX-NEXT: shrl $4, %eax ; GFNIAVX-NEXT: andl $3855, %eax # imm = 0xF0F -; GFNIAVX-NEXT: 
shll $4, %eax -; GFNIAVX-NEXT: shrl $4, %edi -; GFNIAVX-NEXT: andl $3855, %edi # imm = 0xF0F -; GFNIAVX-NEXT: orl %eax, %edi -; GFNIAVX-NEXT: movl %edi, %eax +; GFNIAVX-NEXT: orl %ecx, %eax +; GFNIAVX-NEXT: movl %eax, %ecx +; GFNIAVX-NEXT: andl $13107, %ecx # imm = 0x3333 +; GFNIAVX-NEXT: shrl $2, %eax ; GFNIAVX-NEXT: andl $13107, %eax # imm = 0x3333 -; GFNIAVX-NEXT: shrl $2, %edi -; GFNIAVX-NEXT: andl $13107, %edi # imm = 0x3333 -; GFNIAVX-NEXT: leal (%rdi,%rax,4), %eax +; GFNIAVX-NEXT: leal (%rax,%rcx,4), %eax ; GFNIAVX-NEXT: movl %eax, %ecx ; GFNIAVX-NEXT: andl $21845, %ecx # imm = 0x5555 ; GFNIAVX-NEXT: shrl %eax diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -999,7 +999,7 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 @@ -1010,7 +1010,7 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 @@ -1021,7 +1021,7 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 @@ -1032,7 +1032,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 @@ -1052,7 +1052,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 @@ -1087,7 +1087,7 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll --- a/llvm/test/CodeGen/X86/vector-fshl-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -580,7 +580,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 @@ -597,7 +597,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm1 ; 
AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -767,7 +767,7 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX2-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -778,7 +778,7 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -789,7 +789,7 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -800,7 +800,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -811,7 +811,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -330,7 +330,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3 @@ -341,7 +341,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -1032,7 +1032,7 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1043,7 +1043,7 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1054,7 +1054,7 @@ ; ; 
AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1065,7 +1065,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1085,7 +1085,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1121,7 +1121,7 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v16i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -582,7 +582,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -600,7 +600,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,15] ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -801,7 +801,7 @@ ; ; AVX2-LABEL: splatvar_funnnel_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -812,7 +812,7 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -823,7 +823,7 @@ ; ; AVX512VL-LABEL: splatvar_funnnel_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -834,7 +834,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw 
%xmm3, %ymm0, %ymm3 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -845,7 +845,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -328,7 +328,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -339,7 +339,7 @@ ; ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -648,97 +648,99 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3],ymm5[4],ymm0[5,6,7],ymm5[8],ymm0[9,10,11],ymm5[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), 
%xmm2 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd 
{{.*#+}} xmm7 = xmm7[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: 
vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r8) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -828,91 +830,93 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3],ymm5[4],ymm0[5,6,7],ymm5[8],ymm0[9,10,11],ymm5[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5 +; 
AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm11, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm12 
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm5, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] -; AVX2-FAST-PERLANE-NEXT: 
vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r8) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -1433,163 +1437,169 @@ ; ; AVX2-SLOW-LABEL: vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $184, %rsp -; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = 
mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: subq $200, %rsp +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7],ymm3[8],ymm1[9,10,11],ymm3[12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-SLOW-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7],ymm3[8],ymm1[9,10,11],ymm3[12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd 
{{.*#+}} xmm2 = xmm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: 
vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[3,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm9 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm9 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = 
xmm7[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm7 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -1606,10 +1616,10 @@ ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1623,19 +1633,19 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = 
xmm8[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -1656,12 +1666,12 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm11, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqa %ymm13, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-SLOW-NEXT: addq $184, %rsp +; AVX2-SLOW-NEXT: addq $200, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1826,201 +1836,208 @@ ; ; AVX2-FAST-PERLANE-LABEL: vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $168, %rsp -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: subq $200, %rsp +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7],ymm3[8],ymm1[9,10,11],ymm3[12],ymm1[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: 
vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7],ymm3[8],ymm1[9,10,11],ymm3[12],ymm1[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm7, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb 
%xmm0, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm4, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm15, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm15, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm11, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 
= ymm12[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm15, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm2, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: 
vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = 
xmm5[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm14[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm12 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = 
xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] @@ -2033,12 +2050,12 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-PERLANE-NEXT: addq 
$168, %rsp +; AVX2-FAST-PERLANE-NEXT: addq $200, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -1322,33 +1322,33 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $424, %rsp # imm = 0x1A8 +; SSE-NEXT: subq $408, %rsp # imm = 0x198 ; SSE-NEXT: movdqa 304(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm8 -; SSE-NEXT: movdqa 256(%rdi), %xmm12 +; SSE-NEXT: movdqa 240(%rdi), %xmm12 +; SSE-NEXT: movdqa 256(%rdi), %xmm9 ; SSE-NEXT: movdqa 288(%rdi), %xmm6 -; SSE-NEXT: movdqa 272(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 272(%rdi), %xmm8 ; SSE-NEXT: movdqa 144(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm5 -; SSE-NEXT: movdqa 96(%rdi), %xmm9 +; SSE-NEXT: movdqa 96(%rdi), %xmm10 ; SSE-NEXT: movdqa 128(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm11 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] @@ -1360,43 +1360,42 @@ ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,1,0,3] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm14 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = 
xmm12[0,2,2,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] ; SSE-NEXT: movaps %xmm10, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm7 ; SSE-NEXT: andps %xmm10, %xmm3 ; SSE-NEXT: orps %xmm3, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] @@ -1408,22 +1407,21 @@ ; SSE-NEXT: andps %xmm10, %xmm3 ; SSE-NEXT: orps %xmm3, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] -; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: movdqa 208(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa 192(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa 176(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 160(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,2,2,3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] @@ -1435,17 +1433,17 @@ ; SSE-NEXT: andps %xmm10, %xmm2 ; SSE-NEXT: orps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm9 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -1461,16 +1459,17 @@ ; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: orps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,3,2,3] -; SSE-NEXT: pshufd $232, (%rsp), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -1478,25 +1477,24 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 ; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 
# 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -1512,16 +1510,16 @@ ; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -1537,21 +1535,20 @@ ; SSE-NEXT: pandn %xmm1, %xmm10 ; SSE-NEXT: por %xmm0, %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,1,3] +; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,0] @@ -1559,39 +1556,38 @@ ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps (%rsp), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm15[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: andnps 
%xmm0, %xmm1 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,0] @@ -1599,42 +1595,42 @@ ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm12[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm12[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,1,3] -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: andnps %xmm4, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0,1,3] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: andnps %xmm5, %xmm1 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: 
pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm13[3,0] -; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm13[3,0] +; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: andnps %xmm13, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm14[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm6[0,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] @@ -1643,178 +1639,173 @@ ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[3,0] -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: andnps %xmm15, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: pandn %xmm15, %xmm10 ; SSE-NEXT: shufps {{.*#+}} xmm15 = 
xmm15[0,1],xmm0[0,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[3,0] +; SSE-NEXT: movdqa %xmm3, %xmm15 +; SSE-NEXT: pandn %xmm11, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm15, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm0[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[3,0] -; SSE-NEXT: 
shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[0,2] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[3,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm5[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm14[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm11[3,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm4[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm13 -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,7,4,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm7[0,1,0,3] +; SSE-NEXT: pand %xmm3, %xmm13 +; SSE-NEXT: pandn %xmm11, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm2[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,7,4,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: por %xmm6, %xmm15 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,7,4,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm6[2,0] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm3[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,0] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,2,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm11[2,0] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,0] +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3] ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] -; SSE-NEXT: por %xmm2, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm2[2,0] -; SSE-NEXT: por %xmm13, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,1,3] -; SSE-NEXT: 
pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm2[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movaps %xmm15, 32(%r8) -; SSE-NEXT: movaps %xmm5, (%r8) -; SSE-NEXT: movaps %xmm8, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%r8) -; SSE-NEXT: movaps %xmm9, (%r9) -; SSE-NEXT: movaps %xmm12, 32(%r9) +; SSE-NEXT: por %xmm15, %xmm14 +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm4[2,0] +; SSE-NEXT: por %xmm13, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rcx) +; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps %xmm12, 32(%r8) +; SSE-NEXT: movaps %xmm6, (%r8) +; SSE-NEXT: movaps %xmm9, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%r8) +; SSE-NEXT: movaps %xmm3, 32(%r9) +; SSE-NEXT: movaps %xmm14, (%r9) ; SSE-NEXT: movaps %xmm0, 48(%r9) -; SSE-NEXT: movaps %xmm14, 16(%r9) -; SSE-NEXT: addq $424, %rsp # imm = 0x1A8 +; SSE-NEXT: movaps %xmm1, 16(%r9) +; SSE-NEXT: addq $408, %rsp # imm = 0x198 ; SSE-NEXT: retq ; ; AVX1-LABEL: vf32: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -1612,29 +1612,28 @@ define void @vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $456, %rsp # imm = 0x1C8 -; SSE-NEXT: movdqa 208(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: subq $472, %rsp # imm = 0x1D8 +; SSE-NEXT: movdqa 208(%rdi), %xmm4 ; SSE-NEXT: movdqa 224(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdi), %xmm2 ; SSE-NEXT: movdqa 80(%rdi), %xmm10 ; SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm14 +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm11 @@ -1651,22 +1650,25 @@ ; SSE-NEXT: pslld $16, %xmm10 ; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,3,2,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 
; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 192(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: movdqa 256(%rdi), %xmm4 @@ -1695,151 +1697,154 @@ ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa 112(%rdi), %xmm12 +; SSE-NEXT: movdqa 112(%rdi), %xmm13 ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa 160(%rdi), %xmm4 +; SSE-NEXT: movdqa 160(%rdi), %xmm3 ; SSE-NEXT: movdqa 176(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: movdqa 144(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa 304(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa 304(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 288(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa 352(%rdi), %xmm0 -; SSE-NEXT: movdqa 368(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[3,0] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa 352(%rdi), %xmm1 +; SSE-NEXT: movdqa 368(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] +; SSE-NEXT: movdqa %xmm12, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm1[3,0] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm12 ; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; SSE-NEXT: movdqa 336(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm1[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm11[1,3] -; 
SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm9 -; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pandn %xmm14, %xmm9 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[0,1,1,3,4,5,6,7] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm9[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,0] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; SSE-NEXT: movdqa 336(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm3[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm12[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: psrld $16, %xmm9 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pandn %xmm15, %xmm9 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[0,1,1,3,4,5,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm9[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, (%rsp), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm11 ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm12 +; SSE-NEXT: psrld $16, %xmm13 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = 
mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm13[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,0] ; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,5,7,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm5[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] @@ -1850,84 +1855,84 @@ ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,0,0] ; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm3, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm6[2,3] +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm12, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 ; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm15[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm15[2,3] -; SSE-NEXT: pshuflw {{.*#+}} 
xmm6 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm10[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: pand %xmm12, %xmm6 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm7 -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] ; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm8 ; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[0,1],mem[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm15[0,1,2,3,4,6,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0,1],mem[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = 
xmm2[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] ; SSE-NEXT: movdqa %xmm12, %xmm0 @@ -1935,67 +1940,70 @@ ; SSE-NEXT: pand %xmm12, %xmm8 ; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm6[0] +; SSE-NEXT: psrlq $48, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm11[0] ; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm9 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm9[0] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: psrlq $48, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm6, %xmm4 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] ; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: pand %xmm12, %xmm5 ; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: psrlq $48, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] +; 
SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: psrlq $48, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm4, %xmm11 -; SSE-NEXT: psrlq $48, %xmm14 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm14[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm2 @@ -2004,47 +2012,46 @@ ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 ; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm8[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm12, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: andps %xmm12, %xmm1 +; SSE-NEXT: por %xmm1, %xmm9 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = 
xmm13[0,1,0,3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm10[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm11[1] ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm12, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 ; SSE-NEXT: andps %xmm12, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1],mem[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: andps %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: por %xmm3, %xmm7 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -2054,106 +2061,106 @@ ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm9[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm14[1] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: andps %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: por %xmm2, %xmm5 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded 
Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,0,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm13[1] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 ; SSE-NEXT: andps %xmm12, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: psrld $16, %xmm10 +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: psrld $16, %xmm3 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm10[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm6[0],xmm2[1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 ; SSE-NEXT: andps %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: psrlq $48, %xmm15 -; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm9 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm6 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm9[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm13[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 
= mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: pandn %xmm8, %xmm15 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm12, %xmm14 +; SSE-NEXT: pandn %xmm6, %xmm14 ; SSE-NEXT: andps %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: por %xmm2, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm6 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm9[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm8[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: pandn %xmm8, %xmm13 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: andps %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm8 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: psrld $16, %xmm8 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 
= mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm9[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm8[0],xmm2[1,2,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm8[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm6[0],xmm2[1,2,3] ; SSE-NEXT: andps %xmm12, %xmm2 -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] -; SSE-NEXT: pandn %xmm8, %xmm12 +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] +; SSE-NEXT: pandn %xmm6, %xmm12 ; SSE-NEXT: por %xmm2, %xmm12 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rsi) @@ -2179,22 +2186,23 @@ ; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm7, 48(%r8) -; SSE-NEXT: movdqa %xmm11, 16(%r8) +; SSE-NEXT: movdqa %xmm10, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movdqa %xmm3, 48(%r9) -; SSE-NEXT: movdqa %xmm4, 16(%r9) -; SSE-NEXT: movdqa %xmm5, 32(%r9) -; SSE-NEXT: movdqa %xmm6, (%r9) +; SSE-NEXT: movdqa %xmm4, 48(%r9) +; SSE-NEXT: movdqa %xmm5, 16(%r9) +; SSE-NEXT: movdqa %xmm7, (%r9) +; SSE-NEXT: movdqa %xmm9, 32(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm12, 48(%rax) -; SSE-NEXT: movdqa %xmm13, 16(%rax) -; SSE-NEXT: movdqa %xmm15, 32(%rax) -; SSE-NEXT: movdqa %xmm1, (%rax) -; SSE-NEXT: addq $456, %rsp # imm = 0x1C8 +; SSE-NEXT: movdqa %xmm1, 16(%rax) +; SSE-NEXT: movdqa %xmm14, 32(%rax) +; SSE-NEXT: movdqa %xmm3, (%rax) +; SSE-NEXT: addq $472, %rsp # imm = 0x1D8 ; SSE-NEXT: retq ; ; AVX1-LABEL: vf32: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -107,23 +107,23 @@ ; ; AVX1-LABEL: load_i32_stride4_vf4: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-NEXT: vmovaps (%rdi), %xmm3 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm4 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,0] +; AVX1-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX1-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-NEXT: vmovaps 48(%rdi), %xmm3 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX1-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm3[1],xmm4[1],zero,zero -; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm0[2],xmm1[2] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = 
xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm4[3,0],xmm3[3,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[2,3] -; AVX1-NEXT: vmovaps %xmm2, (%rsi) +; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero +; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm2[2],xmm3[2] +; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; AVX1-NEXT: vmovaps %xmm4, (%rsi) ; AVX1-NEXT: vmovaps %xmm5, (%rdx) ; AVX1-NEXT: vmovaps %xmm6, (%rcx) ; AVX1-NEXT: vmovaps %xmm0, (%r8) @@ -247,37 +247,37 @@ ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1] ; AVX1-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[4],ymm0[4],ymm4[5],ymm0[5] ; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,0],ymm5[4,5],ymm3[6,4] -; AVX1-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX1-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] -; AVX1-NEXT: vmovaps (%rdi), %xmm8 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm9 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm10 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,0] -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX1-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,0],ymm4[1,0],ymm0[5,4],ymm4[5,4] -; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm10[2,0],ymm7[2,3],ymm10[6,4],ymm7[6,7] +; AVX1-NEXT: vmovaps (%rdi), %xmm5 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm6 +; AVX1-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX1-NEXT: vmovaps 48(%rdi), %xmm8 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0] ; AVX1-NEXT: vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-NEXT: vinsertps {{.*#+}} xmm11 = xmm8[1],xmm9[1],zero,zero -; AVX1-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX1-NEXT: vshufps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,0] +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] +; AVX1-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,0],ymm4[1,0],ymm0[5,4],ymm4[5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,0],ymm9[2,3],ymm10[6,4],ymm9[6,7] +; AVX1-NEXT: vinsertps {{.*#+}} xmm10 = xmm5[1],xmm6[1],zero,zero +; AVX1-NEXT: vunpcklps {{.*#+}} xmm11 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX1-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX1-NEXT: vunpckhps {{.*#+}} ymm11 = ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[6],ymm0[6],ymm4[7],ymm0[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4] -; AVX1-NEXT: vinsertps {{.*#+}} xmm11 = zero,zero,xmm5[2],xmm6[2] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm12 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm11 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] 
+; AVX1-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm7[2],xmm8[2] +; AVX1-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX1-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm4[3,0],ymm0[7,4],ymm4[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,3],ymm0[6,4],ymm1[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm9[3,0],xmm8[3,0] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm6[3,0],xmm5[3,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-NEXT: vmovaps %ymm7, (%rdx) +; AVX1-NEXT: vmovaps %ymm9, (%rdx) ; AVX1-NEXT: vmovaps %ymm10, (%rcx) ; AVX1-NEXT: vmovaps %ymm0, (%r8) ; AVX1-NEXT: vzeroupper @@ -487,7 +487,7 @@ ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] ; AVX1-NEXT: vmovaps %ymm3, %ymm14 -; AVX1-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovaps %ymm1, %ymm15 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] @@ -507,22 +507,24 @@ ; AVX1-NEXT: vshufps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,0] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm4[2,3,0,1] ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] ; AVX1-NEXT: vmovaps %ymm6, %ymm8 -; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm5[2,3,0,1] -; AVX1-NEXT: vunpcklps {{.*#+}} ymm13 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[4],ymm5[4],ymm9[5],ymm5[5] +; AVX1-NEXT: vmovaps %ymm4, %ymm12 +; AVX1-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm5[2,3,0,1] +; AVX1-NEXT: vunpcklps {{.*#+}} ymm13 = ymm11[0],ymm5[0],ymm11[1],ymm5[1],ymm11[4],ymm5[4],ymm11[5],ymm5[5] ; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm13[0,1],ymm0[2,0],ymm13[4,5],ymm0[6,4] -; AVX1-NEXT: vmovaps 32(%rdi), %xmm11 -; AVX1-NEXT: vmovaps 48(%rdi), %xmm12 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm11[0] -; AVX1-NEXT: vmovaps (%rdi), %xmm4 -; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps (%rdi), %xmm9 ; AVX1-NEXT: vmovaps 16(%rdi), %xmm5 ; AVX1-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX1-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 48(%rdi), %xmm0 +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; AVX1-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,0] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX1-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -537,64 +539,63 @@ ; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm5[1],xmm1[1],zero,zero ; AVX1-NEXT: vmovaps %xmm1, %xmm14 ; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] ; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX1-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm8, %ymm10 +; AVX1-NEXT: vmovaps %ymm12, %ymm0 +; AVX1-NEXT: vmovaps %ymm8, %ymm12 ; AVX1-NEXT: vunpcklps {{.*#+}} ymm3 = ymm8[0],ymm0[0],ymm8[1],ymm0[1],ymm8[4],ymm0[4],ymm8[5],ymm0[5] -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm2[1,0],ymm9[1,0],ymm2[5,4],ymm9[5,4] +; AVX1-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm2[1,0],ymm11[1,0],ymm2[5,4],ymm11[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm3[2,3],ymm6[6,4],ymm3[6,7] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm6 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm7 # 16-byte Folded Reload -; AVX1-NEXT: # xmm7 = mem[0],xmm8[1],zero,zero -; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm9[1],xmm8[1],zero,zero +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] ; AVX1-NEXT: vunpckhps {{.*#+}} ymm6 = ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[6],ymm15[6],ymm13[7],ymm15[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1],ymm3[2,0],ymm6[4,5],ymm3[6,4] ; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm5[2],xmm14[2],xmm5[3],xmm14[3] ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm1[2],xmm3[2] +; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm10[2],xmm3[2] +; AVX1-NEXT: vmovaps %xmm10, %xmm14 ; AVX1-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps %ymm10, %ymm14 -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] ; AVX1-NEXT: vmovaps %ymm0, %ymm5 -; AVX1-NEXT: vunpckhps {{.*#+}} ymm7 = ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[6],ymm2[6],ymm9[7],ymm2[7] +; AVX1-NEXT: vunpckhps {{.*#+}} ymm7 = ymm11[2],ymm2[2],ymm11[3],ymm2[3],ymm11[6],ymm2[6],ymm11[7],ymm2[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm6 = 
ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] -; AVX1-NEXT: vmovaps %xmm11, %xmm10 -; AVX1-NEXT: vmovaps %xmm12, %xmm11 -; AVX1-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm10[2],xmm12[2] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm8[2],xmm12[3],xmm8[3] -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm9[2],xmm1[2] +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-NEXT: vunpckhps (%rsp), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload ; AVX1-NEXT: # ymm4 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm15[3,0],ymm13[3,0],ymm15[7,4],ymm13[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm4[2,3],ymm2[6,4],ymm4[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload -; AVX1-NEXT: # xmm6 = xmm1[3,0],mem[3,0] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm14[2],xmm3[2],xmm14[3],xmm3[3] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload +; AVX1-NEXT: # xmm6 = xmm3[3,0],mem[3,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,0],xmm4[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} ymm3 = ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[6],ymm5[6],ymm14[7],ymm5[7] -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,0],ymm9[3,0],ymm1[7,4],ymm9[7,4] +; AVX1-NEXT: vunpckhps {{.*#+}} ymm3 = ymm12[2],ymm5[2],ymm12[3],ymm5[3],ymm12[6],ymm5[6],ymm12[7],ymm5[7] +; AVX1-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0],ymm11[3,0],ymm4[7,4],ymm11[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm8[3,0],xmm12[3,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[2,3] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm8[3,0],xmm10[3,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,0],xmm1[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm3, 32(%rsi) @@ -1028,7 +1029,7 @@ ; ; AVX1-LABEL: load_i32_stride4_vf32: ; AVX1: # %bb.0: -; AVX1-NEXT: subq $1000, %rsp # imm = 0x3E8 +; AVX1-NEXT: subq $984, %rsp # imm = 0x3D8 ; AVX1-NEXT: vmovaps 320(%rdi), %ymm3 ; AVX1-NEXT: vmovaps 352(%rdi), %ymm4 ; AVX1-NEXT: vmovaps 448(%rdi), %ymm2 @@ -1048,7 +1049,7 @@ ; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-NEXT: vmovaps 400(%rdi), %xmm5 -; AVX1-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovaps %xmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovaps 384(%rdi), %xmm2 ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] @@ -1081,28 +1082,28 @@ ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX1-NEXT: vmovaps 224(%rdi), %ymm7 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm7[2,3,0,1] -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX1-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[2,3,0,1] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX1-NEXT: vmovaps %ymm3, %ymm12 +; AVX1-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX1-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX1-NEXT: vmovaps %ymm1, %ymm13 +; AVX1-NEXT: vmovaps %ymm1, %ymm14 ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps %ymm0, %ymm12 +; AVX1-NEXT: vmovaps %ymm0, %ymm13 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4] ; AVX1-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovaps 176(%rdi), %xmm1 +; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] -; AVX1-NEXT: vmovaps %xmm1, %xmm14 +; AVX1-NEXT: vmovaps 144(%rdi), %xmm9 +; AVX1-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 144(%rdi), %xmm0 -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 128(%rdi), %xmm5 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX1-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vunpcklps {{.*#+}} xmm10 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; AVX1-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,0] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1117,16 +1118,16 @@ ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vunpcklps {{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4] -; AVX1-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX1-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovaps 16(%rdi), %xmm3 -; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 48(%rdi), %xmm5 +; AVX1-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0] ; AVX1-NEXT: vunpcklps {{.*#+}} xmm15 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-NEXT: vmovaps %xmm3, %xmm5 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,0] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1143,13 +1144,14 @@ ; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5] -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,0],ymm13[1,0],ymm12[5,4],ymm13[5,4] +; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[4],ymm7[4],ymm12[5],ymm7[5] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm14[1,0],ymm13[5,4],ymm14[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm5[1],xmm7[1],zero,zero +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[1],xmm9[1],zero,zero ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] +; AVX1-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload +; AVX1-NEXT: # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1161,7 +1163,7 @@ ; AVX1-NEXT: # ymm1 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm13[1],xmm1[1],zero,zero ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -1176,13 +1178,13 @@ ; AVX1-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm5, %xmm12 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm5[1],xmm12[1],zero,zero ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-NEXT: # xmm15 = mem[0],xmm15[1],zero,zero -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX1-NEXT: vunpcklps (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload @@ -1197,18 +1199,18 @@ ; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX1-NEXT: # ymm0 = ymm6[1],mem[1],ymm6[3],mem[3] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX1-NEXT: # ymm1 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] +; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX1-NEXT: # ymm0 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm15 # 16-byte Folded Reload -; AVX1-NEXT: # xmm15 = zero,zero,xmm4[2],mem[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm4[2],xmm6[2] ; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1216,7 +1218,7 @@ ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[6],ymm14[6],ymm2[7],ymm14[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm7[2],xmm13[3],xmm7[3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm3[2],xmm9[2] ; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] @@ -1229,21 +1231,21 @@ ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm5[2],xmm12[2] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; AVX1-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm15 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm9[2],xmm10[2] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7] +; AVX1-NEXT: vunpckhps {{.*#+}} ymm0 = 
ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm8[3,0],ymm1[7,4],ymm8[7,4] +; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: # xmm1 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,0],xmm11[3,0] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload +; AVX1-NEXT: # xmm4 = xmm11[3,0],mem[3,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,0],xmm1[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -1278,9 +1280,9 @@ ; AVX1-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm14[3,0],ymm13[3,0],ymm14[7,4],ymm13[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,3],ymm3[6,4],ymm2[6,7] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm10[3,0],xmm9[3,0] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX1-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm5 # 16-byte Folded Reload +; AVX1-NEXT: # xmm5 = xmm12[3,0],mem[3,0] ; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm5[2,0],xmm3[2,3] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -1310,7 +1312,7 @@ ; AVX1-NEXT: vmovaps %ymm1, 96(%r8) ; AVX1-NEXT: vmovaps %ymm2, (%r8) ; AVX1-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-NEXT: addq $1000, %rsp # imm = 0x3E8 +; AVX1-NEXT: addq $984, %rsp # imm = 0x3D8 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -298,13 +298,15 @@ ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512-NEXT: vpextrd $2, %xmm5, %r10d ; AVX512-NEXT: vpinsrd $3, %r10d, %xmm4, %xmm4 +; AVX512-NEXT: vpextrd $3, %xmm0, %r10d +; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,1,1] +; AVX512-NEXT: vpinsrd $1, %r10d, %xmm6, %xmm6 ; AVX512-NEXT: vpextrd $1, %xmm3, %r10d -; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm0[2,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] ; AVX512-NEXT: vpinsrd $2, %r10d, %xmm6, %xmm6 ; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm5[3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm1[0,1],xmm2[2,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,0,2,3] +; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm7 +; AVX512-NEXT: vmovd %xmm1, %r10d +; AVX512-NEXT: vpinsrd $1, %r10d, %xmm7, %xmm7 ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm3[2],xmm7[3] ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm8 ; AVX512-NEXT: vmovd %xmm8, %edi @@ -321,9 +323,10 @@ ; AVX512-NEXT: vpinsrd $2, %edi, %xmm3, %xmm3 ; AVX512-NEXT: 
vpextrd $2, %xmm8, %edi ; AVX512-NEXT: vpinsrd $3, %edi, %xmm3, %xmm3 +; AVX512-NEXT: vpextrd $3, %xmm1, %edi +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX512-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 ; AVX512-NEXT: vpextrd $1, %xmm5, %edi -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX512-NEXT: vpinsrd $2, %edi, %xmm0, %xmm0 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3] ; AVX512-NEXT: vmovdqa %xmm4, (%rsi) @@ -877,92 +880,94 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i32_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $360, %rsp # imm = 0x168 -; SSE-NEXT: movdqa 144(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: subq $392, %rsp # imm = 0x188 +; SSE-NEXT: movdqa 144(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rdi), %xmm4 -; SSE-NEXT: movdqa 96(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm15 ; SSE-NEXT: movdqa 112(%rdi), %xmm5 ; SSE-NEXT: movdqa 240(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 256(%rdi), %xmm7 -; SSE-NEXT: movdqa 192(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm14 -; SSE-NEXT: movdqa 64(%rdi), %xmm12 -; SSE-NEXT: movdqa (%rdi), %xmm9 -; SSE-NEXT: movdqa 16(%rdi), %xmm13 -; SSE-NEXT: movdqa 48(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: movdqa 192(%rdi), %xmm11 +; SSE-NEXT: movdqa 208(%rdi), %xmm10 +; SSE-NEXT: movdqa 64(%rdi), %xmm6 +; SSE-NEXT: movdqa (%rdi), %xmm14 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 
%xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] ; SSE-NEXT: movdqa %xmm7, %xmm10 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE-NEXT: movdqa %xmm5, %xmm7 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] ; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm2 -; SSE-NEXT: movdqa 304(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa 288(%rdi), %xmm13 +; SSE-NEXT: movdqa 304(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movdqa 336(%rdi), %xmm11 -; SSE-NEXT: movdqa 352(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,0,1,1] +; SSE-NEXT: movdqa 336(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] ; SSE-NEXT: 
movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] @@ -971,106 +976,108 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movdqa %xmm11, %xmm5 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: movdqa 80(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa 80(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] -; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa 32(%rdi), %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa 272(%rdi), %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: movdqa 272(%rdi), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] ; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa 368(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa 320(%rdi), %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] ; SSE-NEXT: movdqa 176(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa 128(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa 128(%rdi), %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa 368(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa 320(%rdi), %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm12 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa %xmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] @@ -1078,21 +1085,20 @@ ; 
SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, %xmm3 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -1100,19 +1106,18 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, %xmm14 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -1139,27 +1144,27 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 
32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rcx) ; SSE-NEXT: movapd %xmm11, 16(%r8) -; SSE-NEXT: movapd %xmm8, 48(%r8) +; SSE-NEXT: movapd %xmm9, 48(%r8) ; SSE-NEXT: movapd %xmm10, 32(%r8) ; SSE-NEXT: movapd %xmm15, (%r8) ; SSE-NEXT: movapd %xmm0, 48(%r9) ; SSE-NEXT: movapd %xmm4, 16(%r9) -; SSE-NEXT: movapd %xmm5, 32(%r9) -; SSE-NEXT: movapd %xmm6, (%r9) +; SSE-NEXT: movapd %xmm5, (%r9) +; SSE-NEXT: movapd %xmm7, 32(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm13, 16(%rax) -; SSE-NEXT: movapd %xmm9, (%rax) -; SSE-NEXT: movapd %xmm14, 32(%rax) -; SSE-NEXT: movapd %xmm3, 48(%rax) -; SSE-NEXT: addq $360, %rsp # imm = 0x168 +; SSE-NEXT: movapd %xmm6, (%rax) +; SSE-NEXT: movapd %xmm8, 32(%rax) +; SSE-NEXT: movapd %xmm14, 48(%rax) +; SSE-NEXT: addq $392, %rsp # imm = 0x188 ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i32_stride6_vf16: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll @@ -282,23 +282,23 @@ ; AVX1-NEXT: vmovaps 96(%rdi), %xmm13 ; AVX1-NEXT: vmovaps 64(%rdi), %xmm14 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm8[1] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm13[1] ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm8[1] ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1] ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm11[1],xmm10[1] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm13[1] -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX1-NEXT: vmovaps 48(%rdi), %xmm11 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm13 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm14 = xmm13[0],xmm11[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX1-NEXT: vmovaps 48(%rdi), %xmm10 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm11 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm14 = xmm11[0],xmm10[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX1-NEXT: vmovaps 176(%rdi), %xmm1 ; AVX1-NEXT: vmovaps 144(%rdi), %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm0[0],xmm1[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm1[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm13[1],xmm11[1] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm11[1],xmm10[1] ; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] @@ -309,13 +309,13 @@ ; AVX1-NEXT: vmovaps %xmm1, 32(%rsi) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-NEXT: vmovaps %xmm1, (%rsi) -; AVX1-NEXT: vmovaps %xmm9, 16(%rdx) ; AVX1-NEXT: vmovaps %xmm7, 48(%rdx) ; AVX1-NEXT: vmovaps %xmm6, (%rdx) +; AVX1-NEXT: vmovaps %xmm8, 32(%rdx) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm1, 32(%rdx) -; AVX1-NEXT: vmovaps %ymm8, 32(%rcx) -; 
AVX1-NEXT: vmovaps %ymm10, (%rcx) +; AVX1-NEXT: vmovaps %xmm1, 16(%rdx) +; AVX1-NEXT: vmovaps %ymm13, 32(%rcx) +; AVX1-NEXT: vmovaps %ymm9, (%rcx) ; AVX1-NEXT: vmovaps %ymm0, 32(%r8) ; AVX1-NEXT: vmovaps %ymm4, (%r8) ; AVX1-NEXT: vzeroupper @@ -598,132 +598,132 @@ ; AVX1-NEXT: vmovaps 320(%rdi), %xmm5 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX1-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 160(%rdi), %xmm7 -; AVX1-NEXT: vmovaps 128(%rdi), %xmm8 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm7[0] -; AVX1-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX1-NEXT: vmovaps (%rdi), %xmm10 -; AVX1-NEXT: vmovaps 288(%rdi), %xmm6 -; AVX1-NEXT: vmovaps 256(%rdi), %xmm11 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm11[0],xmm6[0] -; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm11[1],xmm6[1] -; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm10[0],xmm9[0] -; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm7[1] -; AVX1-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX1-NEXT: vmovaps 480(%rdi), %xmm1 +; AVX1-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX1-NEXT: vmovaps (%rdi), %xmm7 +; AVX1-NEXT: vmovaps 416(%rdi), %xmm8 +; AVX1-NEXT: vmovaps 384(%rdi), %xmm9 +; AVX1-NEXT: vmovaps 480(%rdi), %xmm10 +; AVX1-NEXT: vmovaps 448(%rdi), %xmm11 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm10[0] +; AVX1-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] +; AVX1-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0] +; AVX1-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm8[1] +; AVX1-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] +; AVX1-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 448(%rdi), %xmm2 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm10[1],xmm9[1] +; AVX1-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm7[1],xmm6[1] ; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] -; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovaps 256(%rdi), %xmm3 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX1-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX1-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-NEXT: vmovaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps 160(%rdi), %xmm2 ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovaps 224(%rdi), %ymm13 -; AVX1-NEXT: vmovaps 192(%rdi), %ymm12 -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm13[0],ymm12[2],ymm13[2] -; AVX1-NEXT: vmovaps 176(%rdi), %xmm11 -; AVX1-NEXT: vmovaps 144(%rdi), %xmm10 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm10[0],xmm11[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0] +; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-NEXT: vmovaps 304(%rdi), %xmm15 ; AVX1-NEXT: vmovaps 272(%rdi), %xmm14 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm14[0],xmm15[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm14[0],xmm15[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps 480(%rdi), %ymm8 -; AVX1-NEXT: vmovaps 448(%rdi), %ymm7 -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX1-NEXT: vmovaps 432(%rdi), %xmm5 -; AVX1-NEXT: vmovaps 400(%rdi), %xmm4 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm4[0],xmm5[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-NEXT: vmovaps 224(%rdi), %ymm12 +; AVX1-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] +; AVX1-NEXT: vmovaps 176(%rdi), %xmm10 +; AVX1-NEXT: vmovaps 144(%rdi), %xmm8 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm10[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX1-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm9 = xmm0[0],xmm1[0] -; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm11[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: vmovaps 96(%rdi), %ymm7 +; AVX1-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX1-NEXT: vmovaps 48(%rdi), %xmm5 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm4 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm13 = xmm4[0],xmm5[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovaps 480(%rdi), %ymm3 +; AVX1-NEXT: vmovaps 448(%rdi), %ymm2 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm9 = 
ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-NEXT: vmovaps 432(%rdi), %xmm1 +; AVX1-NEXT: vmovaps 400(%rdi), %xmm0 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm1[0] +; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm10[1] +; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm5[1] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm15[1] ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 112(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 96(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 64(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, (%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 32(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 80(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 16(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 48(%rsi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 32(%rdx) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 48(%rdx) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 64(%rdx) +; AVX1-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 80(%rdx) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, (%rdx) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 16(%rdx) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 96(%rdx) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm2, 112(%rdx) +; AVX1-NEXT: vmovaps %ymm9, 96(%rcx) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm14[1],xmm15[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 112(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 
16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 96(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 64(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, (%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 32(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 80(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 16(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 48(%rsi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 96(%rdx) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 112(%rdx) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, (%rdx) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 16(%rdx) -; AVX1-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 32(%rdx) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 48(%rdx) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 64(%rdx) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovaps %xmm3, 80(%rdx) -; AVX1-NEXT: vmovaps %ymm6, (%rcx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-NEXT: vmovaps %ymm2, 64(%r8) -; AVX1-NEXT: vmovaps %ymm1, 96(%r8) -; AVX1-NEXT: vmovaps %ymm0, (%r8) -; AVX1-NEXT: vmovaps %ymm10, 32(%r8) +; AVX1-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX1-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-NEXT: vmovaps %ymm0, 96(%r8) +; AVX1-NEXT: vmovaps %ymm4, (%r8) +; AVX1-NEXT: vmovaps %ymm8, 32(%r8) ; AVX1-NEXT: addq $296, %rsp # imm = 0x128 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -731,97 +731,97 @@ ; AVX2-LABEL: load_i64_stride4_vf16: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $296, %rsp # imm = 0x128 -; AVX2-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX2-NEXT: vmovaps 192(%rdi), %xmm6 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm0[0] -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX2-NEXT: vmovaps (%rdi), %xmm11 -; AVX2-NEXT: vmovaps 64(%rdi), %xmm10 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm9[0] -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 352(%rdi), %xmm4 -; AVX2-NEXT: vmovaps 320(%rdi), %xmm5 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm4[0] -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 160(%rdi), %xmm8 -; AVX2-NEXT: vmovaps 128(%rdi), %xmm12 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm8[0] -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 
288(%rdi), %xmm13 -; AVX2-NEXT: vmovaps 256(%rdi), %xmm14 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm13[0] -; AVX2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 416(%rdi), %xmm15 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm14[1],xmm13[1] -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 384(%rdi), %xmm13 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm0[1] +; AVX2-NEXT: vmovaps 224(%rdi), %xmm6 +; AVX2-NEXT: vmovaps 192(%rdi), %xmm9 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm9[0],xmm6[0] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 480(%rdi), %xmm0 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm8[1] -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 448(%rdi), %xmm12 -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm9[1] -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm0[0] -; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm12[1],xmm0[1] +; AVX2-NEXT: vmovaps 96(%rdi), %xmm5 +; AVX2-NEXT: vmovaps (%rdi), %xmm10 +; AVX2-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm5[0] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm13[0],xmm15[0] +; AVX2-NEXT: vmovaps 352(%rdi), %xmm8 +; AVX2-NEXT: vmovaps 320(%rdi), %xmm11 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm8[0] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm15[1] +; AVX2-NEXT: vmovaps 160(%rdi), %xmm13 +; AVX2-NEXT: vmovaps 416(%rdi), %xmm4 +; AVX2-NEXT: vmovaps 384(%rdi), %xmm14 +; AVX2-NEXT: vmovaps 480(%rdi), %xmm3 +; AVX2-NEXT: vmovaps 448(%rdi), %xmm15 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm15[0],xmm3[0] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm2[0] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm3[1] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm2[1] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm4[0] +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm4[1] +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 288(%rdi), %xmm14 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm5[1] +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 256(%rdi), %xmm15 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm8[1] +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm15[0],xmm14[0] +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm14[1] +; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovaps 128(%rdi), %xmm14 +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm6[1] +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm13[0] +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm13[1] +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm10[0],xmm6[0] +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm6[1] ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 160(%rdi), %ymm9 -; AVX2-NEXT: vmovaps 128(%rdi), %ymm8 -; AVX2-NEXT: vmovaps 224(%rdi), %ymm5 -; AVX2-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 256(%rdi), %ymm1 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 352(%rdi), %ymm14 -; AVX2-NEXT: vmovaps 320(%rdi), %ymm13 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm11[2,3] +; AVX2-NEXT: vmovaps 352(%rdi), %ymm13 +; AVX2-NEXT: vmovaps 320(%rdi), %ymm12 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm12[0],ymm13[0],ymm12[2],ymm13[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm6[2,3] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 160(%rdi), %ymm15 +; AVX2-NEXT: vmovaps 128(%rdi), %ymm7 +; AVX2-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX2-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm7[0],ymm15[0],ymm7[2],ymm15[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm10[2,3] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 32(%rdi), %ymm9 +; AVX2-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX2-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm10[2,3],ymm14[2,3] +; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 416(%rdi), %ymm10 -; AVX2-NEXT: vmovaps 384(%rdi), %ymm7 -; AVX2-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-NEXT: vmovaps 448(%rdi), %ymm3 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm15[2,3] -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm11[0],ymm2[2],ymm11[2] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] -; AVX2-NEXT: vperm2f128 
{{.*#+}} ymm5 = ymm5[2,3],ymm4[2,3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm11[1],ymm2[3],ymm11[3] +; AVX2-NEXT: vmovaps 384(%rdi), %ymm6 +; AVX2-NEXT: vmovaps 480(%rdi), %ymm4 +; AVX2-NEXT: vmovaps 448(%rdi), %ymm2 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm14[2,3],ymm11[2,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm7[1],ymm15[1],ymm7[3],ymm15[3] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm3[2,3],ymm0[2,3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] @@ -830,7 +830,7 @@ ; AVX2-NEXT: vmovaps %xmm3, 112(%rsi) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vmovaps %xmm3, 96(%rsi) -; AVX2-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vmovaps %xmm3, 64(%rsi) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vmovaps %xmm3, (%rsi) @@ -843,32 +843,32 @@ ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vmovaps %xmm3, 48(%rsi) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm3, 96(%rdx) +; AVX2-NEXT: vmovaps %xmm3, 32(%rdx) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm3, 112(%rdx) +; AVX2-NEXT: vmovaps %xmm3, 48(%rdx) +; AVX2-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload +; AVX2-NEXT: vmovaps %xmm3, 64(%rdx) +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-NEXT: vmovaps %xmm3, 80(%rdx) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vmovaps %xmm3, (%rdx) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-NEXT: vmovaps %xmm3, 16(%rdx) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm3, 32(%rdx) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm3, 48(%rdx) -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm3, 64(%rdx) +; AVX2-NEXT: vmovaps %xmm3, 96(%rdx) ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-NEXT: vmovaps %xmm3, 80(%rdx) -; AVX2-NEXT: vmovaps %ymm12, (%rcx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-NEXT: vmovaps %xmm3, 
112(%rdx) +; AVX2-NEXT: vmovaps %ymm11, 96(%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm3, 64(%rcx) ; AVX2-NEXT: vmovaps %ymm2, 64(%r8) ; AVX2-NEXT: vmovaps %ymm1, 96(%r8) ; AVX2-NEXT: vmovaps %ymm0, (%r8) -; AVX2-NEXT: vmovaps %ymm5, 32(%r8) +; AVX2-NEXT: vmovaps %ymm15, 32(%r8) ; AVX2-NEXT: addq $296, %rsp # imm = 0x128 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll @@ -343,25 +343,25 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 304(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] -; SSE-NEXT: movaps 80(%rdi), %xmm1 +; SSE-NEXT: movaps 304(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1] +; SSE-NEXT: movaps 80(%rdi), %xmm0 ; SSE-NEXT: movaps 32(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] -; SSE-NEXT: movaps 272(%rdi), %xmm1 -; SSE-NEXT: movaps 224(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: movaps 176(%rdi), %xmm1 -; SSE-NEXT: movaps 128(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 272(%rdi), %xmm0 +; SSE-NEXT: movaps 224(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 176(%rdi), %xmm0 +; SSE-NEXT: movaps 128(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] ; SSE-NEXT: movaps 368(%rdi), %xmm1 ; SSE-NEXT: movaps 320(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm2 @@ -381,11 +381,11 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rdx) ; SSE-NEXT: movaps %xmm12, 16(%rcx) -; SSE-NEXT: movaps %xmm9, 48(%rcx) +; SSE-NEXT: movaps %xmm7, 48(%rcx) ; SSE-NEXT: movaps %xmm15, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps %xmm7, 48(%r8) +; SSE-NEXT: movaps %xmm9, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -393,13 +393,13 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r8) ; 
SSE-NEXT: movaps %xmm2, 48(%r9) -; SSE-NEXT: movaps %xmm5, 16(%r9) -; SSE-NEXT: movaps %xmm6, 32(%r9) +; SSE-NEXT: movaps %xmm4, 16(%r9) ; SSE-NEXT: movaps %xmm10, (%r9) +; SSE-NEXT: movaps %xmm6, 32(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps %xmm4, 16(%rax) -; SSE-NEXT: movaps %xmm3, 32(%rax) +; SSE-NEXT: movaps %xmm3, 16(%rax) +; SSE-NEXT: movaps %xmm5, 32(%rax) ; SSE-NEXT: movaps %xmm8, (%rax) ; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -417,11 +417,11 @@ ; ; AVX1-LABEL: store_i64_stride6_vf8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovapd (%rdi), %ymm7 -; AVX1-NEXT: vmovapd 32(%rdi), %ymm12 -; AVX1-NEXT: vmovapd (%rsi), %ymm9 +; AVX1-NEXT: vmovapd (%rdi), %ymm6 +; AVX1-NEXT: vmovapd 32(%rdi), %ymm10 +; AVX1-NEXT: vmovapd (%rsi), %ymm7 ; AVX1-NEXT: vmovapd 32(%rsi), %ymm13 -; AVX1-NEXT: vmovapd (%r8), %ymm10 +; AVX1-NEXT: vmovapd (%r8), %ymm8 ; AVX1-NEXT: vmovapd 32(%r8), %ymm14 ; AVX1-NEXT: vmovapd 32(%r9), %ymm2 ; AVX1-NEXT: vmovaps 48(%rsi), %xmm0 @@ -432,30 +432,30 @@ ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-NEXT: vmovaps (%rsi), %xmm3 -; AVX1-NEXT: vmovaps 16(%rsi), %xmm5 -; AVX1-NEXT: vmovaps 32(%rsi), %xmm6 -; AVX1-NEXT: vmovaps (%rdi), %xmm4 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm11 -; AVX1-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm8[1],xmm6[1] -; AVX1-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-NEXT: vblendpd {{.*#+}} ymm15 = ymm14[0],ymm15[1,2,3] -; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0],ymm1[1],ymm15[2,3] +; AVX1-NEXT: vmovaps 32(%rsi), %xmm4 +; AVX1-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm4[1] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm14[0],ymm3[1,2,3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm1[1],ymm3[2,3] ; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] -; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] -; AVX1-NEXT: vbroadcastsd 16(%rcx), %ymm11 -; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7] -; AVX1-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm4[1],xmm3[1] +; AVX1-NEXT: vmovaps 16(%rsi), %xmm3 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm9 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm3[0],ymm9[2],ymm3[2] +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX1-NEXT: vbroadcastsd 16(%rcx), %ymm9 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] +; AVX1-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX1-NEXT: vmovaps (%rsi), %xmm11 +; AVX1-NEXT: vmovaps (%rdi), %xmm12 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm12[1],xmm11[1] ; AVX1-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-NEXT: vblendpd {{.*#+}} ymm15 = ymm10[0],ymm15[1,2,3] -; AVX1-NEXT: vblendpd {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2,3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm15 = ymm8[0],ymm15[1,2,3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2,3] ; AVX1-NEXT: vperm2f128 
{{.*#+}} ymm15 = ymm2[2,3],ymm13[2,3] -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] -; AVX1-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0],ymm15[0],ymm12[2],ymm15[3] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm13[1],ymm10[3],ymm13[3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm14[2,3],ymm10[2,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[0],ymm15[0],ymm10[2],ymm15[3] ; AVX1-NEXT: vmovaps 32(%rcx), %xmm14 ; AVX1-NEXT: vpermilps {{.*#+}} xmm13 = mem[2,3,2,3] ; AVX1-NEXT: vbroadcastsd 40(%r8), %ymm15 @@ -463,43 +463,43 @@ ; AVX1-NEXT: vinsertf128 $1, 32(%r9), %ymm14, %ymm15 ; AVX1-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7] ; AVX1-NEXT: vmovapd (%r9), %ymm15 -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3] -; AVX1-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[2],ymm9[3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm9 = mem[2,3,2,3] -; AVX1-NEXT: vbroadcastsd 8(%r8), %ymm10 -; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] -; AVX1-NEXT: vmovaps (%rcx), %xmm10 -; AVX1-NEXT: vinsertf128 $1, (%r9), %ymm10, %ymm0 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7] -; AVX1-NEXT: vmovapd 48(%rdx), %xmm9 -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],mem[1] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[2,3],ymm7[2,3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm8[2,3],ymm6[2,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[3] +; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = mem[2,3,2,3] +; AVX1-NEXT: vbroadcastsd 8(%r8), %ymm8 +; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] +; AVX1-NEXT: vmovaps (%rcx), %xmm8 +; AVX1-NEXT: vinsertf128 $1, (%r9), %ymm8, %ymm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] +; AVX1-NEXT: vmovapd 48(%rdx), %xmm7 +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],mem[1] ; AVX1-NEXT: vbroadcastsd 56(%r8), %ymm1 -; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2],ymm9[3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3] ; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] ; AVX1-NEXT: vmovapd 16(%rdx), %xmm2 ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],mem[1] -; AVX1-NEXT: vbroadcastsd 24(%r8), %ymm9 -; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm9[2],ymm2[3] +; AVX1-NEXT: vbroadcastsd 24(%r8), %ymm7 +; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3] ; AVX1-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm15[3] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm6[0] -; AVX1-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm8[0],xmm14[0] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-NEXT: vmovaps (%rdx), %xmm4 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm10[0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0] +; AVX1-NEXT: vmovaps 32(%rdx), %xmm5 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm14[0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm7 = xmm12[0],xmm11[0] +; AVX1-NEXT: vmovaps (%rdx), %xmm11 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm8[0] ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-NEXT: vmovaps %xmm4, 16(%rax) -; AVX1-NEXT: vmovaps %xmm3, (%rax) -; AVX1-NEXT: vmovaps %xmm8, 208(%rax) -; AVX1-NEXT: vmovaps %xmm6, 
192(%rax) +; AVX1-NEXT: vmovaps %xmm8, 16(%rax) +; AVX1-NEXT: vmovaps %xmm7, (%rax) +; AVX1-NEXT: vmovaps %xmm5, 208(%rax) +; AVX1-NEXT: vmovaps %xmm4, 192(%rax) ; AVX1-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-NEXT: vmovapd %ymm7, 128(%rax) +; AVX1-NEXT: vmovapd %ymm6, 128(%rax) ; AVX1-NEXT: vmovaps %ymm13, 256(%rax) -; AVX1-NEXT: vmovapd %ymm12, 320(%rax) -; AVX1-NEXT: vmovapd %ymm11, 32(%rax) -; AVX1-NEXT: vmovaps %ymm5, 96(%rax) +; AVX1-NEXT: vmovapd %ymm10, 320(%rax) +; AVX1-NEXT: vmovapd %ymm9, 32(%rax) +; AVX1-NEXT: vmovaps %ymm3, 96(%rax) ; AVX1-NEXT: vmovapd %ymm2, 160(%rax) ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm0, 224(%rax) diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll @@ -572,7 +572,7 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: setne %al @@ -588,7 +588,7 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: setne %al @@ -604,7 +604,7 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: testl %eax, %eax ; AVX512-NEXT: setne %al @@ -643,7 +643,7 @@ ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al @@ -660,7 +660,7 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al @@ -674,11 +674,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: testl %eax, %eax ; AVX512-NEXT: sete %al @@ -719,11 +719,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand 
%xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: setne %al @@ -738,11 +738,11 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: setne %al @@ -757,11 +757,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: testl %eax, %eax ; AVX512-NEXT: setne %al diff --git a/llvm/test/CodeGen/X86/vector-reduce-and.ll b/llvm/test/CodeGen/X86/vector-reduce-and.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and.ll @@ -489,11 +489,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -504,11 +504,11 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -519,11 +519,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -559,7 +559,7 @@ ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def 
$ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -575,7 +575,7 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -588,11 +588,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -630,11 +630,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -648,11 +648,11 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -666,11 +666,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-or.ll b/llvm/test/CodeGen/X86/vector-reduce-or.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or.ll @@ -489,11 +489,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed 
$ax killed $eax ; AVX1-NEXT: vzeroupper @@ -504,11 +504,11 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -519,11 +519,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -559,7 +559,7 @@ ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -575,7 +575,7 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -588,11 +588,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -630,11 +630,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -648,11 +648,11 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; 
AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -666,11 +666,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor.ll b/llvm/test/CodeGen/X86/vector-reduce-xor.ll --- a/llvm/test/CodeGen/X86/vector-reduce-xor.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor.ll @@ -489,11 +489,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -504,11 +504,11 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -519,11 +519,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -559,7 +559,7 @@ ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -575,7 +575,7 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -588,11 +588,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -630,11 +630,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -648,11 +648,11 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -666,11 +666,11 @@ ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -658,7 +658,7 @@ ; ; AVX2-LABEL: splatvar_rotate_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX2-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -669,7 +669,7 @@ ; ; AVX512F-LABEL: splatvar_rotate_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -680,7 +680,7 @@ ; ; AVX512VL-LABEL: splatvar_rotate_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -691,7 +691,7 @@ ; ; AVX512BW-LABEL: splatvar_rotate_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa 
{{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 @@ -702,7 +702,7 @@ ; ; AVX512VLBW-LABEL: splatvar_rotate_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3 diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll --- a/llvm/test/CodeGen/X86/vector-rotate-512.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll @@ -343,7 +343,7 @@ ; ; AVX512BW-LABEL: splatvar_rotate_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3 @@ -354,7 +354,7 @@ ; ; AVX512VLBW-LABEL: splatvar_rotate_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,15] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm4, %zmm3 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -2991,40 +2991,56 @@ define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i16> %b) { ; SSE2-LABEL: shuffle_extract_concat_insert: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pextrw $2, %xmm1, %ecx +; SSE2-NEXT: pextrw $5, %xmm2, %edx +; SSE2-NEXT: pextrw $7, %xmm2, %esi +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE2-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-NEXT: pinsrw $5, %edx, %xmm0 +; SSE2-NEXT: pinsrw $6, %eax, %xmm0 +; SSE2-NEXT: pinsrw $7, %esi, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_extract_concat_insert: ; SSSE3: # %bb.0: -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pextrw $2, %xmm1, %eax +; SSSE3-NEXT: pextrw $5, %xmm2, %ecx +; SSSE3-NEXT: movd %xmm1, %edx +; SSSE3-NEXT: pextrw $7, %xmm2, %esi ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = 
xmm0[8,9,2,3,0,1,14,15,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pinsrw $4, %eax, %xmm0 +; SSSE3-NEXT: pinsrw $5, %ecx, %xmm0 +; SSSE3-NEXT: pinsrw $6, %edx, %xmm0 +; SSSE3-NEXT: pinsrw $7, %esi, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_extract_concat_insert: ; SSE41: # %bb.0: -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] +; SSE41-NEXT: pextrw $2, %xmm1, %eax ; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,0,1,14,15,u,u,u,u,12,13,14,15] +; SSE41-NEXT: movd %xmm1, %ecx +; SSE41-NEXT: pinsrw $4, %eax, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5],xmm0[6,7] +; SSE41-NEXT: pinsrw $6, %ecx, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_extract_concat_insert: ; AVX: # %bb.0: -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpextrw $2, %xmm1, %eax +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,0,1,14,15,u,u,u,u,12,13,14,15] +; AVX-NEXT: vmovd %xmm1, %ecx +; AVX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5],xmm0[6,7] +; AVX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] ; AVX-NEXT: retq %a = shufflevector <4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i32> %a0 = extractelement <8 x i16> %a, i32 0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll @@ -364,8 +364,8 @@ ; AMD10H: # %bb.0: ; AMD10H-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; AMD10H-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AMD10H-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AMD10H-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; AMD10H-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AMD10H-NEXT: packuswb %xmm0, %xmm0 ; AMD10H-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll --- a/llvm/test/CodeGen/X86/vector-zext.ll +++ b/llvm/test/CodeGen/X86/vector-zext.ll @@ -2571,10 +2571,9 @@ ; ; SSE41-LABEL: splatshuf_zext_v4i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatshuf_zext_v4i64: @@ -2711,11 +2710,8 @@ ; ; SSE41-LABEL: splatshuf_zext_v16i16: ; SSE41: # %bb.0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = 
xmm1[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatshuf_zext_v16i16: diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll --- a/llvm/test/CodeGen/X86/xor.ll +++ b/llvm/test/CodeGen/X86/xor.ll @@ -409,8 +409,8 @@ ; ; X64-WIN-LABEL: PR17487: ; X64-WIN: # %bb.0: +; X64-WIN-NEXT: andb $1, %cl ; X64-WIN-NEXT: movzbl %cl, %eax -; X64-WIN-NEXT: andl $1, %eax ; X64-WIN-NEXT: retq %tmp = insertelement <2 x i1> undef, i1 %tobool, i32 1 %tmp1 = zext <2 x i1> %tmp to <2 x i64>