Index: llvm/lib/CodeGen/RegAllocGreedy.cpp
===================================================================
--- llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -763,7 +763,7 @@
   bool AddPriorityToGlobal = TRI->addAllocPriorityToGlobalRanges();
   const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
   bool ForceGlobal = !ReverseLocal &&
-    (Size / SlotIndex::InstrDist) > (2 * RC.getNumRegs());
+    (Size / SlotIndex::InstrDist) > (2 * RCI.getNumAllocatableRegs(&RC));

   if (ExtraRegInfo[Reg].Stage == RS_Assign && !ForceGlobal && !LI->empty() &&
       LIS->intervalIsInOneMBB(*LI)) {
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -485,15 +485,40 @@
 ; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
+; GCN-NEXT: global_load_dwordx4 v[5:8], v[3:4], off offset:16
 ; GCN-NEXT: s_movk_i32 s4, 0x80
-; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16
-; GCN-NEXT: global_load_dwordx4 v[12:15], v[3:4], off offset:32
-; GCN-NEXT: global_load_dwordx4 v[56:59], v[3:4], off offset:48
 ; GCN-NEXT: s_mov_b32 s5, 0
+; GCN-NEXT: s_add_i32 s32, s32, 0x10000
+; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:32
+; GCN-NEXT: global_load_dwordx4 v[12:15], v[3:4], off offset:48
 ; GCN-NEXT: v_mov_b32_e32 v3, s4
 ; GCN-NEXT: v_mov_b32_e32 v4, s5
 ; GCN-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3
+; GCN-NEXT: s_movk_i32 s4, 0xc0
+; GCN-NEXT: v_mov_b32_e32 v6, s5
 ; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v4, vcc
+; GCN-NEXT: v_mov_b32_e32 v5, s4
+; GCN-NEXT: v_add_co_u32_e32 v60, vcc, v0, v5
+; GCN-NEXT: v_addc_co_u32_e32 v61, vcc, v1, v6, vcc
 ; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off
 ; GCN-NEXT: global_load_dwordx4 v[20:23],
v[0:1], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:32 @@ -502,128 +527,91 @@ ; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 ; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192 ; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:48 -; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v6, s5 -; GCN-NEXT: v_mov_b32_e32 v5, s4 -; GCN-NEXT: v_add_co_u32_e32 v60, vcc, v0, v5 -; GCN-NEXT: v_addc_co_u32_e32 v61, vcc, v1, v6, vcc +; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[56:59], v[60:61], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[60:63], v[60:61], off offset:48 ; GCN-NEXT: v_and_b32_e32 v0, 31, v2 ; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: v_add_u32_e32 v2, 0x100, v2 ; GCN-NEXT: v_add_u32_e32 v1, v2, v0 -; GCN-NEXT: s_add_i32 s32, s32, 0x10000 -; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[60:61], off offset:32 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], 
s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[60:63], v[60:61], off offset:48 +; GCN-NEXT: s_waitcnt vmcnt(12) ; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256 ; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260 ; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264 ; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:268 +; GCN-NEXT: s_waitcnt vmcnt(15) ; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:272 ; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:276 ; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:280 ; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:284 +; GCN-NEXT: s_waitcnt vmcnt(18) ; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:288 ; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:292 ; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:296 ; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:300 +; GCN-NEXT: s_waitcnt vmcnt(21) ; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:304 ; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:308 ; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:312 ; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:316 +; GCN-NEXT: s_waitcnt vmcnt(24) ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:320 ; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:324 ; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:328 ; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:332 +; GCN-NEXT: s_waitcnt vmcnt(27) ; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 ; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 ; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 ; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:336 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:340 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:344 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:348 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:352 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:356 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:360 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:380 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 
4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v16, v20 +; GCN-NEXT: v_mov_b32_e32 v17, v21 +; GCN-NEXT: v_mov_b32_e32 v18, v22 +; GCN-NEXT: v_mov_b32_e32 v19, v23 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:336 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:340 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:344 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:348 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:352 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:356 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:360 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:364 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:368 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:372 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:376 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:380 ; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:400 ; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404 ; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 ; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:616 
; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v20 -; GCN-NEXT: v_mov_b32_e32 v13, v21 -; GCN-NEXT: v_mov_b32_e32 v14, v22 -; GCN-NEXT: v_mov_b32_e32 v15, v23 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:432 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:436 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:440 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:444 +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:432 +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:436 +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:440 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:444 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:448 ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:452 ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:456 @@ -632,31 +620,10 @@ ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 ; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 ; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v11 -; GCN-NEXT: v_mov_b32_e32 v9, v12 -; GCN-NEXT: v_mov_b32_e32 v10, v13 -; GCN-NEXT: v_mov_b32_e32 v11, v14 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], 
s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:492 ; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:496 ; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:500 ; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:504 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -1025,10 +1025,11 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-LABEL: s_mul_i256: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s16, s0 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX7-NEXT: s_mul_i32 s17, s1, s8 -; GFX7-NEXT: s_mul_i32 s18, s0, s9 +; GFX7-NEXT: s_mul_i32 s18, s16, s9 ; GFX7-NEXT: s_add_u32 s17, s17, s18 ; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s17, v0 @@ -1041,12 +1042,12 @@ ; GFX7-NEXT: s_add_u32 s17, s17, s18 ; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: v_mul_hi_u32 v2, v2, s8 -; GFX7-NEXT: s_mul_i32 s19, s0, s10 +; GFX7-NEXT: s_mul_i32 s19, s16, s10 ; GFX7-NEXT: s_and_b32 s18, s18, 1 ; GFX7-NEXT: s_add_u32 s17, s17, s19 ; GFX7-NEXT: v_mov_b32_e32 v3, s9 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 -; GFX7-NEXT: v_mul_hi_u32 v4, s0, v3 +; GFX7-NEXT: v_mul_hi_u32 v4, s16, v3 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, s17, v2 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: s_add_i32 s18, s18, s19 @@ -1069,7 +1070,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-NEXT: v_mul_hi_u32 v5, v4, s8 ; GFX7-NEXT: s_and_b32 s19, s19, 1 -; GFX7-NEXT: s_mul_i32 s20, s0, s11 +; GFX7-NEXT: s_mul_i32 s20, s16, s11 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: s_add_u32 s17, s17, s20 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 @@ -1080,7 +1081,7 @@ ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v8, vcc, s18, v8 -; GFX7-NEXT: v_mul_hi_u32 v7, s0, v6 +; GFX7-NEXT: v_mul_hi_u32 v7, s16, v6 ; GFX7-NEXT: s_mul_i32 s17, s4, s8 ; GFX7-NEXT: s_mul_i32 s18, s3, s9 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v5, v3 @@ -1106,7 +1107,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_mul_hi_u32 v7, v5, s8 -; GFX7-NEXT: s_mul_i32 s21, s0, s12 +; GFX7-NEXT: s_mul_i32 s21, s16, s12 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: s_add_u32 s17, s17, s21 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 @@ -1128,7 +1129,7 @@ ; GFX7-NEXT: s_and_b32 s18, s18, 1 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; GFX7-NEXT: s_add_u32 s17, s17, s19 -; GFX7-NEXT: v_mul_hi_u32 v10, s0, v9 +; GFX7-NEXT: v_mul_hi_u32 v10, s16, v9 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc @@ -1152,7 +1153,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v7, s4 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_mul_hi_u32 v8, v7, s8 -; GFX7-NEXT: s_mul_i32 s22, s0, s13 +; GFX7-NEXT: s_mul_i32 s22, s16, s13 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: s_add_u32 s17, s17, s22 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 @@ -1183,7 +1184,7 @@ ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; GFX7-NEXT: s_add_u32 s17, 
s17, s20 -; GFX7-NEXT: v_mul_hi_u32 v13, s0, v12 +; GFX7-NEXT: v_mul_hi_u32 v13, s16, v12 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc @@ -1207,7 +1208,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v8, s5 ; GFX7-NEXT: v_mul_hi_u32 v10, v8, s8 ; GFX7-NEXT: s_and_b32 s19, s19, 1 -; GFX7-NEXT: s_mul_i32 s23, s0, s14 +; GFX7-NEXT: s_mul_i32 s23, s16, s14 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: s_add_u32 s17, s17, s23 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 @@ -1230,49 +1231,49 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX7-NEXT: v_mov_b32_e32 v15, s13 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GFX7-NEXT: v_mul_hi_u32 v16, s0, v15 +; GFX7-NEXT: v_mul_hi_u32 v16, s16, v15 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v14 ; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v16 ; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GFX7-NEXT: v_mov_b32_e32 v13, s14 ; GFX7-NEXT: s_mul_i32 s7, s7, s8 ; GFX7-NEXT: s_mul_i32 s17, s6, s9 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GFX7-NEXT: s_mul_i32 s16, s0, s8 +; GFX7-NEXT: s_mul_i32 s4, s4, s11 +; GFX7-NEXT: s_mul_i32 s11, s3, s12 +; GFX7-NEXT: s_mul_i32 s12, s2, s13 +; GFX7-NEXT: v_mul_hi_u32 v11, s2, v12 ; GFX7-NEXT: s_mul_i32 s5, s5, s10 -; GFX7-NEXT: s_mul_i32 s15, s0, s15 -; GFX7-NEXT: v_mul_hi_u32 v13, s0, v13 -; GFX7-NEXT: s_add_i32 s0, s7, s17 +; GFX7-NEXT: s_mul_i32 s13, s1, s14 +; GFX7-NEXT: v_mul_hi_u32 v12, s1, v15 +; GFX7-NEXT: s_add_i32 s1, s7, s17 ; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX7-NEXT: s_mul_i32 s4, s4, s11 -; GFX7-NEXT: s_add_i32 s0, s0, s5 +; GFX7-NEXT: s_add_i32 s1, s1, s5 +; GFX7-NEXT: s_add_i32 s1, s1, s4 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; GFX7-NEXT: v_mov_b32_e32 v10, s6 -; GFX7-NEXT: s_mul_i32 s11, s3, s12 -; GFX7-NEXT: s_add_i32 s0, s0, s4 -; GFX7-NEXT: s_mul_i32 s12, s2, s13 -; GFX7-NEXT: s_add_i32 s0, s0, s11 +; GFX7-NEXT: s_add_i32 s1, s1, s11 ; GFX7-NEXT: v_mul_hi_u32 v10, v10, s8 -; GFX7-NEXT: s_mul_i32 s13, s1, s14 -; GFX7-NEXT: s_add_i32 s0, s0, s12 +; GFX7-NEXT: s_add_i32 s1, s1, s12 ; GFX7-NEXT: v_mul_hi_u32 v8, v8, s9 -; GFX7-NEXT: s_add_i32 s0, s0, s13 +; GFX7-NEXT: s_mul_i32 s15, s16, s15 +; GFX7-NEXT: s_add_i32 s1, s1, s13 ; GFX7-NEXT: v_mul_hi_u32 v7, v7, s10 ; GFX7-NEXT: v_mul_hi_u32 v9, s3, v9 -; GFX7-NEXT: s_add_i32 s0, s0, s15 -; GFX7-NEXT: v_mul_hi_u32 v11, s2, v12 -; GFX7-NEXT: v_add_i32_e32 v10, vcc, s0, v10 -; GFX7-NEXT: v_mul_hi_u32 v12, s1, v15 +; GFX7-NEXT: s_add_i32 s1, s1, s15 +; GFX7-NEXT: v_add_i32_e32 v10, vcc, s1, v10 +; GFX7-NEXT: v_mov_b32_e32 v13, s14 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GFX7-NEXT: v_mul_hi_u32 v13, s16, v13 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX7-NEXT: s_mul_i32 s0, s0, s8 ; GFX7-NEXT: v_readfirstlane_b32 s1, v0 ; GFX7-NEXT: v_readfirstlane_b32 s2, v1 ; GFX7-NEXT: v_readfirstlane_b32 s3, v2 @@ -1280,15 +1281,15 @@ ; GFX7-NEXT: v_readfirstlane_b32 s5, v4 ; GFX7-NEXT: v_readfirstlane_b32 s6, v5 ; GFX7-NEXT: v_readfirstlane_b32 s7, v6 -; GFX7-NEXT: s_mov_b32 s0, s16 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_mul_i256: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s16, s0 ; GFX8-NEXT: v_mov_b32_e32 
v0, s8 -; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX8-NEXT: s_mul_i32 s17, s1, s8 -; GFX8-NEXT: s_mul_i32 s18, s0, s9 +; GFX8-NEXT: s_mul_i32 s18, s16, s9 ; GFX8-NEXT: s_add_u32 s17, s17, s18 ; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s17, v0 @@ -1301,12 +1302,12 @@ ; GFX8-NEXT: s_add_u32 s17, s17, s18 ; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: v_mul_hi_u32 v2, v2, s8 -; GFX8-NEXT: s_mul_i32 s19, s0, s10 +; GFX8-NEXT: s_mul_i32 s19, s16, s10 ; GFX8-NEXT: s_and_b32 s18, s18, 1 ; GFX8-NEXT: s_add_u32 s17, s17, s19 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: v_mul_hi_u32 v4, s0, v3 +; GFX8-NEXT: v_mul_hi_u32 v4, s16, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s17, v2 ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: s_add_i32 s18, s18, s19 @@ -1329,7 +1330,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: v_mul_hi_u32 v5, v4, s8 ; GFX8-NEXT: s_and_b32 s19, s19, 1 -; GFX8-NEXT: s_mul_i32 s20, s0, s11 +; GFX8-NEXT: s_mul_i32 s20, s16, s11 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: s_add_u32 s17, s17, s20 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 @@ -1340,7 +1341,7 @@ ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, s18, v8 -; GFX8-NEXT: v_mul_hi_u32 v7, s0, v6 +; GFX8-NEXT: v_mul_hi_u32 v7, s16, v6 ; GFX8-NEXT: s_mul_i32 s17, s4, s8 ; GFX8-NEXT: s_mul_i32 s18, s3, s9 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 @@ -1366,7 +1367,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_mul_hi_u32 v7, v5, s8 -; GFX8-NEXT: s_mul_i32 s21, s0, s12 +; GFX8-NEXT: s_mul_i32 s21, s16, s12 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: s_add_u32 s17, s17, s21 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 @@ -1388,7 +1389,7 @@ ; GFX8-NEXT: s_and_b32 s18, s18, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v11, v7 ; GFX8-NEXT: s_add_u32 s17, s17, s19 -; GFX8-NEXT: v_mul_hi_u32 v10, s0, v9 +; GFX8-NEXT: v_mul_hi_u32 v10, s16, v9 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc @@ -1412,7 +1413,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v7, s4 ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_mul_hi_u32 v8, v7, s8 -; GFX8-NEXT: s_mul_i32 s22, s0, s13 +; GFX8-NEXT: s_mul_i32 s22, s16, s13 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: s_add_u32 s17, s17, s22 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 @@ -1443,7 +1444,7 @@ ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v10, v8 ; GFX8-NEXT: s_add_u32 s17, s17, s20 -; GFX8-NEXT: v_mul_hi_u32 v13, s0, v12 +; GFX8-NEXT: v_mul_hi_u32 v13, s16, v12 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc @@ -1467,7 +1468,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v8, s5 ; GFX8-NEXT: v_mul_hi_u32 v10, v8, s8 ; GFX8-NEXT: s_and_b32 s19, s19, 1 -; GFX8-NEXT: s_mul_i32 s23, s0, s14 +; GFX8-NEXT: s_mul_i32 s23, s16, s14 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: s_add_u32 s17, s17, s23 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 @@ -1490,49 +1491,49 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX8-NEXT: v_mov_b32_e32 v15, s13 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11 -; GFX8-NEXT: v_mul_hi_u32 v16, s0, v15 +; GFX8-NEXT: v_mul_hi_u32 v16, s16, v15 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v14 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v16 
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11 -; GFX8-NEXT: v_mov_b32_e32 v13, s14 ; GFX8-NEXT: s_mul_i32 s7, s7, s8 ; GFX8-NEXT: s_mul_i32 s17, s6, s9 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6 -; GFX8-NEXT: s_mul_i32 s16, s0, s8 +; GFX8-NEXT: s_mul_i32 s4, s4, s11 +; GFX8-NEXT: s_mul_i32 s11, s3, s12 +; GFX8-NEXT: s_mul_i32 s12, s2, s13 +; GFX8-NEXT: v_mul_hi_u32 v11, s2, v12 ; GFX8-NEXT: s_mul_i32 s5, s5, s10 -; GFX8-NEXT: s_mul_i32 s15, s0, s15 -; GFX8-NEXT: v_mul_hi_u32 v13, s0, v13 -; GFX8-NEXT: s_add_i32 s0, s7, s17 +; GFX8-NEXT: s_mul_i32 s13, s1, s14 +; GFX8-NEXT: v_mul_hi_u32 v12, s1, v15 +; GFX8-NEXT: s_add_i32 s1, s7, s17 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: s_mul_i32 s4, s4, s11 -; GFX8-NEXT: s_add_i32 s0, s0, s5 +; GFX8-NEXT: s_add_i32 s1, s1, s5 +; GFX8-NEXT: s_add_i32 s1, s1, s4 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v10, v6 ; GFX8-NEXT: v_mov_b32_e32 v10, s6 -; GFX8-NEXT: s_mul_i32 s11, s3, s12 -; GFX8-NEXT: s_add_i32 s0, s0, s4 -; GFX8-NEXT: s_mul_i32 s12, s2, s13 -; GFX8-NEXT: s_add_i32 s0, s0, s11 +; GFX8-NEXT: s_add_i32 s1, s1, s11 ; GFX8-NEXT: v_mul_hi_u32 v10, v10, s8 -; GFX8-NEXT: s_mul_i32 s13, s1, s14 -; GFX8-NEXT: s_add_i32 s0, s0, s12 +; GFX8-NEXT: s_add_i32 s1, s1, s12 ; GFX8-NEXT: v_mul_hi_u32 v8, v8, s9 -; GFX8-NEXT: s_add_i32 s0, s0, s13 +; GFX8-NEXT: s_mul_i32 s15, s16, s15 +; GFX8-NEXT: s_add_i32 s1, s1, s13 ; GFX8-NEXT: v_mul_hi_u32 v7, v7, s10 ; GFX8-NEXT: v_mul_hi_u32 v9, s3, v9 -; GFX8-NEXT: s_add_i32 s0, s0, s15 -; GFX8-NEXT: v_mul_hi_u32 v11, s2, v12 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, s0, v10 -; GFX8-NEXT: v_mul_hi_u32 v12, s1, v15 +; GFX8-NEXT: s_add_i32 s1, s1, s15 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, s1, v10 +; GFX8-NEXT: v_mov_b32_e32 v13, s14 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v10, v8 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 +; GFX8-NEXT: v_mul_hi_u32 v13, s16, v13 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v12 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v13 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 +; GFX8-NEXT: s_mul_i32 s0, s0, s8 ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_readfirstlane_b32 s3, v2 @@ -1540,16 +1541,16 @@ ; GFX8-NEXT: v_readfirstlane_b32 s5, v4 ; GFX8-NEXT: v_readfirstlane_b32 s6, v5 ; GFX8-NEXT: v_readfirstlane_b32 s7, v6 -; GFX8-NEXT: s_mov_b32 s0, s16 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i256: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s16, s0 ; GFX9-NEXT: s_mul_i32 s17, s1, s8 -; GFX9-NEXT: s_mul_i32 s18, s0, s9 +; GFX9-NEXT: s_mul_i32 s18, s16, s9 ; GFX9-NEXT: s_add_u32 s17, s17, s18 ; GFX9-NEXT: s_cselect_b32 s18, 1, 0 -; GFX9-NEXT: s_mul_hi_u32 s19, s0, s8 +; GFX9-NEXT: s_mul_hi_u32 s19, s16, s8 ; GFX9-NEXT: s_and_b32 s18, s18, 1 ; GFX9-NEXT: s_add_u32 s17, s17, s19 ; GFX9-NEXT: s_cselect_b32 s19, 1, 0 @@ -1559,7 +1560,7 @@ ; GFX9-NEXT: s_mul_i32 s20, s1, s9 ; GFX9-NEXT: s_add_u32 s19, s19, s20 ; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_mul_i32 s21, s0, s10 +; GFX9-NEXT: s_mul_i32 s21, s16, s10 ; GFX9-NEXT: s_and_b32 s20, s20, 1 ; GFX9-NEXT: s_add_u32 s19, s19, s21 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 @@ -1569,7 +1570,7 @@ ; GFX9-NEXT: s_add_u32 s19, s19, s22 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 ; GFX9-NEXT: s_and_b32 s21, s21, 1 -; GFX9-NEXT: s_mul_hi_u32 s23, s0, s9 +; GFX9-NEXT: s_mul_hi_u32 s23, s16, s9 ; GFX9-NEXT: s_add_i32 s20, s20, s21 ; GFX9-NEXT: s_add_u32 s19, s19, s23 ; 
GFX9-NEXT: s_cselect_b32 s21, 1, 0 @@ -1588,7 +1589,7 @@ ; GFX9-NEXT: s_add_u32 s19, s19, s22 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 ; GFX9-NEXT: s_and_b32 s22, s22, 1 -; GFX9-NEXT: s_mul_i32 s23, s0, s11 +; GFX9-NEXT: s_mul_i32 s23, s16, s11 ; GFX9-NEXT: s_add_i32 s21, s21, s22 ; GFX9-NEXT: s_add_u32 s19, s19, s23 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 @@ -1603,7 +1604,7 @@ ; GFX9-NEXT: s_add_u32 s19, s19, s25 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 ; GFX9-NEXT: s_and_b32 s22, s22, 1 -; GFX9-NEXT: s_mul_hi_u32 s26, s0, s10 +; GFX9-NEXT: s_mul_hi_u32 s26, s16, s10 ; GFX9-NEXT: s_add_i32 s21, s21, s22 ; GFX9-NEXT: s_add_u32 s19, s19, s26 ; GFX9-NEXT: s_cselect_b32 s22, 1, 0 @@ -1627,7 +1628,7 @@ ; GFX9-NEXT: s_add_u32 s20, s20, s24 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 ; GFX9-NEXT: s_and_b32 s23, s23, 1 -; GFX9-NEXT: s_mul_i32 s25, s0, s12 +; GFX9-NEXT: s_mul_i32 s25, s16, s12 ; GFX9-NEXT: s_add_i32 s22, s22, s23 ; GFX9-NEXT: s_add_u32 s20, s20, s25 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 @@ -1647,7 +1648,7 @@ ; GFX9-NEXT: s_add_u32 s20, s20, s28 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 ; GFX9-NEXT: s_and_b32 s23, s23, 1 -; GFX9-NEXT: s_mul_hi_u32 s29, s0, s11 +; GFX9-NEXT: s_mul_hi_u32 s29, s16, s11 ; GFX9-NEXT: s_add_i32 s22, s22, s23 ; GFX9-NEXT: s_add_u32 s20, s20, s29 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 @@ -1676,7 +1677,7 @@ ; GFX9-NEXT: s_add_u32 s21, s21, s26 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_and_b32 s24, s24, 1 -; GFX9-NEXT: s_mul_i32 s27, s0, s13 +; GFX9-NEXT: s_mul_i32 s27, s16, s13 ; GFX9-NEXT: s_add_i32 s23, s23, s24 ; GFX9-NEXT: s_add_u32 s21, s21, s27 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 @@ -1701,7 +1702,7 @@ ; GFX9-NEXT: s_add_u32 s21, s21, s31 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_and_b32 s24, s24, 1 -; GFX9-NEXT: s_mul_hi_u32 s33, s0, s12 +; GFX9-NEXT: s_mul_hi_u32 s33, s16, s12 ; GFX9-NEXT: s_add_i32 s23, s23, s24 ; GFX9-NEXT: s_add_u32 s21, s21, s33 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 @@ -1735,7 +1736,7 @@ ; GFX9-NEXT: s_add_u32 s22, s22, s28 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 ; GFX9-NEXT: s_and_b32 s25, s25, 1 -; GFX9-NEXT: s_mul_i32 s29, s0, s14 +; GFX9-NEXT: s_mul_i32 s29, s16, s14 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s29 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 @@ -1765,7 +1766,7 @@ ; GFX9-NEXT: s_add_u32 s22, s22, s35 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 ; GFX9-NEXT: s_and_b32 s25, s25, 1 -; GFX9-NEXT: s_mul_hi_u32 s36, s0, s13 +; GFX9-NEXT: s_mul_hi_u32 s36, s16, s13 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s36 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 @@ -1787,7 +1788,7 @@ ; GFX9-NEXT: s_add_i32 s7, s7, s27 ; GFX9-NEXT: s_mul_i32 s29, s1, s14 ; GFX9-NEXT: s_add_i32 s7, s7, s28 -; GFX9-NEXT: s_mul_i32 s15, s0, s15 +; GFX9-NEXT: s_mul_i32 s15, s16, s15 ; GFX9-NEXT: s_add_i32 s7, s7, s29 ; GFX9-NEXT: s_mul_hi_u32 s6, s6, s8 ; GFX9-NEXT: s_add_i32 s7, s7, s15 @@ -1801,12 +1802,11 @@ ; GFX9-NEXT: s_mul_hi_u32 s2, s2, s12 ; GFX9-NEXT: s_add_i32 s2, s3, s2 ; GFX9-NEXT: s_mul_hi_u32 s1, s1, s13 -; GFX9-NEXT: s_mul_i32 s16, s0, s8 +; GFX9-NEXT: s_mul_i32 s0, s0, s8 ; GFX9-NEXT: s_add_i32 s1, s2, s1 -; GFX9-NEXT: s_mul_hi_u32 s0, s0, s14 -; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_add_i32 s7, s0, s24 -; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: s_mul_hi_u32 s8, s16, s14 +; GFX9-NEXT: s_add_i32 s1, s1, s8 +; GFX9-NEXT: s_add_i32 s7, s1, s24 ; GFX9-NEXT: s_mov_b32 s1, s17 ; GFX9-NEXT: s_mov_b32 s2, s18 ; GFX9-NEXT: s_mov_b32 s3, s19 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll 
=================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -538,14 +538,14 @@ ; GISEL-NEXT: v_cndmask_b32_e32 v0, v13, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v14, v4, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7 +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v5 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v5 -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 -; GISEL-NEXT: v_xor_b32_e32 v4, v11, v8 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v4 +; GISEL-NEXT: v_xor_b32_e32 v5, v11, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v6 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 @@ -555,7 +555,7 @@ ; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 ; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v6 ; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v5 ; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 ; GISEL-NEXT: v_trunc_f32_e32 v9, v9 @@ -568,13 +568,14 @@ ; GISEL-NEXT: v_mul_lo_u32 v14, v11, v9 ; GISEL-NEXT: v_mul_hi_u32 v16, v11, v8 ; GISEL-NEXT: v_mul_lo_u32 v15, v11, v8 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; GISEL-NEXT: v_mul_lo_u32 v14, v9, v15 ; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 ; GISEL-NEXT: v_mul_hi_u32 v17, v8, v15 ; GISEL-NEXT: v_mul_hi_u32 v15, v9, v15 +; GISEL-NEXT: v_xor_b32_e32 v4, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 @@ -625,18 +626,18 @@ ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_mul_lo_u32 v11, v3, v8 ; GISEL-NEXT: v_mul_lo_u32 v12, v2, v9 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_mul_hi_u32 v4, v2, v8 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v2, v8 ; GISEL-NEXT: v_mul_hi_u32 v8, v3, v8 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v11, v3, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 ; GISEL-NEXT: v_mul_hi_u32 v12, v2, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 @@ -644,14 +645,14 @@ ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 ; GISEL-NEXT: 
v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v5 ; GISEL-NEXT: v_mul_lo_u32 v11, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v13, v6, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, v6, v4 +; GISEL-NEXT: v_mul_hi_u32 v13, v6, v5 +; GISEL-NEXT: v_mul_lo_u32 v12, v6, v5 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 @@ -665,7 +666,7 @@ ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v7 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v4 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v12, s[4:5] ; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v8, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 @@ -680,8 +681,7 @@ ; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v6, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 @@ -1063,32 +1063,32 @@ ; CHECK-LABEL: v_sdiv_i64_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 +; CHECK-NEXT: v_cvt_f32_u32_e32 v3, 0x1000 ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 ; CHECK-NEXT: s_movk_i32 s6, 0xf000 -; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1 -; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v4 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 +; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; CHECK-NEXT: v_trunc_f32_e32 v4, v4 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 ; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 -; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, -1, v3 ; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, s6, v3 +; CHECK-NEXT: v_mul_lo_u32 v7, s6, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 -; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 -; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v3, v7 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc @@ -1096,7 +1096,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 +; 
CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc @@ -1107,18 +1107,18 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v4, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, -1, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, -1, v3 ; CHECK-NEXT: v_mul_lo_u32 v8, s6, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, s6, v2 +; CHECK-NEXT: v_mul_hi_u32 v10, s6, v3 +; CHECK-NEXT: v_mul_lo_u32 v9, s6, v3 ; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v10 ; CHECK-NEXT: v_mul_lo_u32 v8, v6, v9 -; CHECK-NEXT: v_mul_lo_u32 v10, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v5, v2, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v3, v7 +; CHECK-NEXT: v_mul_hi_u32 v5, v3, v9 ; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 ; CHECK-NEXT: s_movk_i32 s6, 0x1000 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 @@ -1127,7 +1127,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; CHECK-NEXT: v_mul_lo_u32 v8, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; CHECK-NEXT: v_mul_hi_u32 v10, v2, v7 +; CHECK-NEXT: v_mul_hi_u32 v10, v3, v7 ; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] @@ -1139,12 +1139,12 @@ ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v8 ; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 ; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v1, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v1, v3 ; CHECK-NEXT: v_mul_lo_u32 v6, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 +; CHECK-NEXT: v_mul_hi_u32 v7, v0, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -1153,19 +1153,19 @@ ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_mul_hi_u32 v6, v0, v4 ; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_mul_lo_u32 v5, 0, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, 0, v3 ; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, s6, v3 +; CHECK-NEXT: v_mul_lo_u32 v7, s6, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 @@ -1178,7 +1178,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 ; CHECK-NEXT: v_mov_b32_e32 v7, s7 -; CHECK-NEXT: v_add_i32_e32 v6, 
vcc, 1, v2 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v4, vcc ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 @@ -1193,12 +1193,12 @@ ; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %num, 4096 ret i64 %result @@ -1217,44 +1217,44 @@ ; GISEL-NEXT: s_mov_b32 s7, s6 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s9 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s9 ; GISEL-NEXT: s_sub_u32 s11, 0, s8 ; GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GISEL-NEXT: s_and_b32 s4, s4, 1 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GISEL-NEXT: s_subb_u32 s12, 0, s9 -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v5, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, s12, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, s11, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, s11, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, s11, v4 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, s12, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s11, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, s11, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, s11, v5 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v8, v5, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; 
GISEL-NEXT: v_mul_hi_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 @@ -1264,18 +1264,18 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v5, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, s12, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v6, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, s12, v5 ; GISEL-NEXT: v_mul_lo_u32 v10, s11, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, s11, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, s11, v4 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v12, s11, v5 +; GISEL-NEXT: v_mul_lo_u32 v11, s11, v5 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] @@ -1283,7 +1283,7 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 ; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] @@ -1294,35 +1294,35 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v1, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, v0, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v1, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 ; GISEL-NEXT: v_mov_b32_e32 v9, s9 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v1, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v6 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v8, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 
v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v7, s9, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, s8, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, s8, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, s8, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, s9, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s8, v6 +; GISEL-NEXT: v_mul_hi_u32 v11, s8, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, s8, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 @@ -1336,8 +1336,8 @@ ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s9, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v5 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc @@ -1357,49 +1357,49 @@ ; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s7 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s7 ; GISEL-NEXT: s_sub_u32 s8, 0, s6 ; GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GISEL-NEXT: s_and_b32 s4, s4, 1 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GISEL-NEXT: s_subb_u32 s9, 0, s7 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v6 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v5, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_mul_lo_u32 v7, s9, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, s8, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, s8, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3 -; GISEL-NEXT: v_mul_lo_u32 v9, s8, v4 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, s9, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s8, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, s8, v5 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GISEL-NEXT: v_mul_lo_u32 v9, s8, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v8, v5, v9 -; GISEL-NEXT: 
v_mul_lo_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v6 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GISEL-NEXT: v_mul_hi_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 @@ -1409,27 +1409,27 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v5, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, s9, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v6, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, s9, v5 ; GISEL-NEXT: v_mul_lo_u32 v10, s8, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, s8, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, s8, v4 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v12, s8, v5 +; GISEL-NEXT: v_mul_lo_u32 v11, s8, v5 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v6 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 ; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] @@ -1440,35 +1440,35 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 ; GISEL-NEXT: v_mov_b32_e32 v9, s7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: 
v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v3, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, v3, v6 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v8, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v7, s7, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, s6, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, s6, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, s6, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, s7, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s6, v6 +; GISEL-NEXT: v_mul_hi_u32 v11, s6, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, s6, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 @@ -1482,8 +1482,8 @@ ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s7, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v5 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc @@ -1497,34 +1497,34 @@ ; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v6 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v6 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x1000 ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 ; CGP-NEXT: s_movk_i32 s6, 0xf000 -; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; CGP-NEXT: v_mov_b32_e32 v7, v4 +; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; CGP-NEXT: v_mov_b32_e32 v7, v5 ; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 ; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 ; CGP-NEXT: 
v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 ; CGP-NEXT: s_movk_i32 s7, 0x1000 ; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 ; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 @@ -1537,7 +1537,7 @@ ; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 ; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 @@ -1568,7 +1568,7 @@ ; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 ; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 @@ -1593,7 +1593,7 @@ ; CGP-NEXT: v_mul_lo_u32 v10, v0, v8 ; CGP-NEXT: v_mul_hi_u32 v11, v0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 @@ -1644,32 +1644,32 @@ ; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 +; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 ; CGP-NEXT: v_trunc_f32_e32 v7, v7 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_mul_lo_u32 v8, -1, v4 +; CGP-NEXT: v_mul_lo_u32 v8, -1, v5 ; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 -; CGP-NEXT: v_mul_hi_u32 v11, s6, v4 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v4 +; CGP-NEXT: v_mul_hi_u32 v11, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v5 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_mul_lo_u32 v9, v7, v10 -; CGP-NEXT: v_mul_lo_u32 v11, v4, v8 -; CGP-NEXT: v_mul_hi_u32 v12, v4, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v5, v8 +; CGP-NEXT: v_mul_hi_u32 v12, v5, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_mul_hi_u32 v11, v4, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v5, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc @@ -1680,18 +1680,18 @@ ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v7, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v10, -1, v4 +; CGP-NEXT: v_mul_lo_u32 v10, -1, v5 ; CGP-NEXT: v_mul_lo_u32 v11, s6, v9 -; CGP-NEXT: v_mul_hi_u32 v13, s6, v4 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v4 +; CGP-NEXT: 
v_mul_hi_u32 v13, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v12, s6, v5 ; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 ; CGP-NEXT: v_mul_lo_u32 v11, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v8, v4, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v5, v10 +; CGP-NEXT: v_mul_hi_u32 v8, v5, v12 ; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 @@ -1700,7 +1700,7 @@ ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; CGP-NEXT: v_mul_lo_u32 v11, v9, v10 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 -; CGP-NEXT: v_mul_hi_u32 v13, v4, v10 +; CGP-NEXT: v_mul_hi_u32 v13, v5, v10 ; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] @@ -1712,30 +1712,30 @@ ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_mul_lo_u32 v8, v3, v4 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_mul_lo_u32 v8, v3, v5 ; CGP-NEXT: v_mul_lo_u32 v9, v2, v7 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: v_mul_hi_u32 v5, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; CGP-NEXT: v_mul_hi_u32 v4, v2, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v3, v5 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v8, v3, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CGP-NEXT: v_mul_hi_u32 v9, v2, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 @@ -1786,32 +1786,32 @@ ; CHECK-LABEL: v_sdiv_i64_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb +; CHECK-NEXT: v_cvt_f32_u32_e32 v3, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 ; CHECK-NEXT: s_mov_b32 s6, 0xffed2705 -; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1 -; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v4 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, 
v0, v2 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 +; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; CHECK-NEXT: v_trunc_f32_e32 v4, v4 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 ; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 -; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, -1, v3 ; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, s6, v3 +; CHECK-NEXT: v_mul_lo_u32 v7, s6, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 -; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 -; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v3, v7 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc @@ -1819,7 +1819,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc @@ -1830,18 +1830,18 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v4, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, -1, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, -1, v3 ; CHECK-NEXT: v_mul_lo_u32 v8, s6, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, s6, v2 +; CHECK-NEXT: v_mul_hi_u32 v10, s6, v3 +; CHECK-NEXT: v_mul_lo_u32 v9, s6, v3 ; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v10 ; CHECK-NEXT: v_mul_lo_u32 v8, v6, v9 -; CHECK-NEXT: v_mul_lo_u32 v10, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v5, v2, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v3, v7 +; CHECK-NEXT: v_mul_hi_u32 v5, v3, v9 ; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 ; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 @@ -1850,7 +1850,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; CHECK-NEXT: v_mul_lo_u32 v8, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; CHECK-NEXT: v_mul_hi_u32 v10, v2, v7 +; CHECK-NEXT: v_mul_hi_u32 v10, v3, v7 ; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] @@ -1862,12 +1862,12 @@ ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v8 ; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 ; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v1, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v1, v3 ; CHECK-NEXT: v_mul_lo_u32 v6, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 +; CHECK-NEXT: v_mul_hi_u32 v7, v0, v3 +; 
CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -1876,19 +1876,19 @@ ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_mul_hi_u32 v6, v0, v4 ; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_mul_lo_u32 v5, 0, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, 0, v3 ; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, s6, v3 +; CHECK-NEXT: v_mul_lo_u32 v7, s6, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 @@ -1901,7 +1901,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 ; CHECK-NEXT: v_mov_b32_e32 v7, s7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v2 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v4, vcc ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 @@ -1916,12 +1916,12 @@ ; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %num, 1235195 ret i64 %result @@ -1940,44 +1940,44 @@ ; GISEL-NEXT: s_mov_b32 s7, s6 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s9 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s9 ; GISEL-NEXT: s_sub_u32 s11, 0, s8 ; GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GISEL-NEXT: s_and_b32 s4, s4, 1 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GISEL-NEXT: s_subb_u32 s12, 0, s9 -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v5, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; GISEL-NEXT: 
v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, s12, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, s11, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, s11, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, s11, v4 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, s12, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s11, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, s11, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, s11, v5 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v8, v5, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GISEL-NEXT: v_mul_hi_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 @@ -1987,18 +1987,18 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v5, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, s12, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v6, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, s12, v5 ; GISEL-NEXT: v_mul_lo_u32 v10, s11, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, s11, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, s11, v4 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v12, s11, v5 +; GISEL-NEXT: v_mul_lo_u32 v11, s11, v5 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] @@ -2006,7 +2006,7 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 ; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] @@ -2017,35 +2017,35 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, 
v4, v7 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v1, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, v0, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v1, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 ; GISEL-NEXT: v_mov_b32_e32 v9, s9 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v1, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v6 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v8, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v7, s9, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, s8, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, s8, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, s8, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, s9, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s8, v6 +; GISEL-NEXT: v_mul_hi_u32 v11, s8, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, s8, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 @@ -2059,8 +2059,8 @@ ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s9, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v5 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc @@ -2080,49 +2080,49 @@ ; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s7 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s7 ; GISEL-NEXT: s_sub_u32 s8, 0, s6 ; GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GISEL-NEXT: s_and_b32 s4, s4, 1 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GISEL-NEXT: s_subb_u32 s9, 0, s7 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v6 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 
0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v5, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_mul_lo_u32 v7, s9, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, s8, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, s8, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3 -; GISEL-NEXT: v_mul_lo_u32 v9, s8, v4 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, s9, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s8, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, s8, v5 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GISEL-NEXT: v_mul_lo_u32 v9, s8, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v8, v5, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v6 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GISEL-NEXT: v_mul_hi_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 @@ -2132,27 +2132,27 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v5, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, s9, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v6, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, s9, v5 ; GISEL-NEXT: v_mul_lo_u32 v10, s8, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, s8, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, s8, v4 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v12, s8, v5 +; GISEL-NEXT: v_mul_lo_u32 v11, s8, v5 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v6 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; 
GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 ; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] @@ -2163,35 +2163,35 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 ; GISEL-NEXT: v_mov_b32_e32 v9, s7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v3, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, v3, v6 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v8, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v7, s7, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, s6, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, s6, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, s6, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, s7, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s6, v6 +; GISEL-NEXT: v_mul_hi_u32 v11, s6, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, s6, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 @@ -2205,8 +2205,8 @@ ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s7, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v5 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc @@ -2220,34 +2220,34 @@ ; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: 
v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v6 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v6 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb +; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x12d8fb ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 ; CGP-NEXT: s_mov_b32 s6, 0xffed2705 -; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; CGP-NEXT: v_mov_b32_e32 v7, v4 +; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; CGP-NEXT: v_mov_b32_e32 v7, v5 ; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 ; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 ; CGP-NEXT: s_mov_b32 s7, 0x12d8fb ; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 ; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 @@ -2260,7 +2260,7 @@ ; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 ; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 @@ -2291,7 +2291,7 @@ ; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 ; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 @@ -2316,7 +2316,7 @@ ; CGP-NEXT: v_mul_lo_u32 v10, v0, v8 ; CGP-NEXT: v_mul_hi_u32 v11, v0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 @@ -2367,32 +2367,32 @@ ; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 +; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 ; CGP-NEXT: v_trunc_f32_e32 v7, v7 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_mul_lo_u32 v8, -1, v4 +; CGP-NEXT: v_mul_lo_u32 v8, -1, v5 ; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 -; CGP-NEXT: v_mul_hi_u32 v11, s6, v4 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v4 
+; CGP-NEXT: v_mul_hi_u32 v11, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v5 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_mul_lo_u32 v9, v7, v10 -; CGP-NEXT: v_mul_lo_u32 v11, v4, v8 -; CGP-NEXT: v_mul_hi_u32 v12, v4, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v5, v8 +; CGP-NEXT: v_mul_hi_u32 v12, v5, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_mul_hi_u32 v11, v4, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v5, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc @@ -2403,18 +2403,18 @@ ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v7, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v10, -1, v4 +; CGP-NEXT: v_mul_lo_u32 v10, -1, v5 ; CGP-NEXT: v_mul_lo_u32 v11, s6, v9 -; CGP-NEXT: v_mul_hi_u32 v13, s6, v4 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v4 +; CGP-NEXT: v_mul_hi_u32 v13, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v12, s6, v5 ; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 ; CGP-NEXT: v_mul_lo_u32 v11, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v8, v4, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v5, v10 +; CGP-NEXT: v_mul_hi_u32 v8, v5, v12 ; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 @@ -2423,7 +2423,7 @@ ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; CGP-NEXT: v_mul_lo_u32 v11, v9, v10 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 -; CGP-NEXT: v_mul_hi_u32 v13, v4, v10 +; CGP-NEXT: v_mul_hi_u32 v13, v5, v10 ; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] @@ -2435,30 +2435,30 @@ ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_mul_lo_u32 v8, v3, v4 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_mul_lo_u32 v8, v3, v5 ; CGP-NEXT: v_mul_lo_u32 v9, v2, v7 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: v_mul_hi_u32 v5, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; CGP-NEXT: v_mul_hi_u32 v4, v2, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v3, v5 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v8, v3, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; 
CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CGP-NEXT: v_mul_hi_u32 v9, v2, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 @@ -2703,22 +2703,22 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b64 s[6:7], 0x1000 -; GISEL-NEXT: v_lshl_b64 v[4:5], s[6:7], v4 +; GISEL-NEXT: v_lshl_b64 v[7:8], s[6:7], v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v1 -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v4, v7 -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v4 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v5 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v10, vcc -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v4 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v5 ; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 ; GISEL-NEXT: v_xor_b32_e32 v9, v0, v10 -; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v5, vcc +; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v7, vcc ; GISEL-NEXT: v_xor_b32_e32 v17, v1, v10 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v8 ; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v0 @@ -2808,51 +2808,51 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v13, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v14, v5, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, v5, v6 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v13 ; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], v17, v11, vcc ; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v17, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v5 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v7 +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v7, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v7 ; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v12, v13, v14, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v6 ; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v8, 
vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v13 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v13 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v14, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v9, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v13, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v14, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v6, v10, v7 -; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v8, v0, v7 -; GISEL-NEXT: v_xor_b32_e32 v9, v1, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc +; GISEL-NEXT: v_xor_b32_e32 v7, v10, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v8, v0, v4 +; GISEL-NEXT: v_xor_b32_e32 v9, v1, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v9 ; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_xor_b32_e32 v4, v4, v6 -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v6 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7 +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v7 ; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v10 @@ -2877,6 +2877,7 @@ ; GISEL-NEXT: v_mul_lo_u32 v16, v0, v13 ; GISEL-NEXT: v_mul_hi_u32 v17, v0, v15 ; GISEL-NEXT: v_mul_hi_u32 v15, v1, v15 +; GISEL-NEXT: v_xor_b32_e32 v4, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 @@ -2929,47 +2930,47 @@ ; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v2, v11 ; GISEL-NEXT: v_mul_lo_u32 v14, v3, v12 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v4, v6 -; GISEL-NEXT: v_mul_hi_u32 v4, v3, v11 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v6, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v2, v12 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v11 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v6, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v2, v12 ; GISEL-NEXT: v_mul_hi_u32 v11, v2, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GISEL-NEXT: v_mul_hi_u32 v6, v3, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: 
v_mul_hi_u32 v7, v3, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; GISEL-NEXT: v_mul_hi_u32 v11, v2, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_lo_u32 v6, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; GISEL-NEXT: v_mul_lo_u32 v7, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v6 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v12, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v12 -; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v2, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v6 +; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v2, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v7 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v9 ; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v8 ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v6, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v8 @@ -2981,10 +2982,9 @@ ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v8, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v10, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -3431,37 +3431,37 @@ ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3 ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v1 ; GISEL-NEXT: v_subb_u32_e32 v8, vcc, 0, v3, vcc -; GISEL-NEXT: v_and_b32_e32 v0, s6, v0 +; GISEL-NEXT: v_and_b32_e32 v6, s6, v6 ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_and_b32_e32 v6, s6, v6 -; GISEL-NEXT: v_and_b32_e32 v2, s6, v2 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v5, v5 -; GISEL-NEXT: v_mac_f32_e32 
v4, 0xcf800000, v5 +; GISEL-NEXT: v_and_b32_e32 v5, s6, v0 +; GISEL-NEXT: v_and_b32_e32 v0, s6, v2 +; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 +; GISEL-NEXT: v_trunc_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v8, v2 +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v12, v7, v2 +; GISEL-NEXT: v_mul_lo_u32 v11, v7, v2 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v10, v5, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, 0, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v14, v2, v11 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 0, v5 ; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v11, v4, v11 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v4, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 @@ -3471,18 +3471,18 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v5, v9, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v4, v9, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v8, v2 ; GISEL-NEXT: v_mul_lo_u32 v11, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, v7, v4 -; GISEL-NEXT: v_mul_hi_u32 v7, v7, v4 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v7, v2 +; GISEL-NEXT: v_mul_hi_u32 v7, v7, v2 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; GISEL-NEXT: v_mul_hi_u32 v9, v4, v12 +; GISEL-NEXT: v_mul_hi_u32 v9, v2, v12 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 ; GISEL-NEXT: v_mul_lo_u32 v8, v10, v12 -; GISEL-NEXT: v_mul_lo_u32 v11, v4, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, v2, v7 ; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] @@ -3490,7 +3490,7 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v9, v10, v7 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v11, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] @@ -3501,71 +3501,71 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 
; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v13, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v9, v0, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v4, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v8, v5, v4 +; GISEL-NEXT: v_mul_hi_u32 v9, v5, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, v13, v2 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v13, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v8, v5, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v7, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v1, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, v1, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, v1, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v2 +; GISEL-NEXT: v_mul_lo_u32 v8, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v10, v1, v2 +; GISEL-NEXT: v_mul_lo_u32 v9, v1, v2 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 ; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v13, v7, vcc ; GISEL-NEXT: v_sub_i32_e64 v7, s[4:5], v13, v7 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 ; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v5, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3 ; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, v9, v10, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v4 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v2 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc +; GISEL-NEXT: 
v_add_i32_e32 v3, vcc, 1, v9 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v10, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, 0, v6 ; GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v6 ; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v7 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v5, vcc ; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v11 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v11 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc +; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GISEL-NEXT: v_trunc_f32_e32 v4, v4 ; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 @@ -3581,117 +3581,117 @@ ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; GISEL-NEXT: v_mul_lo_u32 v10, v4, v11 ; GISEL-NEXT: v_mul_lo_u32 v12, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v14, v3, v11 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, 0, v2 -; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, 0, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v3, v11 +; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v4, v9 ; GISEL-NEXT: v_mul_hi_u32 v11, v4, v11 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; GISEL-NEXT: v_mul_hi_u32 v12, v3, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v4, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v4, v9, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v3 -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v5, v3 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; GISEL-NEXT: v_addc_u32_e64 v3, s[4:5], v4, v9, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v8, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v3 +; GISEL-NEXT: v_mul_lo_u32 v11, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v5, v5, v0 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v12 +; 
GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v9, v0, v11 ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v10, v12 -; GISEL-NEXT: v_mul_lo_u32 v11, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v11 +; GISEL-NEXT: v_mul_lo_u32 v10, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v5 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v11, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v10, v5 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v3, v3, v5 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 -; GISEL-NEXT: v_mul_lo_u32 v8, v2, v4 -; GISEL-NEXT: v_mul_hi_u32 v9, v2, v3 -; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v0 -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v10, v9 +; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], v3, v5 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v0, v8 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1 +; GISEL-NEXT: v_mul_lo_u32 v5, v14, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, v13, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v2, vcc +; GISEL-NEXT: v_mul_hi_u32 v2, v13, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v14, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v14, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v14, v3 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v13, v4 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v2, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, 
vcc, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_lo_u32 v5, v7, v3 -; GISEL-NEXT: v_mul_lo_u32 v8, v6, v4 -; GISEL-NEXT: v_mul_hi_u32 v10, v6, v3 -; GISEL-NEXT: v_mul_lo_u32 v9, v6, v3 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v13, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v4, v7, v2 +; GISEL-NEXT: v_mul_lo_u32 v5, v6, v3 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v2 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v2 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v13, v8 +; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v14, v4, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v14, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v7 -; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v7, vcc +; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v7, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v6 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v7 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, v9, v10, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v3 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v2 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v9 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v10, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v5, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v5, v10, v6, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -1346,26 +1346,26 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s6, s9, 31 ; GFX8-NEXT: s_ashr_i32 s12, s1, 31 -; GFX8-NEXT: s_add_u32 s8, s8, s6 +; GFX8-NEXT: s_add_u32 s14, s8, s6 ; GFX8-NEXT: s_cselect_b32 s7, 1, 0 ; GFX8-NEXT: s_and_b32 s7, s7, 
1 ; GFX8-NEXT: s_cmp_lg_u32 s7, 0 -; GFX8-NEXT: s_addc_u32 s9, s9, s6 +; GFX8-NEXT: s_addc_u32 s15, s9, s6 ; GFX8-NEXT: s_add_u32 s0, s0, s12 ; GFX8-NEXT: s_cselect_b32 s7, 1, 0 ; GFX8-NEXT: s_and_b32 s7, s7, 1 ; GFX8-NEXT: s_cmp_lg_u32 s7, 0 ; GFX8-NEXT: s_mov_b32 s13, s12 ; GFX8-NEXT: s_addc_u32 s1, s1, s12 -; GFX8-NEXT: s_xor_b64 s[14:15], s[0:1], s[12:13] -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s15 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s14 +; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s8 ; GFX8-NEXT: s_mov_b32 s7, s6 -; GFX8-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7] +; GFX8-NEXT: s_xor_b64 s[14:15], s[14:15], s[6:7] ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_u32 s16, 0, s14 +; GFX8-NEXT: s_sub_u32 s16, 0, s8 ; GFX8-NEXT: s_cselect_b32 s0, 1, 0 ; GFX8-NEXT: s_and_b32 s0, s0, 1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1376,12 +1376,12 @@ ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: s_cmp_lg_u32 s0, 0 -; GFX8-NEXT: s_subb_u32 s17, 0, s15 +; GFX8-NEXT: s_subb_u32 s17, 0, s9 ; GFX8-NEXT: v_mul_lo_u32 v3, s17, v0 ; GFX8-NEXT: v_mul_lo_u32 v2, s16, v1 ; GFX8-NEXT: v_mul_hi_u32 v5, s16, v0 ; GFX8-NEXT: v_mul_lo_u32 v4, s16, v0 -; GFX8-NEXT: v_mov_b32_e32 v6, s15 +; GFX8-NEXT: v_mov_b32_e32 v6, s9 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_mul_lo_u32 v3, v1, v4 @@ -1438,19 +1438,19 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX8-NEXT: v_mul_hi_u32 v5, s8, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: v_mul_lo_u32 v2, s15, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, s14, v1 +; GFX8-NEXT: v_mul_hi_u32 v5, s14, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, s15, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s15 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v5, s9, v1 +; GFX8-NEXT: v_mul_lo_u32 v5, s15, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_mul_hi_u32 v3, s8, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, s9, v1 +; GFX8-NEXT: v_mul_hi_u32 v3, s14, v1 +; GFX8-NEXT: v_mul_hi_u32 v1, s15, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 @@ -1460,33 +1460,33 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_mul_lo_u32 v2, s15, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, s14, v1 -; GFX8-NEXT: v_mul_hi_u32 v7, s14, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, s14, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 +; GFX8-NEXT: v_mul_hi_u32 v7, s8, v0 +; GFX8-NEXT: v_mul_lo_u32 v5, s8, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s8, v5 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s14, v5 ; GFX8-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v2, vcc -; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v2 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v4 +; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s15, v2 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], 
s9, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v3 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v4 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 ; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s14, v3 +; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s8, v3 ; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v2, vcc ; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v0 ; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s14, v7 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s8, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v8 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 ; GFX8-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc @@ -1698,26 +1698,26 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s6, s9, 31 ; GFX9-NEXT: s_ashr_i32 s12, s1, 31 -; GFX9-NEXT: s_add_u32 s8, s8, s6 +; GFX9-NEXT: s_add_u32 s14, s8, s6 ; GFX9-NEXT: s_cselect_b32 s7, 1, 0 ; GFX9-NEXT: s_and_b32 s7, s7, 1 ; GFX9-NEXT: s_cmp_lg_u32 s7, 0 -; GFX9-NEXT: s_addc_u32 s9, s9, s6 +; GFX9-NEXT: s_addc_u32 s15, s9, s6 ; GFX9-NEXT: s_add_u32 s0, s0, s12 ; GFX9-NEXT: s_cselect_b32 s7, 1, 0 ; GFX9-NEXT: s_and_b32 s7, s7, 1 ; GFX9-NEXT: s_cmp_lg_u32 s7, 0 ; GFX9-NEXT: s_mov_b32 s13, s12 ; GFX9-NEXT: s_addc_u32 s1, s1, s12 -; GFX9-NEXT: s_xor_b64 s[14:15], s[0:1], s[12:13] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s15 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s14 +; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 ; GFX9-NEXT: s_mov_b32 s7, s6 -; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7] +; GFX9-NEXT: s_xor_b64 s[14:15], s[14:15], s[6:7] ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_u32 s16, 0, s14 +; GFX9-NEXT: s_sub_u32 s16, 0, s8 ; GFX9-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9-NEXT: s_and_b32 s0, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1728,7 +1728,7 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_subb_u32 s17, 0, s15 +; GFX9-NEXT: s_subb_u32 s17, 0, s9 ; GFX9-NEXT: v_mul_lo_u32 v3, s17, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s16, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 @@ -1785,19 +1785,19 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-NEXT: v_mul_lo_u32 v2, s15, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s14, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s14, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s15, v0 +; GFX9-NEXT: v_mov_b32_e32 v7, s15 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; 
GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1 +; GFX9-NEXT: v_mul_lo_u32 v4, s15, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s14, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, s15, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 @@ -1806,33 +1806,33 @@ ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, s15, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s14, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s14, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, s14, v0 -; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v6, s8, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 ; GFX9-NEXT: v_add3_u32 v2, v2, v3, v4 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s8, v6 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s14, v6 ; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v7, v2, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v4 -; GFX9-NEXT: v_sub_u32_e32 v2, s9, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 +; GFX9-NEXT: v_sub_u32_e32 v2, s15, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v3 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s14, v3 +; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s8, v3 ; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[0:1], 0, v2, vcc ; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], 1, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v10, s[0:1], 0, v1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v8 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 -; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s14, v7 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 +; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s8, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v9 ; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc Index: llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -21,43 +21,43 @@ ; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v0 ; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v0 +; CHECK-NEXT: v_xor_b32_e32 v3, v1, v0 ; CHECK-NEXT: v_xor_b32_e32 v0, v2, v0 -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v1 -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v0 -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v3 +; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v4, 
vcc, v4, v1 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v6 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v6, vcc -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, 0, v1 +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 +; CHECK-NEXT: v_subb_u32_e32 v8, vcc, 0, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v4, v1 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v5, v5 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v5 +; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2 +; CHECK-NEXT: v_trunc_f32_e32 v6, v6 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CHECK-NEXT: v_subb_u32_e32 v8, vcc, 0, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v3, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v5, v1 ; CHECK-NEXT: v_mul_lo_u32 v9, v8, v2 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v5 +; CHECK-NEXT: v_mul_lo_u32 v10, v7, v6 ; CHECK-NEXT: v_mul_hi_u32 v12, v7, v2 ; CHECK-NEXT: v_mul_lo_u32 v11, v7, v2 -; CHECK-NEXT: v_xor_b32_e32 v4, v4, v6 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CHECK-NEXT: v_mul_lo_u32 v10, v5, v11 +; CHECK-NEXT: v_mul_lo_u32 v10, v6, v11 ; CHECK-NEXT: v_mul_lo_u32 v12, v2, v9 ; CHECK-NEXT: v_mul_hi_u32 v13, v2, v11 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v11 +; CHECK-NEXT: v_mul_hi_u32 v11, v6, v11 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v13, v5, v9 +; CHECK-NEXT: v_mul_lo_u32 v13, v6, v9 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CHECK-NEXT: v_mul_hi_u32 v12, v2, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9 +; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v12 @@ -68,12 +68,12 @@ ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CHECK-NEXT: v_addc_u32_e64 v10, s[4:5], v5, v9, vcc +; CHECK-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v9, vcc ; CHECK-NEXT: v_mul_lo_u32 v8, v8, v2 ; CHECK-NEXT: v_mul_lo_u32 v11, v7, v10 ; CHECK-NEXT: v_mul_lo_u32 v12, v7, v2 ; CHECK-NEXT: v_mul_hi_u32 v7, v7, v2 -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v9 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 ; CHECK-NEXT: v_mul_hi_u32 v9, v2, v12 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 @@ -97,21 +97,21 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc +; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v9, v3, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v4, v2 +; CHECK-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, v4, v6 +; CHECK-NEXT: v_mul_hi_u32 v9, v4, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v5, v2 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; 
CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 +; CHECK-NEXT: v_mul_lo_u32 v9, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 +; CHECK-NEXT: v_mul_hi_u32 v8, v4, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 @@ -120,44 +120,44 @@ ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CHECK-NEXT: v_mul_lo_u32 v7, v0, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, v1, v5 -; CHECK-NEXT: v_mul_lo_u32 v8, v1, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v4, v2, vcc -; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v4, v2 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 +; CHECK-NEXT: v_mul_lo_u32 v6, v3, v6 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v3, v2 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 +; CHECK-NEXT: v_subb_u32_e64 v6, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v0 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v0 ; CHECK-NEXT: v_subb_u32_e32 v2, vcc, v2, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v3, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v4, v3 ; CHECK-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v2, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v0 ; CHECK-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v7, v1 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_xor_b32_e32 v2, v0, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v1, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v2, v6, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v2, v2, v1 +; CHECK-NEXT: v_xor_b32_e32 v3, v0, v1 +; CHECK-NEXT: 
v_sub_i32_e32 v0, vcc, v2, v1 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 ; CHECK-NEXT: BB0_2: ; %Flow @@ -205,23 +205,23 @@ ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: s_ashr_i32 s0, s5, 31 ; CHECK-NEXT: s_ashr_i32 s6, s3, 31 -; CHECK-NEXT: s_add_u32 s8, s2, s6 +; CHECK-NEXT: s_add_u32 s10, s2, s6 ; CHECK-NEXT: s_cselect_b32 s7, 1, 0 ; CHECK-NEXT: s_and_b32 s7, s7, 1 ; CHECK-NEXT: s_cmp_lg_u32 s7, 0 -; CHECK-NEXT: s_addc_u32 s9, s3, s6 -; CHECK-NEXT: s_add_u32 s10, s4, s0 +; CHECK-NEXT: s_addc_u32 s11, s3, s6 +; CHECK-NEXT: s_add_u32 s8, s4, s0 ; CHECK-NEXT: s_cselect_b32 s3, 1, 0 ; CHECK-NEXT: s_and_b32 s3, s3, 1 ; CHECK-NEXT: s_cmp_lg_u32 s3, 0 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: s_addc_u32 s11, s5, s0 -; CHECK-NEXT: s_xor_b64 s[10:11], s[10:11], s[0:1] -; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s10 -; CHECK-NEXT: v_cvt_f32_u32_e32 v1, s11 +; CHECK-NEXT: s_addc_u32 s9, s5, s0 +; CHECK-NEXT: s_xor_b64 s[8:9], s[8:9], s[0:1] +; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s8 +; CHECK-NEXT: v_cvt_f32_u32_e32 v1, s9 ; CHECK-NEXT: s_mov_b32 s7, s6 -; CHECK-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7] -; CHECK-NEXT: s_sub_u32 s3, 0, s10 +; CHECK-NEXT: s_xor_b64 s[10:11], s[10:11], s[6:7] +; CHECK-NEXT: s_sub_u32 s3, 0, s8 ; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CHECK-NEXT: s_cselect_b32 s0, 1, 0 @@ -233,8 +233,8 @@ ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 ; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 -; CHECK-NEXT: s_subb_u32 s5, 0, s11 -; CHECK-NEXT: v_mov_b32_e32 v6, s11 +; CHECK-NEXT: s_subb_u32 s5, 0, s9 +; CHECK-NEXT: v_mov_b32_e32 v6, s9 ; CHECK-NEXT: v_mul_lo_u32 v2, s5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s3, v1 ; CHECK-NEXT: v_mul_hi_u32 v5, s3, v0 @@ -295,19 +295,19 @@ ; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, s9, v0 -; CHECK-NEXT: v_mul_lo_u32 v3, s8, v1 -; CHECK-NEXT: v_mul_hi_u32 v5, s8, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, s9, v0 -; CHECK-NEXT: v_mov_b32_e32 v4, s9 +; CHECK-NEXT: v_mul_lo_u32 v2, s11, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, s10, v1 +; CHECK-NEXT: v_mul_hi_u32 v5, s10, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, s11, v0 +; CHECK-NEXT: v_mov_b32_e32 v4, s11 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, s9, v1 +; CHECK-NEXT: v_mul_lo_u32 v5, s11, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_mul_hi_u32 v3, s8, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, s9, v1 +; CHECK-NEXT: v_mul_hi_u32 v3, s10, v1 +; CHECK-NEXT: v_mul_hi_u32 v1, s11, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 @@ -317,31 +317,31 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_mul_lo_u32 v2, s11, v0 -; CHECK-NEXT: v_mul_lo_u32 v1, s10, v1 -; CHECK-NEXT: v_mul_lo_u32 v3, s10, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, s10, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, s9, v0 +; CHECK-NEXT: v_mul_lo_u32 v1, s8, v1 +; CHECK-NEXT: v_mul_lo_u32 v3, s8, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, s8, v0 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; CHECK-NEXT: 
v_sub_i32_e32 v1, vcc, s8, v3 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, s10, v3 ; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v4, v0, vcc -; CHECK-NEXT: v_sub_i32_e64 v0, s[0:1], s9, v0 -; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2 +; CHECK-NEXT: v_sub_i32_e64 v0, s[0:1], s11, v0 +; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v1 +; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v2 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2 ; CHECK-NEXT: v_subb_u32_e32 v0, vcc, v0, v6, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1] -; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s10, v1 +; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s8, v1 ; CHECK-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s11, v0 +; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s9, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 +; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s11, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s9, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; CHECK-NEXT: v_subrev_i32_e32 v4, vcc, s10, v3 +; CHECK-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 @@ -398,43 +398,43 @@ ; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc +; GISEL-NEXT: v_xor_b32_e32 v9, v4, v8 ; GISEL-NEXT: v_xor_b32_e32 v5, v5, v8 -; GISEL-NEXT: v_xor_b32_e32 v4, v4, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v10, vcc -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v9 +; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v5 +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v10 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v4 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v9 ; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 -; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 +; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v8 +; GISEL-NEXT: v_trunc_f32_e32 v10, v10 +; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v10 ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v10 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 ; GISEL-NEXT: v_mul_lo_u32 v13, v12, v8 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v14, v11, v10 ; GISEL-NEXT: v_mul_hi_u32 v16, v11, v8 ; GISEL-NEXT: v_mul_lo_u32 v15, v11, v8 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_mul_lo_u32 v14, v9, v15 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v15 ; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 ; GISEL-NEXT: v_mul_hi_u32 v17, 
v8, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v15 +; GISEL-NEXT: v_mul_hi_u32 v15, v10, v15 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v13 +; GISEL-NEXT: v_mul_lo_u32 v17, v10, v13 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 ; GISEL-NEXT: v_mul_hi_u32 v16, v8, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v10, v13 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 @@ -445,12 +445,12 @@ ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 -; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v9, v13, vcc +; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v10, v13, vcc ; GISEL-NEXT: v_mul_lo_u32 v12, v12, v8 ; GISEL-NEXT: v_mul_lo_u32 v15, v11, v14 ; GISEL-NEXT: v_mul_lo_u32 v16, v11, v8 ; GISEL-NEXT: v_mul_hi_u32 v11, v11, v8 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 ; GISEL-NEXT: v_mul_hi_u32 v13, v8, v16 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 @@ -474,21 +474,21 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v11, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc ; GISEL-NEXT: v_mul_lo_u32 v11, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v0, v10 ; GISEL-NEXT: v_mul_hi_u32 v13, v0, v8 ; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v9 +; GISEL-NEXT: v_mul_lo_u32 v13, v1, v10 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 @@ -497,82 +497,82 @@ ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v4, v8 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_lo_u32 v10, v9, v10 +; GISEL-NEXT: v_mul_lo_u32 v12, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v8, vcc +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v1, v8, vcc ; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 +; 
GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v5 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v0, v4 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v0, v9 ; GISEL-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v9 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v11, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v5 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v12, v1, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v4 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 -; GISEL-NEXT: v_xor_b32_e32 v4, v6, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v5 -; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v10 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 -; GISEL-NEXT: v_trunc_f32_e32 v7, v7 -; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v5 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 +; GISEL-NEXT: v_xor_b32_e32 v5, v7, v5 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v5 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v8 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v6 +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 +; GISEL-NEXT: v_trunc_f32_e32 v8, v8 +; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v8 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v8 -; GISEL-NEXT: v_mul_lo_u32 v12, v11, v6 -; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v6 -; GISEL-NEXT: 
v_mul_lo_u32 v14, v9, v6 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v10 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v11, v7 +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v15, v10, v7 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v7 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v14 -; GISEL-NEXT: v_mul_lo_u32 v15, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v6, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v14 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v14 +; GISEL-NEXT: v_mul_lo_u32 v15, v7, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v7, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v7, v12 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v12 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v12 +; GISEL-NEXT: v_mul_hi_u32 v15, v7, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 @@ -582,27 +582,27 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], v7, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v11, v6 -; GISEL-NEXT: v_mul_lo_u32 v14, v9, v13 -; GISEL-NEXT: v_mul_lo_u32 v15, v9, v6 -; GISEL-NEXT: v_mul_hi_u32 v9, v9, v6 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v12 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], v8, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v11, v7 +; GISEL-NEXT: v_mul_lo_u32 v14, v10, v13 +; GISEL-NEXT: v_mul_lo_u32 v15, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v10, v7 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v12 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; GISEL-NEXT: v_mul_hi_u32 v12, v6, v15 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v7, v15 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 ; GISEL-NEXT: v_mul_lo_u32 v11, v13, v15 -; GISEL-NEXT: v_mul_lo_u32 v14, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v14, v7, v10 ; GISEL-NEXT: v_mul_hi_u32 v15, v13, v15 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v13, v10 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v13, v10 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 @@ -611,70 +611,70 @@ ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc -; GISEL-NEXT: 
v_add_i32_e32 v6, vcc, v6, v11 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v6 -; GISEL-NEXT: v_mul_lo_u32 v11, v2, v7 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v10, vcc -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_mul_lo_u32 v10, v3, v7 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, v2, v8 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; GISEL-NEXT: v_mul_hi_u32 v4, v2, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v3, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v3, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v7, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v10, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v5, v6 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_lo_u32 v8, v5, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v6, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v6, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v6, v4 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v6 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v4 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v5 +; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v4, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v5 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: 
v_cndmask_b32_e64 v4, v4, v8, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 ; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v4 -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v9, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v5 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v8, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v8 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v8 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64: @@ -695,16 +695,16 @@ ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v0 ; CGP-NEXT: v_addc_u32_e32 v2, vcc, v5, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v1, v0 +; CGP-NEXT: v_xor_b32_e32 v3, v1, v0 ; CGP-NEXT: v_xor_b32_e32 v0, v2, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v0 -; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v11 -; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 +; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v11 +; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v4 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v11, v4, vcc -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v1 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v11, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v3 ; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v2 ; CGP-NEXT: v_trunc_f32_e32 v10, v10 @@ -712,12 +712,12 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10 ; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v3, v4 +; CGP-NEXT: v_xor_b32_e32 v4, v4, v1 ; CGP-NEXT: v_mul_lo_u32 v13, v12, v2 ; CGP-NEXT: v_mul_lo_u32 v14, v11, v10 ; CGP-NEXT: v_mul_hi_u32 v16, v11, v2 ; CGP-NEXT: v_mul_lo_u32 v15, v11, v2 -; CGP-NEXT: v_xor_b32_e32 v5, v5, v4 +; CGP-NEXT: v_xor_b32_e32 v5, v5, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_mul_lo_u32 v14, v10, v15 @@ -775,8 +775,8 @@ ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc ; CGP-NEXT: v_mul_lo_u32 v11, v5, v2 -; CGP-NEXT: v_mul_lo_u32 v12, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v13, v3, v2 +; CGP-NEXT: v_mul_lo_u32 v12, v4, v10 +; 
CGP-NEXT: v_mul_hi_u32 v13, v4, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc @@ -784,7 +784,7 @@ ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v5, v10 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v12, v3, v10 +; CGP-NEXT: v_mul_hi_u32 v12, v4, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v5, v10 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc @@ -796,42 +796,42 @@ ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_mul_lo_u32 v11, v0, v2 -; CGP-NEXT: v_mul_lo_u32 v10, v1, v10 -; CGP-NEXT: v_mul_lo_u32 v12, v1, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v1, v2 +; CGP-NEXT: v_mul_lo_u32 v10, v3, v10 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v3, v2 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v12 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v4, v12 ; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v5, v2, vcc ; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v3 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v0 ; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v0, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v11, vcc, v3, v1 +; CGP-NEXT: v_sub_i32_e32 v11, vcc, v4, v3 ; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v2, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v3 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v0 ; CGP-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v11, v1 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v11, v3 ; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 -; CGP-NEXT: v_xor_b32_e32 v2, v0, v4 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v2, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v1 +; CGP-NEXT: v_xor_b32_e32 v3, v0, v1 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v2, v1 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: BB2_2: ; %Flow2 @@ -870,16 +870,16 @@ ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v2 ; CGP-NEXT: v_addc_u32_e32 v4, vcc, v7, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v3, v2 +; CGP-NEXT: v_xor_b32_e32 v5, v3, v2 ; CGP-NEXT: v_xor_b32_e32 v2, v4, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v3 -; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2 -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v9 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v5 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2 +; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v9 +; 
CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v6 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v9, v6, vcc -; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v3 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v9, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v5 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 @@ -887,12 +887,12 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_subb_u32_e32 v10, vcc, 0, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v5, v6 +; CGP-NEXT: v_xor_b32_e32 v6, v6, v3 ; CGP-NEXT: v_mul_lo_u32 v11, v10, v4 ; CGP-NEXT: v_mul_lo_u32 v12, v9, v8 ; CGP-NEXT: v_mul_hi_u32 v14, v9, v4 ; CGP-NEXT: v_mul_lo_u32 v13, v9, v4 -; CGP-NEXT: v_xor_b32_e32 v7, v7, v6 +; CGP-NEXT: v_xor_b32_e32 v7, v7, v3 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 ; CGP-NEXT: v_mul_lo_u32 v12, v8, v13 @@ -950,8 +950,8 @@ ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc ; CGP-NEXT: v_mul_lo_u32 v9, v7, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v5, v8 -; CGP-NEXT: v_mul_hi_u32 v11, v5, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v6, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v6, v4 ; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc @@ -959,7 +959,7 @@ ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v11, v7, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v5, v8 +; CGP-NEXT: v_mul_hi_u32 v10, v6, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc @@ -971,42 +971,42 @@ ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_mul_lo_u32 v9, v2, v4 -; CGP-NEXT: v_mul_lo_u32 v8, v3, v8 -; CGP-NEXT: v_mul_lo_u32 v10, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 +; CGP-NEXT: v_mul_lo_u32 v8, v5, v8 +; CGP-NEXT: v_mul_lo_u32 v10, v5, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v5, v4 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_sub_i32_e32 v6, vcc, v6, v10 ; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v7, v4, vcc ; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v7, v4 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v5 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v2 ; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v2, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v5, v3 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v6, v5 ; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v4, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v2 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v2 ; CGP-NEXT: v_subb_u32_e32 v2, vcc, v4, v2, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v5 ; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; CGP-NEXT: v_cndmask_b32_e32 v3, v9, 
v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 -; CGP-NEXT: v_xor_b32_e32 v4, v2, v6 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v3, v6 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v4, v6, vcc +; CGP-NEXT: v_xor_b32_e32 v4, v4, v3 +; CGP-NEXT: v_xor_b32_e32 v5, v2, v3 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v4, v3 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: BB2_6: ; %Flow @@ -1043,32 +1043,32 @@ ; CHECK-LABEL: v_srem_i64_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 +; CHECK-NEXT: v_cvt_f32_u32_e32 v3, 0x1000 ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 ; CHECK-NEXT: s_movk_i32 s6, 0xf000 -; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1 -; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v4 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 +; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; CHECK-NEXT: v_trunc_f32_e32 v4, v4 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 ; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 -; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, -1, v3 ; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, s6, v3 +; CHECK-NEXT: v_mul_lo_u32 v7, s6, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 -; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 -; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v3, v7 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc @@ -1076,7 +1076,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc @@ -1087,18 +1087,18 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v4, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, -1, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, -1, v3 ; CHECK-NEXT: 
v_mul_lo_u32 v8, s6, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, s6, v2 +; CHECK-NEXT: v_mul_hi_u32 v10, s6, v3 +; CHECK-NEXT: v_mul_lo_u32 v9, s6, v3 ; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v10 ; CHECK-NEXT: v_mul_lo_u32 v8, v6, v9 -; CHECK-NEXT: v_mul_lo_u32 v10, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v5, v2, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v3, v7 +; CHECK-NEXT: v_mul_hi_u32 v5, v3, v9 ; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 ; CHECK-NEXT: s_movk_i32 s6, 0x1000 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 @@ -1107,7 +1107,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; CHECK-NEXT: v_mul_lo_u32 v8, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; CHECK-NEXT: v_mul_hi_u32 v10, v2, v7 +; CHECK-NEXT: v_mul_hi_u32 v10, v3, v7 ; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] @@ -1119,12 +1119,12 @@ ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v8 ; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 ; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v1, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v1, v3 ; CHECK-NEXT: v_mul_lo_u32 v6, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 +; CHECK-NEXT: v_mul_hi_u32 v7, v0, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -1133,30 +1133,30 @@ ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_mul_hi_u32 v6, v0, v4 ; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_mul_lo_u32 v5, 0, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, 0, v3 ; CHECK-NEXT: v_mul_lo_u32 v4, s6, v4 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s6, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, s6, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_mov_b32_e32 v5, s7 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v1, v2, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v1, v3, vcc +; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v3 ; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; CHECK-NEXT: v_subrev_i32_e32 v5, vcc, s6, v0 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; 
CHECK-NEXT: s_bfe_i32 s4, -1, 0x10000 @@ -1170,13 +1170,13 @@ ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 4096 ret i64 %result @@ -1195,44 +1195,44 @@ ; GISEL-NEXT: s_mov_b32 s7, s6 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s9 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s9 ; GISEL-NEXT: s_sub_u32 s11, 0, s8 ; GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GISEL-NEXT: s_and_b32 s4, s4, 1 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GISEL-NEXT: s_subb_u32 s12, 0, s9 -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v5, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, s12, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, s11, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, s11, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, s11, v4 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, s12, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s11, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, s11, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, s11, v5 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v8, v5, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GISEL-NEXT: v_mul_hi_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v7 +; 
GISEL-NEXT: v_mul_hi_u32 v7, v6, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 @@ -1242,18 +1242,18 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v5, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, s12, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v6, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, s12, v5 ; GISEL-NEXT: v_mul_lo_u32 v10, s11, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, s11, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, s11, v4 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v12, s11, v5 +; GISEL-NEXT: v_mul_lo_u32 v11, s11, v5 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] @@ -1261,7 +1261,7 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 ; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] @@ -1272,47 +1272,47 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v1, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, v0, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v1, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 ; GISEL-NEXT: v_mov_b32_e32 v9, s9 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v1, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v6 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v8, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: 
v_mul_lo_u32 v7, s9, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, s8, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, s8, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, s8, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, s9, v5 +; GISEL-NEXT: v_mul_lo_u32 v6, s8, v6 +; GISEL-NEXT: v_mul_lo_u32 v8, s8, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, s8, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v1, v4, vcc -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v4 -; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v1, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v5 +; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s9, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s9, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s9, v6 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] ; GISEL-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0 ; GISEL-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v1, vcc ; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s9, v8 @@ -1333,50 +1333,50 @@ ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s7 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s7 ; GISEL-NEXT: s_sub_u32 s8, 0, s6 ; GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GISEL-NEXT: s_and_b32 s4, s4, 1 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GISEL-NEXT: s_subb_u32 s9, 0, s7 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v5, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v6 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 -; GISEL-NEXT: v_mul_lo_u32 v7, s9, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, s8, v5 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v10, s8, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3 -; GISEL-NEXT: v_mul_lo_u32 v9, s8, v4 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, s9, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s8, v6 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: 
v_mul_hi_u32 v10, s8, v5 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GISEL-NEXT: v_mul_lo_u32 v9, s8, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v8, v5, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v6 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GISEL-NEXT: v_mul_hi_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 @@ -1386,27 +1386,27 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v5, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, s9, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v6, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, s9, v5 ; GISEL-NEXT: v_mul_lo_u32 v10, s8, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, s8, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, s8, v4 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v12, s8, v5 +; GISEL-NEXT: v_mul_lo_u32 v11, s8, v5 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v6 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 ; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] @@ -1417,47 +1417,47 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 
v5, v7 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 ; GISEL-NEXT: v_mov_b32_e32 v9, s7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v3, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, v3, v6 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v8, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v7, s7, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, s6, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, s6, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, s6, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, s7, v5 +; GISEL-NEXT: v_mul_lo_u32 v6, s6, v6 +; GISEL-NEXT: v_mul_lo_u32 v8, s6, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, s6, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v4, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4 -; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v5 +; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s7, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s7, v6 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] ; GISEL-NEXT: v_subrev_i32_e32 v7, vcc, s6, v2 ; GISEL-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v3, vcc ; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v8 @@ -1472,35 +1472,35 @@ ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v6 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v6 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2k_denom: ; 
CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x1000 ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 ; CGP-NEXT: s_movk_i32 s6, 0xf000 -; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; CGP-NEXT: v_mov_b32_e32 v7, v4 +; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; CGP-NEXT: v_mov_b32_e32 v7, v5 ; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 ; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 ; CGP-NEXT: s_movk_i32 s7, 0x1000 ; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 ; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 @@ -1513,7 +1513,7 @@ ; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 ; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 @@ -1544,7 +1544,7 @@ ; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 ; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 @@ -1569,7 +1569,7 @@ ; CGP-NEXT: v_mul_lo_u32 v10, v0, v8 ; CGP-NEXT: v_mul_hi_u32 v11, v0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 @@ -1617,33 +1617,33 @@ ; CGP-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 +; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 ; CGP-NEXT: v_trunc_f32_e32 v7, v7 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, -1, v4 +; CGP-NEXT: v_mul_lo_u32 v8, -1, v5 ; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 -; CGP-NEXT: v_mul_hi_u32 v11, s6, v4 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v4 +; CGP-NEXT: v_mul_hi_u32 v11, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_mul_lo_u32 v9, v7, v10 -; CGP-NEXT: v_mul_lo_u32 v11, v4, v8 -; CGP-NEXT: v_mul_hi_u32 v12, v4, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v5, v8 +; CGP-NEXT: v_mul_hi_u32 v12, v5, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 ; 
CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_mul_hi_u32 v11, v4, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v5, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc @@ -1654,18 +1654,18 @@ ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v7, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v10, -1, v4 +; CGP-NEXT: v_mul_lo_u32 v10, -1, v5 ; CGP-NEXT: v_mul_lo_u32 v11, s6, v9 -; CGP-NEXT: v_mul_hi_u32 v13, s6, v4 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v4 +; CGP-NEXT: v_mul_hi_u32 v13, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v12, s6, v5 ; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 ; CGP-NEXT: v_mul_lo_u32 v11, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v8, v4, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v5, v10 +; CGP-NEXT: v_mul_hi_u32 v8, v5, v12 ; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 @@ -1674,7 +1674,7 @@ ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; CGP-NEXT: v_mul_lo_u32 v11, v9, v10 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 -; CGP-NEXT: v_mul_hi_u32 v13, v4, v10 +; CGP-NEXT: v_mul_hi_u32 v13, v5, v10 ; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] @@ -1686,30 +1686,30 @@ ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_mul_lo_u32 v8, v3, v4 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_mul_lo_u32 v8, v3, v5 ; CGP-NEXT: v_mul_lo_u32 v9, v2, v7 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: v_mul_hi_u32 v5, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; CGP-NEXT: v_mul_hi_u32 v4, v2, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v3, v5 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v8, v3, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CGP-NEXT: v_mul_hi_u32 v9, v2, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: 
v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 @@ -1758,32 +1758,32 @@ ; CHECK-LABEL: v_srem_i64_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb +; CHECK-NEXT: v_cvt_f32_u32_e32 v3, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 ; CHECK-NEXT: s_mov_b32 s6, 0xffed2705 -; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1 -; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v4 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 +; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; CHECK-NEXT: v_trunc_f32_e32 v4, v4 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 ; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 -; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, -1, v3 ; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, s6, v3 +; CHECK-NEXT: v_mul_lo_u32 v7, s6, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 -; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 -; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v3, v7 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc @@ -1791,7 +1791,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 ; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc @@ -1802,18 +1802,18 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_addc_u32_e64 v6, s[4:5], v4, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, -1, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, -1, v3 ; CHECK-NEXT: v_mul_lo_u32 v8, s6, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, s6, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, s6, v2 +; CHECK-NEXT: v_mul_hi_u32 v10, s6, v3 +; CHECK-NEXT: v_mul_lo_u32 v9, s6, v3 ; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v10 ; CHECK-NEXT: v_mul_lo_u32 v8, v6, v9 -; CHECK-NEXT: v_mul_lo_u32 v10, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v5, v2, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v3, v7 +; CHECK-NEXT: v_mul_hi_u32 v5, v3, v9 ; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 ; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb ; 
CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 @@ -1822,7 +1822,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; CHECK-NEXT: v_mul_lo_u32 v8, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; CHECK-NEXT: v_mul_hi_u32 v10, v2, v7 +; CHECK-NEXT: v_mul_hi_u32 v10, v3, v7 ; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] @@ -1834,12 +1834,12 @@ ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v8 ; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 ; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v1, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v1, v3 ; CHECK-NEXT: v_mul_lo_u32 v6, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 +; CHECK-NEXT: v_mul_hi_u32 v7, v0, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -1848,30 +1848,30 @@ ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_mul_hi_u32 v6, v0, v4 ; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_mul_lo_u32 v5, 0, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, 0, v3 ; CHECK-NEXT: v_mul_lo_u32 v4, s6, v4 -; CHECK-NEXT: v_mul_lo_u32 v6, s6, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, s6, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s6, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, s6, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_mov_b32_e32 v5, s7 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v1, v2, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v1, v3, vcc +; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v3 ; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; CHECK-NEXT: v_subrev_i32_e32 v5, vcc, s6, v0 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: s_bfe_i32 s4, -1, 0x10000 @@ -1885,13 +1885,13 @@ ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 +; 
CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 1235195 ret i64 %result @@ -1910,44 +1910,44 @@ ; GISEL-NEXT: s_mov_b32 s7, s6 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s9 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s9 ; GISEL-NEXT: s_sub_u32 s11, 0, s8 ; GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GISEL-NEXT: s_and_b32 s4, s4, 1 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GISEL-NEXT: s_subb_u32 s12, 0, s9 -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v5, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, s12, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, s11, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, s11, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, s11, v4 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, s12, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s11, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, s11, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, s11, v5 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v8, v5, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GISEL-NEXT: v_mul_hi_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 @@ -1957,18 +1957,18 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v5, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, s12, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v6, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, s12, 
v5 ; GISEL-NEXT: v_mul_lo_u32 v10, s11, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, s11, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, s11, v4 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v12, s11, v5 +; GISEL-NEXT: v_mul_lo_u32 v11, s11, v5 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] @@ -1976,7 +1976,7 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 ; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] @@ -1987,47 +1987,47 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v1, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, v0, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v1, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 ; GISEL-NEXT: v_mov_b32_e32 v9, s9 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v1, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v6 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v8, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v7, s9, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, s8, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, s8, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, s8, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, s9, v5 +; GISEL-NEXT: v_mul_lo_u32 v6, s8, v6 +; GISEL-NEXT: v_mul_lo_u32 v8, s8, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, s8, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], 
v1, v4, vcc -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v4 -; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v1, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v5 +; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s9, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s9, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s9, v6 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] ; GISEL-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0 ; GISEL-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v1, vcc ; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s9, v8 @@ -2048,50 +2048,50 @@ ; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s7 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s7 ; GISEL-NEXT: s_sub_u32 s8, 0, s6 ; GISEL-NEXT: s_cselect_b32 s4, 1, 0 ; GISEL-NEXT: s_and_b32 s4, s4, 1 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GISEL-NEXT: s_subb_u32 s9, 0, s7 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v5, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v6 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 -; GISEL-NEXT: v_mul_lo_u32 v7, s9, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, s8, v5 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v10, s8, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3 -; GISEL-NEXT: v_mul_lo_u32 v9, s8, v4 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, s9, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, s8, v6 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_mul_hi_u32 v10, s8, v5 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GISEL-NEXT: v_mul_lo_u32 v9, s8, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v8, v5, v9 -; GISEL-NEXT: v_mul_lo_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v6 +; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v5, 
v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9 +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, v6, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GISEL-NEXT: v_mul_hi_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 @@ -2101,27 +2101,27 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v5, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, s9, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v6, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, s9, v5 ; GISEL-NEXT: v_mul_lo_u32 v10, s8, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, s8, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, s8, v4 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v12, s8, v5 +; GISEL-NEXT: v_mul_lo_u32 v11, s8, v5 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v7, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v6 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 ; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] @@ -2132,47 +2132,47 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 ; GISEL-NEXT: v_mov_b32_e32 v9, s7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v3, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, v3, v6 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v2, v5 -; GISEL-NEXT: 
v_mul_hi_u32 v5, v3, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v8, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v7, s7, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, s6, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, s6, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, s6, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, s7, v5 +; GISEL-NEXT: v_mul_lo_u32 v6, s6, v6 +; GISEL-NEXT: v_mul_lo_u32 v8, s6, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, s6, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v4, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4 -; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v5 +; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s7, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s7, v6 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] ; GISEL-NEXT: v_subrev_i32_e32 v7, vcc, s6, v2 ; GISEL-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v3, vcc ; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v8 @@ -2187,35 +2187,35 @@ ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v6 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v6 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb +; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x12d8fb ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 ; CGP-NEXT: s_mov_b32 s6, 0xffed2705 -; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; CGP-NEXT: v_mov_b32_e32 v7, v4 +; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; CGP-NEXT: v_mov_b32_e32 v7, v5 ; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 +; CGP-NEXT: 
v_add_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 ; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 ; CGP-NEXT: s_mov_b32 s7, 0x12d8fb ; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 ; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 @@ -2228,7 +2228,7 @@ ; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 ; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 @@ -2259,7 +2259,7 @@ ; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 ; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 @@ -2284,7 +2284,7 @@ ; CGP-NEXT: v_mul_lo_u32 v10, v0, v8 ; CGP-NEXT: v_mul_hi_u32 v11, v0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 @@ -2332,33 +2332,33 @@ ; CGP-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 +; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 ; CGP-NEXT: v_trunc_f32_e32 v7, v7 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, -1, v4 +; CGP-NEXT: v_mul_lo_u32 v8, -1, v5 ; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 -; CGP-NEXT: v_mul_hi_u32 v11, s6, v4 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v4 +; CGP-NEXT: v_mul_hi_u32 v11, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_mul_lo_u32 v9, v7, v10 -; CGP-NEXT: v_mul_lo_u32 v11, v4, v8 -; CGP-NEXT: v_mul_hi_u32 v12, v4, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v5, v8 +; CGP-NEXT: v_mul_hi_u32 v12, v5, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_mul_hi_u32 v11, v4, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v5, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc @@ -2369,18 +2369,18 @@ ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, 
v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v7, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v10, -1, v4 +; CGP-NEXT: v_mul_lo_u32 v10, -1, v5 ; CGP-NEXT: v_mul_lo_u32 v11, s6, v9 -; CGP-NEXT: v_mul_hi_u32 v13, s6, v4 -; CGP-NEXT: v_mul_lo_u32 v12, s6, v4 +; CGP-NEXT: v_mul_hi_u32 v13, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v12, s6, v5 ; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 ; CGP-NEXT: v_mul_lo_u32 v11, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v13, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v8, v4, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v5, v10 +; CGP-NEXT: v_mul_hi_u32 v8, v5, v12 ; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 @@ -2389,7 +2389,7 @@ ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; CGP-NEXT: v_mul_lo_u32 v11, v9, v10 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 -; CGP-NEXT: v_mul_hi_u32 v13, v4, v10 +; CGP-NEXT: v_mul_hi_u32 v13, v5, v10 ; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] @@ -2401,30 +2401,30 @@ ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_mul_lo_u32 v8, v3, v4 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_mul_lo_u32 v8, v3, v5 ; CGP-NEXT: v_mul_lo_u32 v9, v2, v7 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: v_mul_hi_u32 v5, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; CGP-NEXT: v_mul_hi_u32 v4, v2, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v3, v5 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v8, v3, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; CGP-NEXT: v_mul_hi_u32 v9, v2, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 @@ -2488,43 +2488,43 @@ ; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v5, v0 ; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v6, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v0 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v0 ; CHECK-NEXT: v_xor_b32_e32 v0, v2, v0 -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v1 -; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v0 -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; 
CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v5 +; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v1 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, 0, v1 +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, 0, v5 ; CHECK-NEXT: v_subb_u32_e32 v8, vcc, 0, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v3, v6 +; CHECK-NEXT: v_xor_b32_e32 v3, v3, v1 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v5, v5 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v5 +; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2 +; CHECK-NEXT: v_trunc_f32_e32 v6, v6 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CHECK-NEXT: v_xor_b32_e32 v4, v4, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CHECK-NEXT: v_xor_b32_e32 v4, v4, v1 ; CHECK-NEXT: v_mul_lo_u32 v9, v8, v2 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v5 +; CHECK-NEXT: v_mul_lo_u32 v10, v7, v6 ; CHECK-NEXT: v_mul_hi_u32 v12, v7, v2 ; CHECK-NEXT: v_mul_lo_u32 v11, v7, v2 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; CHECK-NEXT: v_mul_lo_u32 v10, v5, v11 +; CHECK-NEXT: v_mul_lo_u32 v10, v6, v11 ; CHECK-NEXT: v_mul_lo_u32 v12, v2, v9 ; CHECK-NEXT: v_mul_hi_u32 v13, v2, v11 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v11 +; CHECK-NEXT: v_mul_hi_u32 v11, v6, v11 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v13, v5, v9 +; CHECK-NEXT: v_mul_lo_u32 v13, v6, v9 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CHECK-NEXT: v_mul_hi_u32 v12, v2, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9 +; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9 ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v12 @@ -2535,12 +2535,12 @@ ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CHECK-NEXT: v_addc_u32_e64 v10, s[4:5], v5, v9, vcc +; CHECK-NEXT: v_addc_u32_e64 v10, s[4:5], v6, v9, vcc ; CHECK-NEXT: v_mul_lo_u32 v8, v8, v2 ; CHECK-NEXT: v_mul_lo_u32 v11, v7, v10 ; CHECK-NEXT: v_mul_lo_u32 v12, v7, v2 ; CHECK-NEXT: v_mul_hi_u32 v7, v7, v2 -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 +; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v9 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 ; CHECK-NEXT: v_mul_hi_u32 v9, v2, v12 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 @@ -2564,21 +2564,21 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc +; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; CHECK-NEXT: v_mul_lo_u32 v7, v4, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, v6 ; CHECK-NEXT: v_mul_hi_u32 v9, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v2, v4, v2 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: 
v_add_i32_e32 v7, vcc, v7, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 +; CHECK-NEXT: v_mul_lo_u32 v9, v4, v6 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 +; CHECK-NEXT: v_mul_hi_u32 v8, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v4, v6 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 @@ -2587,44 +2587,44 @@ ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CHECK-NEXT: v_mul_lo_u32 v7, v0, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, v1, v5 -; CHECK-NEXT: v_mul_lo_u32 v8, v1, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, v5, v6 +; CHECK-NEXT: v_mul_lo_u32 v8, v5, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v5, v2 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v4, v2, vcc +; CHECK-NEXT: v_subb_u32_e64 v6, s[4:5], v4, v2, vcc ; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v4, v2 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v0 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v0 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v0 ; CHECK-NEXT: v_subb_u32_e32 v2, vcc, v2, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v3, v1 +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v3, v5 ; CHECK-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v2, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v0 ; CHECK-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v7, v1 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_xor_b32_e32 v2, v0, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v1, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v2, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v2, v2, v1 +; CHECK-NEXT: v_xor_b32_e32 v3, v0, v1 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v2, v1 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: BB7_2: ; %Flow @@ -2664,28 +2664,28 @@ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b64 s[6:7], 0x1000 ; 
GISEL-NEXT: v_lshl_b64 v[4:5], s[6:7], v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc +; GISEL-NEXT: v_xor_b32_e32 v8, v4, v7 ; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7 -; GISEL-NEXT: v_xor_b32_e32 v4, v4, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 -; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc +; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GISEL-NEXT: v_xor_b32_e32 v8, v0, v9 +; GISEL-NEXT: v_xor_b32_e32 v9, v0, v4 +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v8 ; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v9 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v7 ; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0 ; GISEL-NEXT: v_trunc_f32_e32 v7, v7 ; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v7 ; GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GISEL-NEXT: v_xor_b32_e32 v16, v1, v4 ; GISEL-NEXT: v_mul_lo_u32 v12, v11, v0 ; GISEL-NEXT: v_mul_lo_u32 v13, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v15, v10, v0 @@ -2747,9 +2747,9 @@ ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v0, v11 ; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v11, v16, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v12, v9, v10 ; GISEL-NEXT: v_lshl_b64 v[0:1], s[6:7], v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, v16, v7 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc @@ -2757,7 +2757,7 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v11, v16, v10 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v10 ; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc @@ -2769,66 +2769,66 @@ ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; GISEL-NEXT: v_mul_lo_u32 v10, v5, v6 -; GISEL-NEXT: v_mul_lo_u32 v7, v4, v7 -; GISEL-NEXT: v_mul_lo_u32 v11, v4, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v7, v8, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v8, v6 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v8, v11 -; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v16, v6, vcc +; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v9, v11 +; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v16, v6, vcc ; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v16, v6 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v5 ; GISEL-NEXT: 
v_subb_u32_e32 v6, vcc, v6, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v7, v4 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v7, v8 ; GISEL-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v6, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v5 ; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v11, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; GISEL-NEXT: v_xor_b32_e32 v7, v0, v6 -; GISEL-NEXT: v_xor_b32_e32 v6, v1, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v3 -; GISEL-NEXT: v_xor_b32_e32 v4, v4, v9 -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc +; GISEL-NEXT: v_xor_b32_e32 v8, v0, v7 +; GISEL-NEXT: v_xor_b32_e32 v7, v1, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v4 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 ; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v8 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v3, v1, v8 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v9 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v3, v1, v9 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GISEL-NEXT: v_trunc_f32_e32 v1, v1 ; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GISEL-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v7 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v6, vcc +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v8 +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v7, vcc ; GISEL-NEXT: v_mul_lo_u32 v12, v11, v0 ; GISEL-NEXT: v_mul_lo_u32 v13, v10, v1 ; GISEL-NEXT: v_mul_hi_u32 v15, v10, v0 ; GISEL-NEXT: v_mul_lo_u32 v14, v10, v0 -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v8 +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_mul_lo_u32 v13, v1, v14 @@ -2885,66 +2885,66 @@ ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v10, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v11 ; GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v6, v4 ; GISEL-NEXT: v_mul_lo_u32 v12, v2, v10 ; GISEL-NEXT: v_mul_lo_u32 v13, v3, v11 -; GISEL-NEXT: 
v_sub_i32_e32 v0, vcc, v4, v9 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc ; GISEL-NEXT: v_mul_hi_u32 v4, v3, v10 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v9, vcc +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v10 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v5, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v11 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_mul_hi_u32 v6, v3, v11 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; GISEL-NEXT: v_mul_hi_u32 v10, v2, v11 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v6, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v7, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, v7, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v7, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v6, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v10 ; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v2, v4, vcc ; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v6 -; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v3, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v7 +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v3, v8 ; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v2, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7 -; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v8 +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v6 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v9, v7 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v7 +; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v6, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; GISEL-NEXT: 
v_cndmask_b32_e32 v2, v10, v2, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v8 -; GISEL-NEXT: v_xor_b32_e32 v4, v2, v8 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v3, v8 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v4, v8, vcc +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 +; GISEL-NEXT: v_xor_b32_e32 v4, v2, v9 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v3, v9 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v4, v9, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2_shl_denom: @@ -2968,16 +2968,16 @@ ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v0 ; CGP-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v1, v0 +; CGP-NEXT: v_xor_b32_e32 v3, v1, v0 ; CGP-NEXT: v_xor_b32_e32 v0, v2, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v0 -; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v9 -; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 +; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v9 +; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v4 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v4, vcc -; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v1 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 ; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v2 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 @@ -2985,12 +2985,12 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v3, v4 +; CGP-NEXT: v_xor_b32_e32 v4, v4, v1 ; CGP-NEXT: v_mul_lo_u32 v13, v12, v2 ; CGP-NEXT: v_mul_lo_u32 v14, v9, v8 ; CGP-NEXT: v_mul_hi_u32 v16, v9, v2 ; CGP-NEXT: v_mul_lo_u32 v15, v9, v2 -; CGP-NEXT: v_xor_b32_e32 v6, v6, v4 +; CGP-NEXT: v_xor_b32_e32 v6, v6, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_mul_lo_u32 v14, v8, v15 @@ -3048,8 +3048,8 @@ ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc ; CGP-NEXT: v_mul_lo_u32 v9, v6, v2 -; CGP-NEXT: v_mul_lo_u32 v12, v3, v8 -; CGP-NEXT: v_mul_hi_u32 v13, v3, v2 +; CGP-NEXT: v_mul_lo_u32 v12, v4, v8 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v6, v2 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc @@ -3057,7 +3057,7 @@ ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v13, v6, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; CGP-NEXT: v_mul_hi_u32 v12, v3, v8 +; CGP-NEXT: v_mul_hi_u32 v12, v4, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc @@ -3069,42 +3069,42 @@ ; CGP-NEXT: v_add_i32_e32 v9, vcc, v12, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_mul_lo_u32 v9, v0, v2 -; CGP-NEXT: v_mul_lo_u32 v8, v1, v8 -; CGP-NEXT: v_mul_lo_u32 v12, v1, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v1, v2 +; CGP-NEXT: v_mul_lo_u32 v8, v3, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v3, v2 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v12 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v4, v12 ; CGP-NEXT: v_subb_u32_e64 v8, 
s[4:5], v6, v2, vcc ; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v6, v2 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v3 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v0 ; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v0, vcc ; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v3, v1 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v4, v3 ; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v2, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v0 ; CGP-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v9, v1 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 -; CGP-NEXT: v_xor_b32_e32 v2, v0, v4 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v2, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v1 +; CGP-NEXT: v_xor_b32_e32 v3, v0, v1 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v2, v1 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: BB8_2: ; %Flow2 @@ -3143,43 +3143,43 @@ ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v11 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v2 ; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v3, v2 +; CGP-NEXT: v_xor_b32_e32 v6, v3, v2 ; CGP-NEXT: v_xor_b32_e32 v2, v4, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v3 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2 -; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CGP-NEXT: v_xor_b32_e32 v5, v5, v8 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v6 +; CGP-NEXT: v_cvt_f32_u32_e32 v8, v2 +; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v3 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v8 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v8, vcc -; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v6 ; CGP-NEXT: v_subb_u32_e32 v10, vcc, 0, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v5, v3 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v7, v7 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 +; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v4 +; CGP-NEXT: v_trunc_f32_e32 v8, v8 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_xor_b32_e32 v6, v6, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_xor_b32_e32 v7, v7, v3 ; CGP-NEXT: v_mul_lo_u32 v11, v10, v4 -; CGP-NEXT: v_mul_lo_u32 v12, v9, v7 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v8 ; CGP-NEXT: v_mul_hi_u32 
v14, v9, v4 ; CGP-NEXT: v_mul_lo_u32 v13, v9, v4 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; CGP-NEXT: v_mul_lo_u32 v12, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v12, v8, v13 ; CGP-NEXT: v_mul_lo_u32 v14, v4, v11 ; CGP-NEXT: v_mul_hi_u32 v15, v4, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v7, v11 +; CGP-NEXT: v_mul_lo_u32 v15, v8, v11 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 ; CGP-NEXT: v_mul_hi_u32 v14, v4, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 @@ -3190,12 +3190,12 @@ ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_addc_u32_e64 v12, s[4:5], v7, v11, vcc +; CGP-NEXT: v_addc_u32_e64 v12, s[4:5], v8, v11, vcc ; CGP-NEXT: v_mul_lo_u32 v10, v10, v4 ; CGP-NEXT: v_mul_lo_u32 v13, v9, v12 ; CGP-NEXT: v_mul_lo_u32 v14, v9, v4 ; CGP-NEXT: v_mul_hi_u32 v9, v9, v4 -; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v11 +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 ; CGP-NEXT: v_mul_hi_u32 v11, v4, v14 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 @@ -3219,21 +3219,21 @@ ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v6, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v5, v7 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v5, v8 ; CGP-NEXT: v_mul_hi_u32 v11, v5, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v6, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v6, v7 +; CGP-NEXT: v_mul_lo_u32 v11, v7, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v5, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 +; CGP-NEXT: v_mul_hi_u32 v10, v5, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 @@ -3242,44 +3242,44 @@ ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_mul_lo_u32 v9, v2, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v3, v7 -; CGP-NEXT: v_mul_lo_u32 v10, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v8, v6, v8 +; CGP-NEXT: v_mul_lo_u32 v10, v6, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v6, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_subb_u32_e64 v7, 
s[4:5], v6, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v6, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v2 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v7, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v7, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v2 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v2 ; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v5, v3 +; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v5, v6 ; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v4, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v2 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v2 ; CGP-NEXT: v_subb_u32_e32 v2, vcc, v4, v2, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v6 ; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; CGP-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v3, v8 -; CGP-NEXT: v_xor_b32_e32 v4, v2, v8 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v3, v8 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v4, v8, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; CGP-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v4, v4, v3 +; CGP-NEXT: v_xor_b32_e32 v5, v2, v3 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v4, v3 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc ; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 ; CGP-NEXT: ; implicit-def: $vgpr5 ; CGP-NEXT: BB8_6: ; %Flow @@ -3379,37 +3379,37 @@ ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3 ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v1 ; GISEL-NEXT: v_subb_u32_e32 v8, vcc, 0, v3, vcc -; GISEL-NEXT: v_and_b32_e32 v0, s6, v0 +; GISEL-NEXT: v_and_b32_e32 v6, s6, v6 ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_and_b32_e32 v6, s6, v6 -; GISEL-NEXT: v_and_b32_e32 v2, s6, v2 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v5, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_and_b32_e32 v5, s6, v0 +; GISEL-NEXT: v_and_b32_e32 v0, s6, v2 +; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 +; GISEL-NEXT: v_trunc_f32_e32 v4, v4 +; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v8, v2 +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v12, v7, v2 +; GISEL-NEXT: v_mul_lo_u32 v11, 
v7, v2 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v10, v5, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, 0, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v14, v2, v11 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 0, v5 ; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v11, v4, v11 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v4, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 @@ -3419,18 +3419,18 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v5, v9, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v4, v9, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v8, v2 ; GISEL-NEXT: v_mul_lo_u32 v11, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, v7, v4 -; GISEL-NEXT: v_mul_hi_u32 v7, v7, v4 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v7, v2 +; GISEL-NEXT: v_mul_hi_u32 v7, v7, v2 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; GISEL-NEXT: v_mul_hi_u32 v9, v4, v12 +; GISEL-NEXT: v_mul_hi_u32 v9, v2, v12 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 ; GISEL-NEXT: v_mul_lo_u32 v8, v10, v12 -; GISEL-NEXT: v_mul_lo_u32 v11, v4, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, v2, v7 ; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] @@ -3438,7 +3438,7 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v9, v10, v7 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v11, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] @@ -3449,197 +3449,197 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v13, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v9, v0, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v4, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v8, v5, v4 +; GISEL-NEXT: v_mul_hi_u32 v9, v5, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, v13, v2 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; 
GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v13, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v8, v5, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v7, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v4, vcc -; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v13, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v2 +; GISEL-NEXT: v_mul_lo_u32 v4, v1, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, v1, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, v1, v2 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v5, v8 +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v2, vcc +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v13, v2 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3 -; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v3, vcc +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v0, v1 -; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v4, vcc +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v4, v1 +; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v2, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, 0, v6 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v3 ; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc ; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 ; GISEL-NEXT: 
v_rcp_iflag_f32_e32 v8, v11 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v8 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v8 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GISEL-NEXT: v_trunc_f32_e32 v5, v5 -; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v4 +; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 ; GISEL-NEXT: v_subb_u32_e32 v8, vcc, 0, v6, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v8, v3 +; GISEL-NEXT: v_mul_lo_u32 v9, v8, v4 ; GISEL-NEXT: v_mul_lo_u32 v10, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v3 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v3 +; GISEL-NEXT: v_mul_hi_u32 v12, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v7, v4 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; GISEL-NEXT: v_mul_lo_u32 v10, v5, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v14, v3, v11 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, 0, v2 -; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, 0, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v4, v11 +; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v5, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v9 ; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v12, v3, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v5, v9, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v3 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, v7, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v7, v3 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], v5, v9, vcc +; GISEL-NEXT: v_mul_lo_u32 v8, v8, v0 +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v7, v7, v0 ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 -; 
GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v12 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v9, v0, v11 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; GISEL-NEXT: v_mul_lo_u32 v8, v10, v12 -; GISEL-NEXT: v_mul_lo_u32 v11, v3, v7 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v8, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v10, v0, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v4, v11 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v7 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v11, v3, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v9, v4, v7 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v0, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v4, v7 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v13, v3 -; GISEL-NEXT: v_mul_lo_u32 v8, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v9, v2, v3 -; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v0 -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v9 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v7 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v0, v8 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1 +; GISEL-NEXT: v_mul_lo_u32 v7, v14, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v13, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v2, vcc +; GISEL-NEXT: v_mul_hi_u32 v2, v13, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v14, v4 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v14, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v13, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v3, 
vcc, v3, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v7, v6, v3 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v4, v3 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_mul_lo_u32 v5, v6, v2 +; GISEL-NEXT: v_mul_lo_u32 v4, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, v3, v2 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v13, v7 +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v14, v2, vcc +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v14, v2 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v4 -; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v4, v3 +; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v2, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc +; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v4, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_24bit: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -2470,29 +2470,29 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000 -; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], 
v4 -; GISEL-NEXT: v_lshl_b64 v[6:7], s[4:5], v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 +; GISEL-NEXT: v_lshl_b64 v[7:8], s[4:5], v4 +; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6 ; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v11, v8 +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v7 +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v6 ; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v15, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v15, v10, v6 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v15, v8, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v8, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v6, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v12 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 @@ -2500,7 +2500,7 @@ ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; GISEL-NEXT: v_mul_lo_u32 v15, v9, v13 ; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v8, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 @@ -2511,18 +2511,18 @@ ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 ; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], v9, v13, vcc ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v11, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v11, v6 ; GISEL-NEXT: v_mul_lo_u32 v14, v10, v12 -; GISEL-NEXT: v_mul_hi_u32 v10, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v10, v6 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 ; GISEL-NEXT: v_mul_lo_u32 v11, v12, v13 -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v10 -; GISEL-NEXT: v_mul_hi_u32 v15, v8, v13 +; GISEL-NEXT: v_mul_lo_u32 v14, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v15, v6, v13 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 @@ -2530,7 +2530,7 @@ ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 ; GISEL-NEXT: v_mul_lo_u32 v14, v12, v10 ; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v15, v6, v10 ; 
GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 @@ -2541,93 +2541,93 @@ ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 ; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v11 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc ; GISEL-NEXT: v_addc_u32_e64 v9, vcc, 0, v9, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v10, v1, v8 +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v6 ; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v6 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 ; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v13, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v6 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v1, v11, vcc ; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, v12, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v7 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v8 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v6 ; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v11 -; 
GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v12, vcc +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v11, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v7 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v5, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v5 +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 +; GISEL-NEXT: v_trunc_f32_e32 v7, v7 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v6 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_mul_lo_u32 v12, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v4, v11 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v10 +; GISEL-NEXT: v_mul_lo_u32 v12, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v6, v11 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_lo_u32 v13, v5, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v11 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 @@ -2636,20 +2636,20 @@ ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v5, v11, vcc -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v11 -; GISEL-NEXT: v_mul_lo_u32 v11, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v7, v11, vcc +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v6 +; GISEL-NEXT: 
v_mul_lo_u32 v9, v9, v6 ; GISEL-NEXT: v_mul_lo_u32 v12, v8, v10 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v4 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v6 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 ; GISEL-NEXT: v_mul_lo_u32 v9, v10, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 -; GISEL-NEXT: v_mul_hi_u32 v13, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v13, v6, v11 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 @@ -2657,7 +2657,7 @@ ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 ; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 ; GISEL-NEXT: v_mul_hi_u32 v11, v10, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v4, v8 +; GISEL-NEXT: v_mul_hi_u32 v13, v6, v8 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 @@ -2668,64 +2668,64 @@ ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; GISEL-NEXT: v_mul_hi_u32 v8, v10, v8 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v5, vcc, 0, v5, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v4 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v9 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc +; GISEL-NEXT: v_addc_u32_e64 v7, vcc, 0, v7, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v6 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, v6, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v6, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v6, v4 +; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v5, v6 +; GISEL-NEXT: v_mul_lo_u32 v10, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v4, v6 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 ; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v9, vcc ; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v9 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; GISEL-NEXT: 
v_cmp_ge_u32_e64 s[4:5], v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v7 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v8, v9, v10, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v4 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v4 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v5, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v6 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v7, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v10, vcc +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v10, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64_pow2_shl_denom: @@ -3131,61 +3131,61 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b32 s6, 0xffffff -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v1, 0 -; GISEL-NEXT: v_and_b32_e32 v3, s6, v4 -; GISEL-NEXT: v_and_b32_e32 v4, s6, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 -; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v4 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 +; GISEL-NEXT: v_and_b32_e32 v1, s6, v4 +; GISEL-NEXT: v_and_b32_e32 v3, s6, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v1 +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3 +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 ; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v1 -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v1 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v8 -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v1 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v5 +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v7 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v8 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7 ; GISEL-NEXT: v_trunc_f32_e32 v8, v8 ; GISEL-NEXT: v_trunc_f32_e32 v11, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v8 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, 
v8 -; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v11 +; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11 ; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GISEL-NEXT: v_mul_lo_u32 v12, v6, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v6, v1 -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v1 -; GISEL-NEXT: v_mul_hi_u32 v16, v6, v1 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v5 -; GISEL-NEXT: v_mul_lo_u32 v18, v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v19, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v15, v5, v6 +; GISEL-NEXT: v_mul_hi_u32 v16, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 +; GISEL-NEXT: v_mul_lo_u32 v18, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v19, v9, v7 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 ; GISEL-NEXT: v_mul_lo_u32 v15, v11, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v5, v17 +; GISEL-NEXT: v_mul_hi_u32 v18, v7, v17 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v5, v13 +; GISEL-NEXT: v_mul_lo_u32 v19, v7, v13 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 ; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14 -; GISEL-NEXT: v_mul_hi_u32 v18, v1, v14 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v14 ; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 ; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v12 +; GISEL-NEXT: v_mul_lo_u32 v16, v6, v12 ; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 ; GISEL-NEXT: v_mul_lo_u32 v15, v8, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v1, v12 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v12 ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v18 @@ -3195,7 +3195,7 @@ ; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 ; GISEL-NEXT: v_mul_lo_u32 v19, v11, v13 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_hi_u32 v19, v5, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc @@ -3216,35 +3216,35 @@ ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v18 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 ; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v8, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v6, v1 -; GISEL-NEXT: v_mul_lo_u32 v7, v7, v1 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v17 +; GISEL-NEXT: v_mul_lo_u32 v15, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v5, v5, v6 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v17 ; GISEL-NEXT: v_addc_u32_e64 v16, s[6:7], v11, v13, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v18, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v18, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v16 ; GISEL-NEXT: 
v_mul_lo_u32 v19, v16, v17 ; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v10, v5, v17 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v17 ; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v5, v9 +; GISEL-NEXT: v_mul_lo_u32 v18, v7, v9 ; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], v19, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v18, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v6, v1 -; GISEL-NEXT: v_mul_lo_u32 v6, v6, v14 +; GISEL-NEXT: v_mul_hi_u32 v10, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v14 ; GISEL-NEXT: v_mul_lo_u32 v18, v14, v15 -; GISEL-NEXT: v_add_i32_e64 v6, s[8:9], v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v1, v15 -; GISEL-NEXT: v_add_i32_e64 v6, s[8:9], v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v1, v6 +; GISEL-NEXT: v_add_i32_e64 v4, s[8:9], v5, v4 +; GISEL-NEXT: v_mul_hi_u32 v5, v6, v15 +; GISEL-NEXT: v_add_i32_e64 v4, s[8:9], v4, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v6, v4 ; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v18, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v7, s[8:9], v10, v7 -; GISEL-NEXT: v_mov_b32_e32 v7, s10 +; GISEL-NEXT: v_add_i32_e64 v5, s[8:9], v10, v5 +; GISEL-NEXT: v_mov_b32_e32 v5, s10 ; GISEL-NEXT: v_mov_b32_e32 v10, s11 ; GISEL-NEXT: v_add_i32_e64 v8, s[10:11], v8, v12 ; GISEL-NEXT: v_mov_b32_e32 v12, s12 @@ -3253,26 +3253,26 @@ ; GISEL-NEXT: v_mul_hi_u32 v15, v16, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] ; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_lo_u32 v18, v14, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v14, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v18, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v6, v4 ; GISEL-NEXT: v_add_i32_e64 v13, s[8:9], v18, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v6, s[8:9], v13, v6 +; GISEL-NEXT: v_add_i32_e64 v4, s[8:9], v13, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[8:9] ; GISEL-NEXT: v_add_i32_e64 v13, s[8:9], v18, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], v19, v18 ; GISEL-NEXT: v_mul_lo_u32 v19, v16, v9 ; GISEL-NEXT: v_mul_hi_u32 v16, v16, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 ; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v19, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v15, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v19, v15 ; GISEL-NEXT: v_mov_b32_e32 v19, s13 -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v17 +; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] @@ -3282,16 +3282,16 @@ ; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v16, v15 ; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc ; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v14, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v6 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v0, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, 0, v1 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v13, v0, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_addc_u32_e32 v9, 
vcc, 0, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v14, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 +; GISEL-NEXT: v_mul_lo_u32 v11, 0, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 ; GISEL-NEXT: v_mul_lo_u32 v15, v0, v6 ; GISEL-NEXT: v_mul_lo_u32 v16, 0, v6 ; GISEL-NEXT: v_mul_hi_u32 v17, v0, v6 @@ -3305,54 +3305,54 @@ ; GISEL-NEXT: v_mul_hi_u32 v9, 0, v9 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v16, v1 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v16, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v11, v5 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v17 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v3, v1 -; GISEL-NEXT: v_mul_lo_u32 v15, 0, v1 -; GISEL-NEXT: v_mul_hi_u32 v16, v3, v1 +; GISEL-NEXT: v_mul_lo_u32 v13, v1, v4 +; GISEL-NEXT: v_mul_lo_u32 v15, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v16, v1, v4 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v5 -; GISEL-NEXT: v_mul_lo_u32 v17, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v18, v4, v5 +; GISEL-NEXT: v_mul_lo_u32 v14, v3, v7 +; GISEL-NEXT: v_mul_lo_u32 v17, 0, v7 +; GISEL-NEXT: v_mul_hi_u32 v18, v3, v7 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v6 -; GISEL-NEXT: v_mul_lo_u32 v11, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v9, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v3, v8 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v15, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v1 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v4 ; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v6, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v13 ; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, v9, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], 1, v5 +; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], 1, v7 ; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], 0, v8, s[6:7] ; GISEL-NEXT: v_sub_i32_e64 v2, s[6:7], v2, v14 ; GISEL-NEXT: v_subb_u32_e64 v14, s[8:9], 0, v11, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v16, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v16, s[4:5] +; 
GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, v16, s[4:5] @@ -3360,34 +3360,34 @@ ; GISEL-NEXT: v_addc_u32_e64 v16, s[4:5], 0, v17, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], 0, v11 ; GISEL-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v11, s[6:7] -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v4 +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v3 ; GISEL-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v11, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3 ; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 1, v13 -; GISEL-NEXT: v_addc_u32_e64 v4, s[6:7], 0, v18, s[6:7] +; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], 0, v18, s[6:7] ; GISEL-NEXT: v_sub_i32_e64 v9, s[6:7], 0, v9 ; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v19, v1, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v15, v14, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v13, v2, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v3, v17, v16, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, v13, v2, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v9, v17, v16, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v18, v3, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v4, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64_24bit: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -1827,29 +1827,29 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000 -; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v4 -; GISEL-NEXT: v_lshl_b64 v[6:7], s[4:5], v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 +; GISEL-NEXT: v_lshl_b64 v[7:8], s[4:5], v4 +; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v9 +; 
GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6 ; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v11, v8 +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v7 +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v6 ; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v15, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v15, v10, v6 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v15, v8, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v8, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v6, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v12 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 @@ -1857,7 +1857,7 @@ ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; GISEL-NEXT: v_mul_lo_u32 v15, v9, v13 ; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v8, v13 +; GISEL-NEXT: v_mul_hi_u32 v16, v6, v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 @@ -1868,18 +1868,18 @@ ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 ; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], v9, v13, vcc ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v11, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v10, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v11, v6 ; GISEL-NEXT: v_mul_lo_u32 v14, v10, v12 -; GISEL-NEXT: v_mul_hi_u32 v10, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v10, v10, v6 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 ; GISEL-NEXT: v_mul_lo_u32 v11, v12, v13 -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v10 -; GISEL-NEXT: v_mul_hi_u32 v15, v8, v13 +; GISEL-NEXT: v_mul_lo_u32 v14, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v15, v6, v13 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 @@ -1887,7 +1887,7 @@ ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 ; GISEL-NEXT: v_mul_lo_u32 v14, v12, v10 ; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v15, v6, v10 ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 @@ -1898,92 +1898,92 @@ ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 ; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v11 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc ; GISEL-NEXT: v_addc_u32_e64 v9, vcc, 0, v9, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 
v10, v1, v8 +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v6 ; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v8 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v6 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 ; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v6, v7, v6 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v8, vcc -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 +; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v6, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], v0, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], v0, v7 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v11, vcc, 0, v1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v4 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v5 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v10, v4 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v5, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v10, v7 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v8, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v7 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v5, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v7, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v5 +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 +; GISEL-NEXT: v_trunc_f32_e32 v7, v7 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v6 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_mul_lo_u32 v12, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v4, v11 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v10 +; GISEL-NEXT: v_mul_lo_u32 v12, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v6, v11 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_lo_u32 v13, v5, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v11 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 @@ -1992,20 +1992,20 @@ ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v7, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v5, v11, vcc -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v11 -; GISEL-NEXT: v_mul_lo_u32 v11, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v7, v11, vcc +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v6 ; GISEL-NEXT: v_mul_lo_u32 v12, v8, v10 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v4 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v6 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 ; GISEL-NEXT: v_mul_lo_u32 v9, v10, v11 -; 
GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 -; GISEL-NEXT: v_mul_hi_u32 v13, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v13, v6, v11 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 @@ -2013,7 +2013,7 @@ ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 ; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8 ; GISEL-NEXT: v_mul_hi_u32 v11, v10, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v4, v8 +; GISEL-NEXT: v_mul_hi_u32 v13, v6, v8 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 @@ -2024,63 +2024,63 @@ ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; GISEL-NEXT: v_mul_hi_u32 v8, v10, v8 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v5, vcc, 0, v5, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v4 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v9 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc +; GISEL-NEXT: v_addc_u32_e64 v7, vcc, 0, v7, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v6 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v10, v2, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 +; GISEL-NEXT: v_mul_hi_u32 v10, v2, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, v6, v4 -; GISEL-NEXT: v_mul_lo_u32 v9, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v6, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v6, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v5, v6 +; GISEL-NEXT: v_mul_lo_u32 v7, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v4, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v6, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 ; 
GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v4 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v3, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v7 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v6 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v7 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v5, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_pow2_shl_denom: @@ -2480,61 +2480,61 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b32 s6, 0xffffff -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v1, 0 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 ; GISEL-NEXT: v_and_b32_e32 v3, s6, v4 -; GISEL-NEXT: v_and_b32_e32 v4, s6, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 -; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v4 +; GISEL-NEXT: v_and_b32_e32 v1, s6, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3 +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v1 +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 ; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v1 -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v1 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v8 -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v1 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v5 +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v7 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v8 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7 ; GISEL-NEXT: v_trunc_f32_e32 v8, v8 ; GISEL-NEXT: v_trunc_f32_e32 v11, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v8 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v11 +; 
GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11 ; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GISEL-NEXT: v_mul_lo_u32 v12, v6, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v6, v1 -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v1 -; GISEL-NEXT: v_mul_hi_u32 v16, v6, v1 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v5 -; GISEL-NEXT: v_mul_lo_u32 v18, v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v19, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v15, v5, v6 +; GISEL-NEXT: v_mul_hi_u32 v16, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 +; GISEL-NEXT: v_mul_lo_u32 v18, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v19, v9, v7 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 ; GISEL-NEXT: v_mul_lo_u32 v15, v11, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v5, v17 +; GISEL-NEXT: v_mul_hi_u32 v18, v7, v17 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v5, v13 +; GISEL-NEXT: v_mul_lo_u32 v19, v7, v13 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 ; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14 -; GISEL-NEXT: v_mul_hi_u32 v18, v1, v14 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v14 ; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 ; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v12 +; GISEL-NEXT: v_mul_lo_u32 v16, v6, v12 ; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 ; GISEL-NEXT: v_mul_lo_u32 v15, v8, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v1, v12 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v12 ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v18 @@ -2544,7 +2544,7 @@ ; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 ; GISEL-NEXT: v_mul_lo_u32 v19, v11, v13 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_hi_u32 v19, v5, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc @@ -2565,35 +2565,35 @@ ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v18 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 ; GISEL-NEXT: v_addc_u32_e64 v14, s[4:5], v8, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v6, v1 -; GISEL-NEXT: v_mul_lo_u32 v7, v7, v1 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v17 +; GISEL-NEXT: v_mul_lo_u32 v15, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v5, v5, v6 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v17 ; GISEL-NEXT: v_addc_u32_e64 v16, s[6:7], v11, v13, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v18, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v18, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v16 ; GISEL-NEXT: v_mul_lo_u32 v19, v16, v17 ; GISEL-NEXT: v_add_i32_e64 v9, 
s[6:7], v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v10, v5, v17 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v17 ; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v5, v9 +; GISEL-NEXT: v_mul_lo_u32 v18, v7, v9 ; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], v19, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v18, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v6, v1 -; GISEL-NEXT: v_mul_lo_u32 v6, v6, v14 +; GISEL-NEXT: v_mul_hi_u32 v10, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v14 ; GISEL-NEXT: v_mul_lo_u32 v18, v14, v15 -; GISEL-NEXT: v_add_i32_e64 v6, s[8:9], v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v1, v15 -; GISEL-NEXT: v_add_i32_e64 v6, s[8:9], v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v1, v6 +; GISEL-NEXT: v_add_i32_e64 v4, s[8:9], v5, v4 +; GISEL-NEXT: v_mul_hi_u32 v5, v6, v15 +; GISEL-NEXT: v_add_i32_e64 v4, s[8:9], v4, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v6, v4 ; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v18, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v7, s[8:9], v10, v7 -; GISEL-NEXT: v_mov_b32_e32 v7, s10 +; GISEL-NEXT: v_add_i32_e64 v5, s[8:9], v10, v5 +; GISEL-NEXT: v_mov_b32_e32 v5, s10 ; GISEL-NEXT: v_mov_b32_e32 v10, s11 ; GISEL-NEXT: v_add_i32_e64 v8, s[10:11], v8, v12 ; GISEL-NEXT: v_mov_b32_e32 v12, s12 @@ -2602,26 +2602,26 @@ ; GISEL-NEXT: v_mul_hi_u32 v15, v16, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] ; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_lo_u32 v18, v14, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v14, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v18, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v6, v4 ; GISEL-NEXT: v_add_i32_e64 v13, s[8:9], v18, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v6, s[8:9], v13, v6 +; GISEL-NEXT: v_add_i32_e64 v4, s[8:9], v13, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[8:9] ; GISEL-NEXT: v_add_i32_e64 v13, s[8:9], v18, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], v19, v18 ; GISEL-NEXT: v_mul_lo_u32 v19, v16, v9 ; GISEL-NEXT: v_mul_hi_u32 v16, v16, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 ; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v19, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v15, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v19, v15 ; GISEL-NEXT: v_mov_b32_e32 v19, s13 -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v17 +; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] @@ -2631,16 +2631,16 @@ ; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v16, v15 ; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc ; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v14, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v6 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v0, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, 0, v1 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v13, v0, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, 0, v5 -; 
GISEL-NEXT: v_mul_hi_u32 v14, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 +; GISEL-NEXT: v_mul_lo_u32 v11, 0, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 ; GISEL-NEXT: v_mul_lo_u32 v15, v0, v6 ; GISEL-NEXT: v_mul_lo_u32 v16, 0, v6 ; GISEL-NEXT: v_mul_hi_u32 v17, v0, v6 @@ -2654,84 +2654,84 @@ ; GISEL-NEXT: v_mul_hi_u32 v9, 0, v9 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v16, v1 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v16, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v11, v5 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v17 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v3, v1 -; GISEL-NEXT: v_mul_lo_u32 v15, 0, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, v3, v1 +; GISEL-NEXT: v_mul_lo_u32 v13, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v15, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v5 -; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5 +; GISEL-NEXT: v_mul_lo_u32 v14, v1, v7 +; GISEL-NEXT: v_mul_lo_u32 v16, 0, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v11 ; GISEL-NEXT: v_mul_lo_u32 v6, v3, v6 -; GISEL-NEXT: v_mul_lo_u32 v8, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v1, v8 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v16, v8 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v7 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v1, vcc -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], 0, v1 +; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], 0, v4, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], 0, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v14 -; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, v5, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v5, s[6:7], 0, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v4 +; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, v6, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v6, s[6:7], 0, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[6:7] -; GISEL-NEXT: 
v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v8, v12, v11, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v5, vcc, 0, v5, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e64 v6, vcc, 0, v6, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v2, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v4 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v2, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v11, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v4, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v14, v19, v14, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v13, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v6, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v13, v4, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v15, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v6, v15, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; Index: llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -8507,16 +8507,16 @@ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 -; GFX6-NEXT: s_ashr_i32 s12, s3, 31 -; GFX6-NEXT: s_add_u32 s2, s2, s12 -; GFX6-NEXT: s_mov_b32 s13, s12 -; GFX6-NEXT: s_addc_u32 s3, s3, s12 -; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX6-NEXT: s_sub_u32 s4, 0, s2 -; GFX6-NEXT: s_subb_u32 s5, 0, s3 +; GFX6-NEXT: s_lshl_b64 s[4:5], s[2:3], s4 +; GFX6-NEXT: s_ashr_i32 s2, s5, 31 +; GFX6-NEXT: s_add_u32 s4, s4, s2 +; GFX6-NEXT: s_mov_b32 s3, s2 +; GFX6-NEXT: s_addc_u32 s5, s5, s2 +; GFX6-NEXT: s_xor_b64 s[12:13], s[4:5], s[2:3] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 +; 
GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 +; GFX6-NEXT: s_sub_u32 s4, 0, s12 +; GFX6-NEXT: s_subb_u32 s5, 0, s13 ; GFX6-NEXT: s_ashr_i32 s14, s11, 31 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 @@ -8594,23 +8594,23 @@ ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, s3 +; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, s13 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s12, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s11, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 +; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s12, v3 ; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 ; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] @@ -8620,16 +8620,16 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v6, s11 ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: s_xor_b64 s[0:1], s[14:15], s[12:13] +; GFX6-NEXT: s_xor_b64 s[0:1], s[14:15], s[2:3] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_xor_b32_e32 v1, s1, v1 @@ -9153,244 +9153,244 @@ ; GFX6-NEXT: v_mul_f32_e32 v1, s20, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 -; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s6, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GFX6-NEXT: v_mul_hi_u32 v1, s6, v3 +; GFX6-NEXT: v_mul_lo_u32 v0, s6, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s7, v3 +; GFX6-NEXT: v_mul_lo_u32 v5, s6, v3 +; GFX6-NEXT: v_add_i32_e32 v0, 
vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v4 +; GFX6-NEXT: v_mul_lo_u32 v0, v3, v1 +; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5 +; GFX6-NEXT: v_mul_hi_u32 v6, v3, v1 +; GFX6-NEXT: v_mul_hi_u32 v7, v2, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 -; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc -; GFX6-NEXT: v_mov_b32_e32 v4, 0 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, 0 -; GFX6-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc -; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[2:3] -; GFX6-NEXT: v_mul_lo_u32 v5, s6, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, s6, v0 -; GFX6-NEXT: v_mul_lo_u32 v8, s7, v0 +; GFX6-NEXT: v_mul_lo_u32 v6, v2, v5 +; GFX6-NEXT: v_mul_hi_u32 v5, v2, v5 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc +; GFX6-NEXT: v_mul_lo_u32 v5, v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v7, v0, vcc +; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GFX6-NEXT: v_add_i32_e64 v3, s[2:3], v3, v4 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v6, vcc +; GFX6-NEXT: v_addc_u32_e64 v4, vcc, v2, v5, s[2:3] +; GFX6-NEXT: v_mul_lo_u32 v6, s6, v4 +; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 +; GFX6-NEXT: v_mul_lo_u32 v8, s7, v3 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GFX6-NEXT: v_mul_lo_u32 v7, s6, v0 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 -; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 -; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 -; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_mul_lo_u32 v7, s6, v3 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GFX6-NEXT: v_mul_lo_u32 v10, v3, v6 +; GFX6-NEXT: v_mul_hi_u32 v12, v3, v6 +; GFX6-NEXT: v_mul_hi_u32 v11, v3, v7 +; GFX6-NEXT: v_mul_hi_u32 v9, v4, v7 +; GFX6-NEXT: v_mul_lo_u32 v7, v4, v7 +; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 ; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v2, v5 +; GFX6-NEXT: v_mul_lo_u32 v4, v4, v6 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v8, v0, vcc +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v1, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v2, v6, s[2:3] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_ashr_i32 s2, s9, 31 ; GFX6-NEXT: s_add_u32 s0, s8, s2 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: s_mov_b32 s3, s2 ; GFX6-NEXT: s_addc_u32 s1, s9, s2 ; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s8, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s8, v0 -; GFX6-NEXT: v_mul_hi_u32 v5, s8, v1 -; GFX6-NEXT: v_mul_hi_u32 v7, s9, v1 -; GFX6-NEXT: 
v_mul_lo_u32 v1, s9, v1 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s8, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, s8, v3 +; GFX6-NEXT: v_mul_hi_u32 v6, s8, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, s9, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, s9, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, s9, v3 +; GFX6-NEXT: v_mul_hi_u32 v3, s9, v3 +; GFX6-NEXT: s_xor_b64 s[16:17], s[2:3], s[16:17] +; GFX6-NEXT: s_ashr_i32 s2, s13, 31 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v7, v0, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s9, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s14, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s14, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s15, v0 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s14, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s14, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s15, v2 ; GFX6-NEXT: v_mov_b32_e32 v7, s15 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, s14, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s9, v2 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 -; GFX6-NEXT: v_subb_u32_e64 v5, s[0:1], v5, v7, vcc -; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s14, v3 -; GFX6-NEXT: v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v5 +; GFX6-NEXT: s_mov_b32 s3, s2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, s14, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s9, v4 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s8, v5 +; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc +; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s14, v5 +; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, v8, v7, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0 -; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v0 -; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1] -; GFX6-NEXT: s_ashr_i32 s8, s13, 31 -; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GFX6-NEXT: s_add_u32 s12, s12, s8 -; GFX6-NEXT: v_cndmask_b32_e64 v5, v10, v8, s[0:1] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v6 +; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v2 +; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v2 +; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] +; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 +; GFX6-NEXT: s_add_u32 s8, s12, s2 +; GFX6-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v8, s9 -; GFX6-NEXT: s_mov_b32 s9, s8 -; GFX6-NEXT: s_addc_u32 s13, s13, s8 -; GFX6-NEXT: 
s_xor_b64 s[12:13], s[12:13], s[8:9] -; GFX6-NEXT: v_cvt_f32_u32_e32 v10, s12 -; GFX6-NEXT: v_cvt_f32_u32_e32 v11, s13 -; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s15, v2 +; GFX6-NEXT: s_addc_u32 s9, s13, s2 +; GFX6-NEXT: s_xor_b64 s[8:9], s[8:9], s[2:3] +; GFX6-NEXT: v_cvt_f32_u32_e32 v10, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v11, s9 +; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s15, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s14, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s15, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s14, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s15, v4 ; GFX6-NEXT: v_mac_f32_e32 v10, s18, v11 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc -; GFX6-NEXT: v_rcp_f32_e32 v3, v10 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX6-NEXT: s_sub_u32 s14, 0, s12 -; GFX6-NEXT: v_mul_f32_e32 v3, s19, v3 -; GFX6-NEXT: v_mul_f32_e32 v5, s20, v3 -; GFX6-NEXT: v_trunc_f32_e32 v5, v5 -; GFX6-NEXT: v_mac_f32_e32 v3, s21, v5 -; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc +; GFX6-NEXT: v_rcp_f32_e32 v5, v10 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX6-NEXT: s_sub_u32 s12, 0, s8 +; GFX6-NEXT: v_mul_f32_e32 v5, s19, v5 +; GFX6-NEXT: v_mul_f32_e32 v6, s20, v5 +; GFX6-NEXT: v_trunc_f32_e32 v6, v6 +; GFX6-NEXT: v_mac_f32_e32 v5, s21, v6 ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v9, v7, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_mul_hi_u32 v2, s14, v3 -; GFX6-NEXT: v_mul_lo_u32 v7, s14, v5 -; GFX6-NEXT: s_subb_u32 s15, 0, s13 -; GFX6-NEXT: v_mul_lo_u32 v8, s15, v3 -; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; GFX6-NEXT: v_mul_lo_u32 v7, s14, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; GFX6-NEXT: v_mul_lo_u32 v8, v3, v2 -; GFX6-NEXT: v_mul_hi_u32 v10, v3, v2 -; GFX6-NEXT: v_mul_hi_u32 v9, v3, v7 -; GFX6-NEXT: v_mul_hi_u32 v11, v5, v2 -; GFX6-NEXT: v_mul_lo_u32 v2, v5, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, s3, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX6-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_mul_hi_u32 v4, s12, v5 +; GFX6-NEXT: v_mul_lo_u32 v7, s12, v6 +; GFX6-NEXT: s_subb_u32 s13, 0, s9 +; GFX6-NEXT: v_mul_lo_u32 v8, s13, v5 +; GFX6-NEXT: v_xor_b32_e32 v2, s16, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GFX6-NEXT: v_mul_lo_u32 v7, s12, v5 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GFX6-NEXT: v_mul_lo_u32 v8, v5, v4 +; GFX6-NEXT: v_mul_hi_u32 v10, v5, v4 +; GFX6-NEXT: v_mul_hi_u32 v9, v5, v7 +; GFX6-NEXT: v_mul_hi_u32 v11, v6, v4 +; GFX6-NEXT: v_mul_lo_u32 v4, v6, v4 +; GFX6-NEXT: v_xor_b32_e32 v3, s17, v3 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc -; GFX6-NEXT: v_mul_lo_u32 v10, v5, v7 -; GFX6-NEXT: v_mul_hi_u32 v7, v5, v7 +; GFX6-NEXT: v_mul_lo_u32 v10, v6, v7 +; GFX6-NEXT: v_mul_hi_u32 v7, v6, v7 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v11, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GFX6-NEXT: v_add_i32_e64 v2, s[0:1], v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v6, v8, vcc -; GFX6-NEXT: v_addc_u32_e64 v3, vcc, v5, v7, 
s[0:1] -; GFX6-NEXT: v_mul_lo_u32 v8, s14, v3 -; GFX6-NEXT: v_mul_hi_u32 v9, s14, v2 -; GFX6-NEXT: v_mul_lo_u32 v10, s15, v2 +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v11, v0, vcc +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GFX6-NEXT: v_add_i32_e64 v4, s[0:1], v5, v4 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v1, v8, vcc +; GFX6-NEXT: v_addc_u32_e64 v5, vcc, v6, v7, s[0:1] +; GFX6-NEXT: v_mul_lo_u32 v8, s12, v5 +; GFX6-NEXT: v_mul_hi_u32 v9, s12, v4 +; GFX6-NEXT: v_mul_lo_u32 v10, s13, v4 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GFX6-NEXT: v_mul_lo_u32 v9, s14, v2 +; GFX6-NEXT: v_mul_lo_u32 v9, s12, v4 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GFX6-NEXT: v_mul_lo_u32 v12, v2, v8 -; GFX6-NEXT: v_mul_hi_u32 v14, v2, v8 -; GFX6-NEXT: v_mul_hi_u32 v13, v2, v9 -; GFX6-NEXT: v_mul_hi_u32 v11, v3, v9 -; GFX6-NEXT: v_mul_lo_u32 v9, v3, v9 -; GFX6-NEXT: v_mul_hi_u32 v10, v3, v8 +; GFX6-NEXT: v_mul_lo_u32 v12, v4, v8 +; GFX6-NEXT: v_mul_hi_u32 v14, v4, v8 +; GFX6-NEXT: v_mul_hi_u32 v13, v4, v9 +; GFX6-NEXT: v_mul_hi_u32 v11, v5, v9 +; GFX6-NEXT: v_mul_lo_u32 v9, v5, v9 +; GFX6-NEXT: v_mul_hi_u32 v10, v5, v8 ; GFX6-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; GFX6-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, v3, v8 +; GFX6-NEXT: v_mul_lo_u32 v5, v5, v8 ; GFX6-NEXT: v_add_i32_e32 v9, vcc, v12, v9 ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v13, v11, vcc -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v10, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v6, v8, vcc -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GFX6-NEXT: s_ashr_i32 s14, s11, 31 -; GFX6-NEXT: v_addc_u32_e64 v5, vcc, v5, v8, s[0:1] -; GFX6-NEXT: s_add_u32 s0, s10, s14 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: s_mov_b32 s15, s14 -; GFX6-NEXT: s_addc_u32 s1, s11, s14 -; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s10, v3 -; GFX6-NEXT: v_mul_hi_u32 v7, s10, v2 -; GFX6-NEXT: v_mul_hi_u32 v9, s10, v3 -; GFX6-NEXT: v_mul_hi_u32 v10, s11, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s11, v3 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v10, v0, vcc +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GFX6-NEXT: s_ashr_i32 s12, s11, 31 +; GFX6-NEXT: v_addc_u32_e64 v6, vcc, v6, v8, s[0:1] +; GFX6-NEXT: s_add_u32 s0, s10, s12 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GFX6-NEXT: s_mov_b32 s13, s12 +; GFX6-NEXT: s_addc_u32 s1, s11, s12 +; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[12:13] +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, s10, v5 +; GFX6-NEXT: v_mul_hi_u32 v7, s10, v4 +; GFX6-NEXT: v_mul_hi_u32 v9, s10, v5 +; GFX6-NEXT: v_mul_hi_u32 v10, s11, v5 +; GFX6-NEXT: v_mul_lo_u32 v5, s11, v5 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc -; GFX6-NEXT: v_mul_lo_u32 v9, s11, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2 -; GFX6-NEXT: v_mov_b32_e32 v8, s3 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v2, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; 
GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s11, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, s13 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s10, v5 +; GFX6-NEXT: v_mul_lo_u32 v9, s11, v4 +; GFX6-NEXT: v_mul_hi_u32 v4, s11, v4 +; GFX6-NEXT: v_mov_b32_e32 v8, s17 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v10, v0, vcc +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v0, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, s8, v5 +; GFX6-NEXT: v_mul_hi_u32 v7, s8, v4 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s16, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, s9, v4 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v3, v8, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v7, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, s9 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, s8, v4 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s11, v2 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 ; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc -; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v5 +; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s8, v3 ; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v2 -; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v2 -; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v4 +; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v5, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v4 +; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v5, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v8, s11 -; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 +; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: s_xor_b64 s[0:1], s[14:15], s[8:9] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2 -; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, s1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v2 -; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[2:3] +; GFX6-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX6-NEXT: v_xor_b32_e32 v3, 
s0, v3 +; GFX6-NEXT: v_xor_b32_e32 v4, s1, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, s1 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v3 +; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -9416,217 +9416,217 @@ ; GFX9-NEXT: s_subb_u32 s4, 0, s11 ; GFX9-NEXT: v_mac_f32_e32 v0, s18, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mul_f32_e32 v0, s19, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, s20, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, s21, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s14, v0 -; GFX9-NEXT: v_mul_lo_u32 v2, s14, v1 -; GFX9-NEXT: v_mul_lo_u32 v5, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, s14, v0 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 -; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v4, s14, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, s14, v0 -; GFX9-NEXT: v_mul_lo_u32 v8, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v9, s14, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, s14, v3 +; GFX9-NEXT: v_mul_lo_u32 v0, s14, v2 +; GFX9-NEXT: v_mul_lo_u32 v5, s4, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, s14, v3 +; GFX9-NEXT: v_add_u32_e32 v0, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v5, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v1, v3, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v3, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, v3, v5 +; GFX9-NEXT: v_mul_hi_u32 v8, v2, v5 +; GFX9-NEXT: v_mul_lo_u32 v5, v2, v5 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, v2, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v2, v4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_add_co_u32_e64 v3, s[2:3], v3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v4, vcc, v2, v5, s[2:3] +; GFX9-NEXT: v_mul_lo_u32 v6, s14, v4 +; GFX9-NEXT: v_mul_hi_u32 v7, s14, v3 +; GFX9-NEXT: v_mul_lo_u32 v8, s4, v3 +; GFX9-NEXT: v_mul_lo_u32 v9, s14, v3 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: v_add_u32_e32 v4, v7, v4 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v8 -; GFX9-NEXT: v_mul_lo_u32 v10, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v11, v0, v9 -; GFX9-NEXT: v_mul_hi_u32 v12, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v8, v2, v9 -; GFX9-NEXT: v_mul_lo_u32 v9, v2, v9 +; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 +; GFX9-NEXT: v_mul_lo_u32 v10, v3, v6 +; GFX9-NEXT: v_mul_hi_u32 v11, v3, v9 +; GFX9-NEXT: 
v_mul_hi_u32 v12, v3, v6 +; GFX9-NEXT: v_mul_hi_u32 v8, v4, v9 +; GFX9-NEXT: v_mul_lo_u32 v9, v4, v9 ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 -; GFX9-NEXT: v_mul_hi_u32 v7, v2, v4 +; GFX9-NEXT: v_mul_hi_u32 v7, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, v6 ; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s14, s5, 31 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v5, v4, vcc -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v1, v6, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v2, v6, s[2:3] ; GFX9-NEXT: s_add_u32 s2, s4, s14 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 ; GFX9-NEXT: s_addc_u32 s3, s5, s14 ; GFX9-NEXT: s_mov_b32 s15, s14 ; GFX9-NEXT: s_xor_b64 s[16:17], s[2:3], s[14:15] -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s16, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s16, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s16, v1 -; GFX9-NEXT: v_mul_hi_u32 v7, s17, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s17, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s17, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s17, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, s16, v2 +; GFX9-NEXT: v_mul_hi_u32 v5, s16, v3 +; GFX9-NEXT: v_mul_hi_u32 v6, s16, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, s17, v2 +; GFX9-NEXT: v_mul_lo_u32 v2, s17, v2 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, s17, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s17, v3 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_xor_b64 s[12:13], s[14:15], s[12:13] -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s10, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s10, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, s11, v0 +; GFX9-NEXT: s_xor_b64 s[14:15], s[14:15], s[12:13] +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, s10, v3 +; GFX9-NEXT: v_mul_hi_u32 v5, s10, v2 +; GFX9-NEXT: v_mul_lo_u32 v6, s11, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-NEXT: s_ashr_i32 s14, s9, 31 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_lo_u32 v3, s10, v0 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_sub_u32_e32 v4, s17, v2 -; GFX9-NEXT: s_mov_b32 s15, s14 -; GFX9-NEXT: v_sub_co_u32_e64 v3, s[0:1], s16, v3 -; GFX9-NEXT: v_subb_co_u32_e64 v4, vcc, v4, v7, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s10, v3 -; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v4 +; GFX9-NEXT: s_ashr_i32 
s12, s9, 31 +; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, s10, v2 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v6 +; GFX9-NEXT: v_sub_u32_e32 v6, s17, v4 +; GFX9-NEXT: s_mov_b32 s13, s12 +; GFX9-NEXT: v_sub_co_u32_e64 v5, s[0:1], s16, v5 +; GFX9-NEXT: v_subb_co_u32_e64 v6, vcc, v6, v7, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s10, v5 +; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v6, vcc +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 1, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v1, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, v8, s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 2, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, s17 -; GFX9-NEXT: v_subb_co_u32_e64 v2, vcc, v8, v2, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s8, s14 -; GFX9-NEXT: s_addc_u32 s1, s9, s14 -; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[14:15] +; GFX9-NEXT: v_subb_co_u32_e64 v4, vcc, v8, v4, s[0:1] +; GFX9-NEXT: s_add_u32 s0, s8, s12 +; GFX9-NEXT: s_addc_u32 s1, s9, s12 +; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] ; GFX9-NEXT: v_cvt_f32_u32_e32 v10, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v11, s9 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v4 ; GFX9-NEXT: v_mac_f32_e32 v10, s18, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc -; GFX9-NEXT: v_rcp_f32_e32 v3, v10 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, v7, s[2:3] -; GFX9-NEXT: v_mul_f32_e32 v3, s19, v3 -; GFX9-NEXT: v_mul_f32_e32 v4, s20, v3 -; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mac_f32_e32 v3, s21, v4 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc +; GFX9-NEXT: v_rcp_f32_e32 v5, v10 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[2:3] +; GFX9-NEXT: v_mul_f32_e32 v5, s19, v5 +; GFX9-NEXT: v_mul_f32_e32 v6, s20, v5 +; GFX9-NEXT: v_trunc_f32_e32 v6, v6 +; GFX9-NEXT: v_mac_f32_e32 v5, s21, v6 +; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX9-NEXT: s_sub_u32 s2, 0, s8 ; GFX9-NEXT: s_subb_u32 s3, 0, s9 -; GFX9-NEXT: v_mul_hi_u32 v7, s2, v3 -; GFX9-NEXT: v_mul_lo_u32 v8, s2, v4 -; GFX9-NEXT: v_mul_lo_u32 v9, s3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s2, v3 +; GFX9-NEXT: v_mul_hi_u32 v7, s2, v5 +; GFX9-NEXT: v_mul_lo_u32 v8, s2, v6 +; 
GFX9-NEXT: v_mul_lo_u32 v9, s3, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, s2, v5 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v9 -; GFX9-NEXT: v_mul_lo_u32 v8, v3, v7 -; GFX9-NEXT: v_mul_hi_u32 v9, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v10, v3, v7 -; GFX9-NEXT: v_mul_hi_u32 v11, v4, v7 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v7 +; GFX9-NEXT: v_mul_lo_u32 v8, v5, v7 +; GFX9-NEXT: v_mul_hi_u32 v9, v5, v4 +; GFX9-NEXT: v_mul_hi_u32 v10, v5, v7 +; GFX9-NEXT: v_mul_hi_u32 v11, v6, v7 +; GFX9-NEXT: v_mul_lo_u32 v7, v6, v7 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v4, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v4, v2 +; GFX9-NEXT: v_mul_lo_u32 v10, v6, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v6, v4 ; GFX9-NEXT: s_ashr_i32 s10, s7, 31 ; GFX9-NEXT: s_mov_b32 s11, s10 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v9, v2, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v5, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v4, v7, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v8, s2, v3 -; GFX9-NEXT: v_mul_hi_u32 v9, s2, v2 -; GFX9-NEXT: v_mul_lo_u32 v10, s3, v2 -; GFX9-NEXT: v_mul_lo_u32 v11, s2, v2 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 +; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v5, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v8, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v5, vcc, v6, v7, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v8, s2, v5 +; GFX9-NEXT: v_mul_hi_u32 v9, s2, v4 +; GFX9-NEXT: v_mul_lo_u32 v10, s3, v4 +; GFX9-NEXT: v_mul_lo_u32 v11, s2, v4 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v7 ; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 ; GFX9-NEXT: v_add_u32_e32 v8, v8, v10 -; GFX9-NEXT: v_mul_lo_u32 v12, v2, v8 -; GFX9-NEXT: v_mul_hi_u32 v13, v2, v11 -; GFX9-NEXT: v_mul_hi_u32 v14, v2, v8 -; GFX9-NEXT: v_mul_hi_u32 v10, v3, v11 -; GFX9-NEXT: v_mul_lo_u32 v11, v3, v11 +; GFX9-NEXT: v_mul_lo_u32 v12, v4, v8 +; GFX9-NEXT: v_mul_hi_u32 v13, v4, v11 +; GFX9-NEXT: v_mul_hi_u32 v14, v4, v8 +; GFX9-NEXT: v_mul_hi_u32 v10, v5, v11 +; GFX9-NEXT: v_mul_lo_u32 v11, v5, v11 ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 -; GFX9-NEXT: v_mul_hi_u32 v9, v3, v8 +; GFX9-NEXT: v_mul_hi_u32 v9, v5, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v14, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v3, v8 +; GFX9-NEXT: v_mul_lo_u32 v5, v5, v8 ; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v10, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v5, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1] +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v10, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v8, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v6, vcc, v6, v8, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s6, s10 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9-NEXT: s_addc_u32 s1, s7, s10 ; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s6, v3 -; GFX9-NEXT: v_mul_hi_u32 v7, s6, v2 -; 
GFX9-NEXT: v_mul_hi_u32 v9, s6, v3 -; GFX9-NEXT: v_mul_hi_u32 v10, s7, v3 -; GFX9-NEXT: v_mul_lo_u32 v3, s7, v3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, s6, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, s6, v4 +; GFX9-NEXT: v_mul_hi_u32 v9, s6, v5 +; GFX9-NEXT: v_mul_hi_u32 v10, s7, v5 +; GFX9-NEXT: v_mul_lo_u32 v5, s7, v5 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v9, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, s7, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, s7, v2 -; GFX9-NEXT: v_xor_b32_e32 v0, s12, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s13, v1 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v2, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v10, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s8, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, s8, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, s9, v2 -; GFX9-NEXT: v_mov_b32_e32 v8, s13 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s12, v0 -; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 -; GFX9-NEXT: v_mul_lo_u32 v5, s8, v2 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc -; GFX9-NEXT: v_sub_u32_e32 v7, s7, v4 +; GFX9-NEXT: v_mul_lo_u32 v9, s7, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, s7, v4 +; GFX9-NEXT: v_xor_b32_e32 v2, s14, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, s15, v3 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v10, v0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, s8, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, s8, v4 +; GFX9-NEXT: v_mov_b32_e32 v8, s15 +; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s14, v2 +; GFX9-NEXT: v_mul_lo_u32 v9, s9, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v8, vcc +; GFX9-NEXT: v_add_u32_e32 v3, v7, v6 +; GFX9-NEXT: v_mul_lo_u32 v6, s8, v4 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v9 +; GFX9-NEXT: v_sub_u32_e32 v7, s7, v3 ; GFX9-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-NEXT: v_sub_co_u32_e64 v5, s[0:1], s6, v5 +; GFX9-NEXT: v_sub_co_u32_e64 v6, s[0:1], s6, v6 ; GFX9-NEXT: v_subb_co_u32_e64 v7, vcc, v7, v8, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s8, v5 +; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s8, v6 ; GFX9-NEXT: v_subbrev_co_u32_e32 v7, vcc, 0, v7, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc @@ -9634,32 +9634,32 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 1, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 1, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v9, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v9, s7 -; GFX9-NEXT: v_subb_co_u32_e64 v4, vcc, v9, v4, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v4 +; GFX9-NEXT: v_subb_co_u32_e64 v3, vcc, v9, v3, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 -; GFX9-NEXT: 
v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, v8, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[14:15] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2 -; GFX9-NEXT: v_xor_b32_e32 v3, s1, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, v10, v8, s[2:3] +; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_xor_b32_e32 v4, s0, v4 +; GFX9-NEXT: v_xor_b32_e32 v5, s1, v3 +; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v5, v6, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[4:5] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y %r = sdiv <2 x i64> %x, %shl.y @@ -10355,211 +10355,211 @@ ; GFX6-NEXT: v_mul_f32_e32 v1, s20, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_ashr_i32 s12, s9, 31 ; GFX6-NEXT: s_add_u32 s0, s8, s12 -; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 -; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s6, v0 +; GFX6-NEXT: v_mul_hi_u32 v1, s6, v3 +; GFX6-NEXT: v_mul_lo_u32 v0, s6, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s7, v3 +; GFX6-NEXT: v_mul_lo_u32 v5, s6, v3 ; GFX6-NEXT: s_mov_b32 s13, s12 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 -; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v4 +; GFX6-NEXT: v_mul_lo_u32 v0, v3, v1 +; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5 +; GFX6-NEXT: v_mul_hi_u32 v6, v3, v1 +; GFX6-NEXT: v_mul_hi_u32 v7, v2, v1 ; GFX6-NEXT: s_addc_u32 s1, s9, s12 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, v2, v5 +; GFX6-NEXT: v_mul_hi_u32 v5, v2, v5 ; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc -; GFX6-NEXT: v_mov_b32_e32 v4, 0 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, 0 -; GFX6-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc -; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[2:3] -; GFX6-NEXT: v_mul_lo_u32 
v5, s6, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, s6, v0 -; GFX6-NEXT: v_mul_lo_u32 v8, s7, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc +; GFX6-NEXT: v_mul_lo_u32 v5, v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v7, v0, vcc +; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GFX6-NEXT: v_add_i32_e64 v3, s[2:3], v3, v4 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v6, vcc +; GFX6-NEXT: v_addc_u32_e64 v4, vcc, v2, v5, s[2:3] +; GFX6-NEXT: v_mul_lo_u32 v6, s6, v4 +; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 +; GFX6-NEXT: v_mul_lo_u32 v8, s7, v3 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GFX6-NEXT: v_mul_lo_u32 v7, s6, v0 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 -; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 -; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 -; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_mul_lo_u32 v7, s6, v3 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GFX6-NEXT: v_mul_lo_u32 v10, v3, v6 +; GFX6-NEXT: v_mul_hi_u32 v12, v3, v6 +; GFX6-NEXT: v_mul_hi_u32 v11, v3, v7 +; GFX6-NEXT: v_mul_hi_u32 v9, v4, v7 +; GFX6-NEXT: v_mul_lo_u32 v7, v4, v7 +; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 ; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v2, v5 +; GFX6-NEXT: v_mul_lo_u32 v4, v4, v6 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s8, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s8, v0 -; GFX6-NEXT: v_mul_hi_u32 v5, s8, v1 -; GFX6-NEXT: v_mul_hi_u32 v7, s9, v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s9, v1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s9, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v8, v0, vcc +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v1, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, s16, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s16, v0 -; GFX6-NEXT: v_mul_lo_u32 v3, s17, v0 -; GFX6-NEXT: v_mul_lo_u32 v0, s16, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 -; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s16, v0 -; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v2, s[0:1] +; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v2, v6, s[2:3] +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s8, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, s8, v3 +; GFX6-NEXT: v_mul_hi_u32 v6, s8, v2 
+; GFX6-NEXT: v_mul_hi_u32 v7, s9, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, s9, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, s9, v3 +; GFX6-NEXT: v_mul_hi_u32 v3, s9, v3 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v7, v0, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v3, s16, v3 +; GFX6-NEXT: v_mul_hi_u32 v4, s16, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s17, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, s16, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s9, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, s17 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s8, v2 +; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc +; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s16, v2 +; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] ; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v7 -; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] +; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v5 -; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s16, v5 +; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v6 +; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s16, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] ; GFX6-NEXT: s_ashr_i32 s2, s15, 31 -; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] +; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 ; GFX6-NEXT: s_add_u32 s8, s14, s2 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v7, s9 ; GFX6-NEXT: s_mov_b32 s3, s2 ; GFX6-NEXT: s_addc_u32 s9, s15, s2 ; GFX6-NEXT: s_xor_b64 s[8:9], s[8:9], s[2:3] ; GFX6-NEXT: v_cvt_f32_u32_e32 v8, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v9, s9 -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v7, v1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v1 +; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GFX6-NEXT: v_mac_f32_e32 v8, s18, v9 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v2 ; GFX6-NEXT: v_rcp_f32_e32 v8, v8 ; GFX6-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s17, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s17, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v3, s[0:1] -; GFX6-NEXT: v_mul_f32_e32 v3, s19, v8 -; GFX6-NEXT: v_mul_f32_e32 v5, s20, v3 -; GFX6-NEXT: v_trunc_f32_e32 v5, v5 -; GFX6-NEXT: v_mac_f32_e32 v3, s21, v5 -; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] +; GFX6-NEXT: v_mul_f32_e32 v5, s19, v8 +; GFX6-NEXT: v_mul_f32_e32 v6, s20, v5 +; GFX6-NEXT: v_trunc_f32_e32 v6, v6 +; GFX6-NEXT: v_mac_f32_e32 v5, s21, v6 ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX6-NEXT: s_sub_u32 s2, 0, s8 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_mul_hi_u32 v2, s2, v3 -; GFX6-NEXT: v_mul_lo_u32 v7, s2, v5 +; 
GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_mul_hi_u32 v4, s2, v5 +; GFX6-NEXT: v_mul_lo_u32 v7, s2, v6 ; GFX6-NEXT: s_subb_u32 s3, 0, s9 -; GFX6-NEXT: v_mul_lo_u32 v8, s3, v3 +; GFX6-NEXT: v_mul_lo_u32 v8, s3, v5 ; GFX6-NEXT: s_ashr_i32 s14, s11, 31 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; GFX6-NEXT: v_mul_lo_u32 v7, s2, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; GFX6-NEXT: v_mul_lo_u32 v8, v3, v2 -; GFX6-NEXT: v_mul_hi_u32 v10, v3, v2 -; GFX6-NEXT: v_mul_hi_u32 v9, v3, v7 -; GFX6-NEXT: v_mul_hi_u32 v11, v5, v2 -; GFX6-NEXT: v_mul_lo_u32 v2, v5, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GFX6-NEXT: v_mul_lo_u32 v7, s2, v5 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GFX6-NEXT: v_mul_lo_u32 v8, v5, v4 +; GFX6-NEXT: v_mul_hi_u32 v10, v5, v4 +; GFX6-NEXT: v_mul_hi_u32 v9, v5, v7 +; GFX6-NEXT: v_mul_hi_u32 v11, v6, v4 +; GFX6-NEXT: v_mul_lo_u32 v4, v6, v4 ; GFX6-NEXT: s_mov_b32 s15, s14 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc -; GFX6-NEXT: v_mul_lo_u32 v10, v5, v7 -; GFX6-NEXT: v_mul_hi_u32 v7, v5, v7 -; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 -; GFX6-NEXT: v_xor_b32_e32 v1, s12, v1 +; GFX6-NEXT: v_mul_lo_u32 v10, v6, v7 +; GFX6-NEXT: v_mul_hi_u32 v7, v6, v7 +; GFX6-NEXT: v_xor_b32_e32 v2, s12, v2 +; GFX6-NEXT: v_xor_b32_e32 v3, s12, v3 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v11, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GFX6-NEXT: v_add_i32_e64 v2, s[0:1], v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v6, v8, vcc -; GFX6-NEXT: v_addc_u32_e64 v3, vcc, v5, v7, s[0:1] -; GFX6-NEXT: v_mul_lo_u32 v8, s2, v3 -; GFX6-NEXT: v_mul_hi_u32 v9, s2, v2 -; GFX6-NEXT: v_mul_lo_u32 v10, s3, v2 +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v11, v0, vcc +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GFX6-NEXT: v_add_i32_e64 v4, s[0:1], v5, v4 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v1, v8, vcc +; GFX6-NEXT: v_addc_u32_e64 v5, vcc, v6, v7, s[0:1] +; GFX6-NEXT: v_mul_lo_u32 v8, s2, v5 +; GFX6-NEXT: v_mul_hi_u32 v9, s2, v4 +; GFX6-NEXT: v_mul_lo_u32 v10, s3, v4 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GFX6-NEXT: v_mul_lo_u32 v9, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v9, s2, v4 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GFX6-NEXT: v_mul_lo_u32 v12, v2, v8 -; GFX6-NEXT: v_mul_hi_u32 v14, v2, v8 -; GFX6-NEXT: v_mul_hi_u32 v13, v2, v9 -; GFX6-NEXT: v_mul_hi_u32 v11, v3, v9 -; GFX6-NEXT: v_mul_lo_u32 v9, v3, v9 -; GFX6-NEXT: v_mul_hi_u32 v10, v3, v8 +; GFX6-NEXT: v_mul_lo_u32 v12, v4, v8 +; GFX6-NEXT: v_mul_hi_u32 v14, v4, v8 +; GFX6-NEXT: v_mul_hi_u32 v13, v4, v9 +; GFX6-NEXT: v_mul_hi_u32 v11, v5, v9 +; GFX6-NEXT: v_mul_lo_u32 v9, v5, v9 +; GFX6-NEXT: v_mul_hi_u32 v10, v5, v8 ; GFX6-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; GFX6-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, v3, v8 +; GFX6-NEXT: v_mul_lo_u32 v5, v5, v8 ; GFX6-NEXT: v_add_i32_e32 v9, vcc, v12, v9 ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v13, v11, vcc -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v10, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v6, v8, vcc -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GFX6-NEXT: v_addc_u32_e64 v5, vcc, v5, v8, s[0:1] +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v10, v0, vcc +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GFX6-NEXT: v_addc_u32_e64 v6, vcc, v6, v8, s[0:1] ; GFX6-NEXT: 
s_add_u32 s0, s10, s14 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GFX6-NEXT: s_addc_u32 s1, s11, s14 ; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s10, v3 -; GFX6-NEXT: v_mul_hi_u32 v7, s10, v2 -; GFX6-NEXT: v_mul_hi_u32 v9, s10, v3 -; GFX6-NEXT: v_mul_hi_u32 v10, s11, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s11, v3 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, s10, v5 +; GFX6-NEXT: v_mul_hi_u32 v7, s10, v4 +; GFX6-NEXT: v_mul_hi_u32 v9, s10, v5 +; GFX6-NEXT: v_mul_hi_u32 v10, s11, v5 +; GFX6-NEXT: v_mul_lo_u32 v5, s11, v5 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc -; GFX6-NEXT: v_mul_lo_u32 v9, s11, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2 +; GFX6-NEXT: v_mul_lo_u32 v9, s11, v4 +; GFX6-NEXT: v_mul_hi_u32 v4, s11, v4 ; GFX6-NEXT: v_mov_b32_e32 v8, s12 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v2, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, s8, v3 -; GFX6-NEXT: v_mul_hi_u32 v4, s8, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s9, v2 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s8, v2 -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s11, v3 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v10, v0, vcc +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v1, v0, vcc +; GFX6-NEXT: v_mul_lo_u32 v5, s8, v0 +; GFX6-NEXT: v_mul_hi_u32 v6, s8, v4 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, s9, v4 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v3, v8, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, s9 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, s8, v4 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s11, v2 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s8, v2 +; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s8, v3 ; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] ; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v7 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1] @@ -10573,22 +10573,22 @@ ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v7, s11 -; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 +; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v7, v2, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: 
v_xor_b32_e32 v2, s14, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX6-NEXT: v_xor_b32_e32 v3, s14, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, s14 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s14, v2 -; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GFX6-NEXT: v_xor_b32_e32 v4, s14, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, s14 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s14, v3 +; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -10614,107 +10614,107 @@ ; GFX9-NEXT: s_subb_u32 s4, 0, s13 ; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_mul_f32_e32 v0, s17, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, s18, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 -; GFX9-NEXT: v_mul_lo_u32 v2, s8, v1 -; GFX9-NEXT: v_mul_lo_u32 v5, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, s8, v0 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 -; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v4, s8, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, s8, v0 -; GFX9-NEXT: v_mul_lo_u32 v8, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v9, s8, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, s8, v3 +; GFX9-NEXT: v_mul_lo_u32 v0, s8, v2 +; GFX9-NEXT: v_mul_lo_u32 v5, s4, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, s8, v3 +; GFX9-NEXT: v_add_u32_e32 v0, v1, v0 +; GFX9-NEXT: v_add_u32_e32 v5, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v1, v3, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v3, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, v3, v5 +; GFX9-NEXT: v_mul_hi_u32 v8, v2, v5 +; GFX9-NEXT: v_mul_lo_u32 v5, v2, v5 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, v2, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v2, v4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_add_co_u32_e64 v3, s[2:3], v3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v4, vcc, v2, v5, s[2:3] +; GFX9-NEXT: v_mul_lo_u32 v6, s8, v4 +; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3 +; GFX9-NEXT: v_mul_lo_u32 v8, s4, v3 +; GFX9-NEXT: v_mul_lo_u32 v9, s8, v3 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: v_add_u32_e32 v4, v7, v4 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v8 -; GFX9-NEXT: v_mul_lo_u32 v10, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v11, v0, v9 -; 
GFX9-NEXT: v_mul_hi_u32 v12, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v8, v2, v9 -; GFX9-NEXT: v_mul_lo_u32 v9, v2, v9 +; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 +; GFX9-NEXT: v_mul_lo_u32 v10, v3, v6 +; GFX9-NEXT: v_mul_hi_u32 v11, v3, v9 +; GFX9-NEXT: v_mul_hi_u32 v12, v3, v6 +; GFX9-NEXT: v_mul_hi_u32 v8, v4, v9 +; GFX9-NEXT: v_mul_lo_u32 v9, v4, v9 ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 -; GFX9-NEXT: v_mul_hi_u32 v7, v2, v4 +; GFX9-NEXT: v_mul_hi_u32 v7, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, v6 ; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s8, s5, 31 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v5, v4, vcc -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v1, v6, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v2, v6, s[2:3] ; GFX9-NEXT: s_add_u32 s2, s4, s8 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 ; GFX9-NEXT: s_addc_u32 s3, s5, s8 ; GFX9-NEXT: s_mov_b32 s9, s8 ; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s14, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s14, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s14, v1 -; GFX9-NEXT: v_mul_hi_u32 v7, s15, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s15, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s15, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s15, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, s14, v2 +; GFX9-NEXT: v_mul_hi_u32 v5, s14, v3 +; GFX9-NEXT: v_mul_hi_u32 v6, s14, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, s15, v2 +; GFX9-NEXT: v_mul_lo_u32 v2, s15, v2 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, s15, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s15, v3 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v1, s12, v1 -; GFX9-NEXT: v_mul_hi_u32 v2, s12, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s13, v0 -; GFX9-NEXT: v_mul_lo_u32 v0, s12, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_sub_co_u32_e64 v0, s[0:1], s14, v0 -; GFX9-NEXT: v_sub_u32_e32 v2, s15, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: v_subb_co_u32_e64 v2, vcc, v2, v3, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[2:3], s12, v0 -; GFX9-NEXT: v_subbrev_co_u32_e64 v7, vcc, 0, v2, s[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, s12, v3 +; GFX9-NEXT: v_mul_hi_u32 v4, s12, v2 
+; GFX9-NEXT: v_mul_lo_u32 v5, s13, v2 +; GFX9-NEXT: v_mul_lo_u32 v2, s12, v2 +; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 +; GFX9-NEXT: v_sub_co_u32_e64 v2, s[0:1], s14, v2 +; GFX9-NEXT: v_sub_u32_e32 v4, s15, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-NEXT: v_subb_co_u32_e64 v4, vcc, v4, v5, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[2:3], s12, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v7, vcc, 0, v4, s[2:3] ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v4 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s13, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; GFX9-NEXT: v_subb_co_u32_e64 v2, vcc, v2, v3, s[2:3] -; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s12, v4 -; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX9-NEXT: v_subb_co_u32_e64 v4, vcc, v4, v5, s[2:3] +; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s12, v6 +; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v7, s15 -; GFX9-NEXT: v_subb_co_u32_e64 v1, vcc, v7, v1, s[0:1] +; GFX9-NEXT: v_subb_co_u32_e64 v3, vcc, v7, v3, s[0:1] ; GFX9-NEXT: s_ashr_i32 s0, s11, 31 ; GFX9-NEXT: s_add_u32 s10, s10, s0 ; GFX9-NEXT: s_mov_b32 s1, s0 @@ -10722,138 +10722,138 @@ ; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[0:1] ; GFX9-NEXT: v_cvt_f32_u32_e32 v9, s10 ; GFX9-NEXT: v_cvt_f32_u32_e32 v10, s11 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s13, v3 ; GFX9-NEXT: v_mac_f32_e32 v9, s16, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX9-NEXT: v_rcp_f32_e32 v8, v9 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[2:3] -; GFX9-NEXT: v_mul_f32_e32 v3, s17, v8 -; GFX9-NEXT: v_mul_f32_e32 v4, s18, v3 -; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mac_f32_e32 v3, s19, v4 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[2:3] +; GFX9-NEXT: v_mul_f32_e32 v5, s17, v8 +; GFX9-NEXT: v_mul_f32_e32 v6, s18, v5 +; GFX9-NEXT: v_trunc_f32_e32 v6, v6 +; GFX9-NEXT: v_mac_f32_e32 v5, s19, v6 +; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX9-NEXT: s_sub_u32 s2, 0, s10 ; GFX9-NEXT: s_subb_u32 s3, 0, s11 -; GFX9-NEXT: v_mul_hi_u32 v7, s2, v3 -; GFX9-NEXT: v_mul_lo_u32 v8, s2, v4 -; GFX9-NEXT: v_mul_lo_u32 v9, s3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s2, v3 +; GFX9-NEXT: v_mul_hi_u32 v7, s2, v5 +; GFX9-NEXT: v_mul_lo_u32 v8, s2, v6 +; GFX9-NEXT: v_mul_lo_u32 v9, s3, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, s2, v5 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v9 -; GFX9-NEXT: v_mul_lo_u32 v8, v3, v7 -; GFX9-NEXT: v_mul_hi_u32 v9, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v10, v3, v7 -; GFX9-NEXT: v_mul_hi_u32 v11, v4, v7 -; GFX9-NEXT: v_mul_lo_u32 
v7, v4, v7 +; GFX9-NEXT: v_mul_lo_u32 v8, v5, v7 +; GFX9-NEXT: v_mul_hi_u32 v9, v5, v4 +; GFX9-NEXT: v_mul_hi_u32 v10, v5, v7 +; GFX9-NEXT: v_mul_hi_u32 v11, v6, v7 +; GFX9-NEXT: v_mul_lo_u32 v7, v6, v7 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v4, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v4, v2 +; GFX9-NEXT: v_mul_lo_u32 v10, v6, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v6, v4 ; GFX9-NEXT: s_ashr_i32 s12, s7, 31 ; GFX9-NEXT: s_mov_b32 s13, s12 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v9, v2, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v5, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v4, v7, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v8, s2, v3 -; GFX9-NEXT: v_mul_hi_u32 v9, s2, v2 -; GFX9-NEXT: v_mul_lo_u32 v10, s3, v2 -; GFX9-NEXT: v_mul_lo_u32 v11, s2, v2 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 +; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v5, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v8, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v5, vcc, v6, v7, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v8, s2, v5 +; GFX9-NEXT: v_mul_hi_u32 v9, s2, v4 +; GFX9-NEXT: v_mul_lo_u32 v10, s3, v4 +; GFX9-NEXT: v_mul_lo_u32 v11, s2, v4 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v7 ; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 ; GFX9-NEXT: v_add_u32_e32 v8, v8, v10 -; GFX9-NEXT: v_mul_lo_u32 v12, v2, v8 -; GFX9-NEXT: v_mul_hi_u32 v13, v2, v11 -; GFX9-NEXT: v_mul_hi_u32 v14, v2, v8 -; GFX9-NEXT: v_mul_hi_u32 v10, v3, v11 -; GFX9-NEXT: v_mul_lo_u32 v11, v3, v11 +; GFX9-NEXT: v_mul_lo_u32 v12, v4, v8 +; GFX9-NEXT: v_mul_hi_u32 v13, v4, v11 +; GFX9-NEXT: v_mul_hi_u32 v14, v4, v8 +; GFX9-NEXT: v_mul_hi_u32 v10, v5, v11 +; GFX9-NEXT: v_mul_lo_u32 v11, v5, v11 ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 -; GFX9-NEXT: v_mul_hi_u32 v9, v3, v8 +; GFX9-NEXT: v_mul_hi_u32 v9, v5, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v14, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v3, v8 +; GFX9-NEXT: v_mul_lo_u32 v5, v5, v8 ; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v10, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v5, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1] +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v10, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v8, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v6, vcc, v6, v8, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s6, s12 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9-NEXT: s_addc_u32 s1, s7, s12 ; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s6, v3 -; GFX9-NEXT: v_mul_hi_u32 v7, s6, v2 -; GFX9-NEXT: v_mul_hi_u32 v9, s6, v3 -; GFX9-NEXT: v_mul_hi_u32 v10, s7, v3 -; GFX9-NEXT: v_mul_lo_u32 v3, s7, v3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, s6, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, s6, v4 +; GFX9-NEXT: v_mul_hi_u32 v9, s6, v5 +; GFX9-NEXT: v_mul_hi_u32 v10, s7, v5 +; GFX9-NEXT: 
v_mul_lo_u32 v5, s7, v5 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v9, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, s7, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, s7, v2 -; GFX9-NEXT: v_xor_b32_e32 v0, s8, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v2, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v10, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s10, v3 -; GFX9-NEXT: v_mul_hi_u32 v4, s10, v2 -; GFX9-NEXT: v_mul_lo_u32 v5, s11, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, s10, v2 +; GFX9-NEXT: v_mul_lo_u32 v9, s7, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, s7, v4 +; GFX9-NEXT: v_xor_b32_e32 v2, s8, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, s8, v3 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v10, v0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, s10, v1 +; GFX9-NEXT: v_mul_hi_u32 v6, s10, v4 +; GFX9-NEXT: v_mul_lo_u32 v7, s11, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, s10, v4 ; GFX9-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s8, v0 -; GFX9-NEXT: v_sub_co_u32_e64 v2, s[0:1], s6, v2 -; GFX9-NEXT: v_sub_u32_e32 v4, s7, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc -; GFX9-NEXT: v_subb_co_u32_e64 v4, vcc, v4, v5, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e64 v7, s[2:3], s10, v2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v8, vcc, 0, v4, s[2:3] +; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s8, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v8, vcc +; GFX9-NEXT: v_add_u32_e32 v3, v6, v5 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v7 +; GFX9-NEXT: v_sub_co_u32_e64 v4, s[0:1], s6, v4 +; GFX9-NEXT: v_sub_u32_e32 v5, s7, v3 +; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: v_subb_co_u32_e64 v5, vcc, v5, v6, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e64 v7, s[2:3], s10, v4 +; GFX9-NEXT: v_subbrev_co_u32_e64 v8, vcc, 0, v5, s[2:3] ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc -; GFX9-NEXT: v_subb_co_u32_e64 v4, vcc, v4, v5, s[2:3] -; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s10, v7 -; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e64 v5, vcc, v5, v6, s[2:3] +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s10, v7 +; GFX9-NEXT: v_subbrev_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, s7 ; GFX9-NEXT: v_subb_co_u32_e64 v3, vcc, v8, v3, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v5, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, 
v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, s12, v2 -; GFX9-NEXT: v_xor_b32_e32 v3, s12, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s12, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_xor_b32_e32 v4, s12, v4 +; GFX9-NEXT: v_xor_b32_e32 v5, s12, v3 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s12, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v5, v6, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[4:5] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y %r = srem <2 x i64> %x, %shl.y Index: llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -327,25 +327,25 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8_x2_extra_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 -; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b64 s[12:13], s[10:11] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 ; SI-NEXT: s_mov_b32 s16, 0xff00 ; SI-NEXT: s_movk_i32 s17, 0xff -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -361,9 +361,9 @@ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x2_extra_use: Index: llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -906,20 +906,20 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_movk_i32 s0, 0xff -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v4 @@ -929,7 +929,7 @@ ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, s0, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5 @@ -942,25 +942,25 @@ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v5, 9 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_movk_i32 s0, 0x900 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v4 @@ -968,7 +968,7 @@ ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v4 ; VI-NEXT: v_add_u16_e32 v8, 9, v4 ; VI-NEXT: v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -979,7 +979,7 @@ ; VI-NEXT: v_add_u16_e32 v0, s0, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses: Index: llvm/test/CodeGen/AMDGPU/frem.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/frem.ll +++ llvm/test/CodeGen/AMDGPU/frem.ll @@ -1002,20 +1002,20 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; SI-LABEL: fast_frem_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; 
SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s4, s8 +; SI-NEXT: s_mov_b32 s5, s9 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] @@ -1029,7 +1029,7 @@ ; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 ; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_mov_b32 s0, s10 +; SI-NEXT: s_mov_b32 s0, s6 ; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8 ; SI-NEXT: v_not_b32_e32 v6, v6 ; SI-NEXT: v_and_b32_e32 v6, v4, v6 @@ -1043,7 +1043,7 @@ ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc ; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: fast_frem_f64: @@ -1160,20 +1160,20 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; SI-LABEL: unsafe_frem_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s4, s8 +; SI-NEXT: s_mov_b32 s5, s9 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] @@ -1187,7 +1187,7 @@ ; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 ; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_mov_b32 s0, s10 +; SI-NEXT: s_mov_b32 s0, s6 ; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8 ; SI-NEXT: v_not_b32_e32 v6, v6 ; SI-NEXT: v_and_b32_e32 v6, v4, v6 @@ -1201,7 +1201,7 @@ ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc ; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: unsafe_frem_f64: Index: llvm/test/CodeGen/AMDGPU/greedy-global-heuristic.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/greedy-global-heuristic.mir @@ -0,0 +1,250 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=greedy -verify-machineinstrs -o - %s |FileCheck %s + +# Testcase is limited to 24 VGPRs. Only a maximum of 6 vreg_128s can +# be allocated at the same time. + +# This testcase is intended to stress the heuristic in +# RAGreedy::enqueue to switch from local to global. If an interval is +# in one basic block, the usual preference is to allocate registers in +# instruction order. If the estimated live range length is more than +# twice the number of registers in the class, the global heuristic is +# used, which increases the priority of the longest live ranges. By +# accounting for the number of reserved registers in vreg_128, the +# change to the heuristic ends up avoiding a spill of %0. + +--- | + + define void @use_global_assign() #0 { + entry: + unreachable + } + + attributes #0 = { "amdgpu-waves-per-eu"="10,10" } + +... +--- +name: use_global_assign +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_128, preferred-register: '%0' } + - { id: 1, class: vreg_128, preferred-register: '%0' } + - { id: 2, class: vreg_128, preferred-register: '%0' } + - { id: 3, class: vreg_128, preferred-register: '%0' } + - { id: 4, class: vreg_128, preferred-register: '%0' } + - { id: 5, class: vreg_128, preferred-register: '%0' } + - { id: 6, class: vreg_128, preferred-register: '%0' } + - { id: 7, class: vreg_128, preferred-register: '%0' } + - { id: 8, class: vreg_128, preferred-register: '%0' } + - { id: 9, class: vreg_128, preferred-register: '%0' } + - { id: 10, class: vreg_128, preferred-register: '%0' } + - { id: 11, class: vreg_128, preferred-register: '%0' } + - { id: 12, class: vreg_128, preferred-register: '%0' } + - { id: 13, class: vreg_128, preferred-register: '%0' } + - { id: 14, class: vreg_128, preferred-register: '%0' } + - { id: 15, class: vreg_128, preferred-register: '%0' } + +machineFunctionInfo: + waveLimiter: true + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' +body: | + ; CHECK-LABEL: name: use_global_assign + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: S_NOP 0, implicit-def %0 + ; CHECK: S_NOP 0, implicit-def %18 + ; CHECK: SI_SPILL_V128_SAVE %18, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) + ; CHECK: S_NOP 0, implicit-def %35 + ; CHECK: S_NOP 0, implicit-def %27 + ; CHECK: S_NOP 0, implicit-def %29 + ; CHECK: S_NOP 0, implicit-def %31 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: S_NOP 0, implicit %31 + ; CHECK: S_NOP 0, implicit %29 + ; CHECK: S_NOP 0, implicit %27 + ; CHECK: S_NOP 0, implicit %35 + ; CHECK: SI_SPILL_V128_SAVE %35, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5) + ; CHECK: [[SI_SPILL_V128_RESTORE:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + ; CHECK: S_NOP 0, implicit [[SI_SPILL_V128_RESTORE]] + ; CHECK: S_NOP 0, implicit %0 + ; CHECK: S_NOP 0, implicit-def %10 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0, implicit %0 + ; CHECK: S_NOP 0, implicit-def %33 + ; CHECK: SI_SPILL_V128_SAVE %33, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5) + ; CHECK: S_NOP 0, implicit %10 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: 
S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0, implicit-def %40 + ; CHECK: SI_SPILL_V128_SAVE %40, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5) + ; CHECK: S_NOP 0, implicit %33 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0, implicit-def %42 + ; CHECK: SI_SPILL_V128_SAVE %42, %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5) + ; CHECK: S_NOP 0, implicit %40 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: S_NOP 0 + ; CHECK: [[COPY:%[0-9]+]]:vreg_128 = COPY %31 + ; CHECK: S_NOP 0, implicit %31 + ; CHECK: [[COPY1:%[0-9]+]]:vreg_128 = COPY %29 + ; CHECK: S_NOP 0, implicit %29 + ; CHECK: [[COPY2:%[0-9]+]]:vreg_128 = COPY %27 + ; CHECK: S_NOP 0, implicit %27 + ; CHECK: [[SI_SPILL_V128_RESTORE1:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5) + ; CHECK: [[COPY3:%[0-9]+]]:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]] + ; CHECK: S_NOP 0, implicit [[SI_SPILL_V128_RESTORE1]] + ; CHECK: [[SI_SPILL_V128_RESTORE2:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + ; CHECK: S_NOP 0, implicit [[SI_SPILL_V128_RESTORE2]] + ; CHECK: S_NOP 0, implicit %0 + ; CHECK: [[SI_SPILL_V128_RESTORE3:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK: S_NOP 0, implicit [[SI_SPILL_V128_RESTORE3]] + ; CHECK: [[SI_SPILL_V128_RESTORE4:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) + ; CHECK: S_NOP 0, implicit [[SI_SPILL_V128_RESTORE4]] + ; CHECK: [[SI_SPILL_V128_RESTORE5:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5) + ; CHECK: S_NOP 0, implicit [[SI_SPILL_V128_RESTORE5]] + ; CHECK: bb.2: + ; CHECK: S_NOP 0, implicit %0 + ; CHECK: [[SI_SPILL_V128_RESTORE6:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + ; CHECK: S_NOP 0, implicit [[SI_SPILL_V128_RESTORE6]] + ; CHECK: S_NOP 0, implicit [[COPY3]] + ; CHECK: S_NOP 0, implicit [[COPY2]] + ; CHECK: S_NOP 0, implicit [[COPY1]] + ; CHECK: S_NOP 0, implicit [[COPY]] + bb.0: + S_NOP 0, implicit-def %0:vreg_128 + S_NOP 0, implicit-def %1:vreg_128 + S_NOP 0, implicit-def %2:vreg_128 + S_NOP 0, implicit-def %3:vreg_128 + S_NOP 0, implicit-def %4:vreg_128 + S_NOP 0, implicit-def %5:vreg_128 + + bb.1: + S_NOP 0, implicit %5 + S_NOP 0, implicit %4 + S_NOP 0, implicit %3 + S_NOP 0, implicit %2 + S_NOP 0, implicit %1 + S_NOP 0, implicit %0 + S_NOP 0, implicit-def %10:vreg_128 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0, implicit %0 + S_NOP 0, implicit-def %11:vreg_128 + S_NOP 0, implicit %10 + S_NOP 0 + S_NOP 0 + 
S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0, implicit-def %12:vreg_128 + S_NOP 0, implicit %11 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0, implicit-def %13:vreg_128 + S_NOP 0, implicit %12 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0, implicit %5 + S_NOP 0, implicit %4 + S_NOP 0, implicit %3 + S_NOP 0, implicit %2 + S_NOP 0, implicit %1 + S_NOP 0, implicit %0 + S_NOP 0, implicit %11 + S_NOP 0, implicit %12 + S_NOP 0, implicit %13 + + bb.2: + S_NOP 0, implicit %0 + S_NOP 0, implicit %1 + S_NOP 0, implicit %2 + S_NOP 0, implicit %3 + S_NOP 0, implicit %4 + S_NOP 0, implicit %5 + +... Index: llvm/test/CodeGen/AMDGPU/half.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/half.ll +++ llvm/test/CodeGen/AMDGPU/half.ll @@ -1327,89 +1327,89 @@ ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_add_u32 s2, s2, 16 ; SI-NEXT: s_addc_u32 s3, s3, 0 -; SI-NEXT: v_mov_b32_e32 v5, s3 -; SI-NEXT: v_mov_b32_e32 v4, s2 -; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; SI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; SI-NEXT: flat_load_dwordx4 v[0:3], v[2:3] ; SI-NEXT: s_add_u32 s2, s0, 48 ; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_mov_b32_e32 v15, s3 -; SI-NEXT: v_mov_b32_e32 v14, s2 +; SI-NEXT: v_mov_b32_e32 v14, s3 +; SI-NEXT: v_mov_b32_e32 v13, s2 ; SI-NEXT: s_add_u32 s2, s0, 32 ; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_mov_b32_e32 v17, s3 -; SI-NEXT: v_mov_b32_e32 v16, s2 +; SI-NEXT: v_mov_b32_e32 v16, s3 +; SI-NEXT: v_mov_b32_e32 v15, s2 ; SI-NEXT: s_add_u32 s2, s0, 16 ; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_mov_b32_e32 v19, s3 -; SI-NEXT: v_mov_b32_e32 v18, s2 +; SI-NEXT: v_mov_b32_e32 v18, s3 +; SI-NEXT: v_mov_b32_e32 v17, s2 ; SI-NEXT: s_add_u32 s2, s0, 0x70 ; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_mov_b32_e32 v13, s1 -; SI-NEXT: v_mov_b32_e32 v12, s0 +; SI-NEXT: v_mov_b32_e32 v12, s1 +; SI-NEXT: v_mov_b32_e32 v11, s0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 -; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; SI-NEXT: v_mov_b32_e32 v15, s3 -; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v14, s2 -; SI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: 
flat_store_dwordx4 v[18:19], v[0:3] -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 +; SI-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: flat_store_dwordx4 v[13:14], v[7:10] +; SI-NEXT: v_mov_b32_e32 v14, s3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; SI-NEXT: v_mov_b32_e32 v13, s2 +; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; SI-NEXT: s_add_u32 s2, s0, 0x60 +; SI-NEXT: s_addc_u32 s3, s1, 0 +; SI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] +; SI-NEXT: v_mov_b32_e32 v16, s3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_mov_b32_e32 v15, s2 +; SI-NEXT: s_add_u32 s2, s0, 0x50 +; SI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] ; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v2 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 -; SI-NEXT: v_mov_b32_e32 v17, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v4 +; SI-NEXT: s_add_u32 s0, s0, 64 +; SI-NEXT: s_addc_u32 s1, s1, 0 +; SI-NEXT: flat_store_dwordx4 v[11:12], v[0:3] ; SI-NEXT: v_cvt_f32_f16_e32 v12, v5 -; SI-NEXT: v_mov_b32_e32 v16, s2 -; SI-NEXT: s_add_u32 s2, s0, 0x50 -; SI-NEXT: s_addc_u32 s3, s1, 0 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v19 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v20 ; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 ; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; SI-NEXT: s_add_u32 s0, s0, 64 -; SI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] -; SI-NEXT: s_addc_u32 s1, s1, 0 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 -; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 -; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; SI-NEXT: v_mov_b32_e32 v19, s3 +; SI-NEXT: flat_store_dwordx4 v[13:14], v[0:3] +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v21 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v12 +; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v19 +; SI-NEXT: v_mov_b32_e32 v18, s3 ; SI-NEXT: v_mov_b32_e32 v13, s1 -; SI-NEXT: v_mov_b32_e32 v18, s2 +; SI-NEXT: v_mov_b32_e32 v17, s2 ; SI-NEXT: v_mov_b32_e32 v12, s0 -; SI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; SI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; SI-NEXT: flat_store_dwordx4 v[15:16], v[8:11] +; SI-NEXT: flat_store_dwordx4 v[17:18], v[0:3] ; SI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; SI-NEXT: s_endpgm ; @@ -1421,82 +1421,82 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; 
VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; VI-NEXT: flat_load_dwordx4 v[0:3], v[2:3] ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v15, s3 -; VI-NEXT: v_mov_b32_e32 v14, s2 +; VI-NEXT: v_mov_b32_e32 v14, s3 +; VI-NEXT: v_mov_b32_e32 v13, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v17, s3 -; VI-NEXT: v_mov_b32_e32 v16, s2 +; VI-NEXT: v_mov_b32_e32 v16, s3 +; VI-NEXT: v_mov_b32_e32 v15, s2 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v19, s3 -; VI-NEXT: v_mov_b32_e32 v18, s2 +; VI-NEXT: v_mov_b32_e32 v18, s3 +; VI-NEXT: v_mov_b32_e32 v17, s2 ; VI-NEXT: s_add_u32 s2, s0, 0x70 -; VI-NEXT: v_mov_b32_e32 v13, s1 +; VI-NEXT: v_mov_b32_e32 v12, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v12, s0 +; VI-NEXT: v_mov_b32_e32 v11, s0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 +; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8 +; VI-NEXT: flat_store_dwordx4 v[13:14], v[7:10] +; VI-NEXT: v_mov_b32_e32 v14, s3 +; VI-NEXT: v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_f16_e32 v8, v3 -; VI-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v10, v0 +; VI-NEXT: v_mov_b32_e32 v13, s2 ; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 -; VI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; VI-NEXT: v_mov_b32_e32 v15, s3 -; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 -; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_mov_b32_e32 v14, s2 -; VI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 ; VI-NEXT: s_add_u32 s2, s0, 0x60 -; VI-NEXT: v_cvt_f32_f16_e32 v8, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_cvt_f32_f16_e32 v10, v4 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v17, s3 -; VI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; VI-NEXT: v_cvt_f32_f16_sdwa v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v8 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v9 -; VI-NEXT: v_cvt_f32_f16_e32 v4, v7 -; VI-NEXT: v_cvt_f32_f16_sdwa v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v8, v5 -; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; VI-NEXT: v_cvt_f32_f16_sdwa v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 -; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; VI-NEXT: v_cvt_f32_f16_sdwa v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_mov_b32_e32 v16, s2 +; VI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] +; 
VI-NEXT: v_mov_b32_e32 v16, s3 +; VI-NEXT: v_cvt_f32_f16_e32 v6, v5 +; VI-NEXT: v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v8, v4 +; VI-NEXT: v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; VI-NEXT: v_mov_b32_e32 v15, s2 ; VI-NEXT: s_add_u32 s2, s0, 0x50 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v8 -; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v7 -; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v6 +; VI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] +; VI-NEXT: v_cvt_f32_f16_sdwa v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v8 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v9 +; VI-NEXT: v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v3 ; VI-NEXT: s_add_u32 s0, s0, 64 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v12 +; VI-NEXT: flat_store_dwordx4 v[11:12], v[4:7] +; VI-NEXT: v_cvt_f32_f16_sdwa v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v8 +; VI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v0 +; VI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10 +; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 +; VI-NEXT: flat_store_dwordx4 v[13:14], v[3:6] ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v18 -; VI-NEXT: v_mov_b32_e32 v21, s3 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 +; VI-NEXT: v_mov_b32_e32 v20, s3 ; VI-NEXT: v_mov_b32_e32 v13, s1 -; VI-NEXT: v_mov_b32_e32 v20, s2 +; VI-NEXT: v_mov_b32_e32 v19, s2 ; VI-NEXT: v_mov_b32_e32 v12, s0 -; VI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; VI-NEXT: flat_store_dwordx4 v[20:21], v[0:3] -; VI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; VI-NEXT: flat_store_dwordx4 v[15:16], v[8:11] +; VI-NEXT: flat_store_dwordx4 v[19:20], v[4:7] +; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; VI-NEXT: s_endpgm %val = load <16 x half>, <16 x half> addrspace(1)* %in %cvt = fpext <16 x half> %val to <16 x double> Index: llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -403,65 +403,65 @@ ; SI-NEXT: s_mov_b32 s20, s22 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_u32 s2, s7, 0xb0014 -; SI-NEXT: s_add_i32 s26, s2, s23 -; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s26 -; SI-NEXT: s_brev_b32 s27, 1 -; SI-NEXT: s_andn2_b64 s[24:25], s[6:7], s[2:3] -; SI-NEXT: s_and_b32 s2, s7, s27 +; SI-NEXT: s_add_i32 s25, s2, s23 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s25 +; SI-NEXT: s_brev_b32 s24, 1 +; SI-NEXT: s_andn2_b64 s[26:27], s[6:7], s[2:3] +; SI-NEXT: s_and_b32 s2, s7, s24 ; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_mov_b32_e32 v0, s25 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s26, 0 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s25, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s26, 51 +; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s25, 51 ; SI-NEXT: 
v_cndmask_b32_e64 v1, v0, v1, s[2:3] -; SI-NEXT: v_mov_b32_e32 v0, s24 +; SI-NEXT: v_mov_b32_e32 v0, s26 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] ; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] ; SI-NEXT: s_bfe_u32 s2, s5, 0xb0014 -; SI-NEXT: s_add_i32 s25, s2, s23 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 -; SI-NEXT: s_brev_b32 s24, -2 -; SI-NEXT: v_mov_b32_e32 v18, 0x3ff00000 ; SI-NEXT: v_mov_b32_e32 v4, s7 -; SI-NEXT: v_bfi_b32 v4, s24, v18, v4 -; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s25 +; SI-NEXT: s_add_i32 s7, s2, s23 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 +; SI-NEXT: s_brev_b32 s6, -2 +; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 +; SI-NEXT: v_bfi_b32 v4, s6, v8, v4 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s7 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v2, 0 -; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[2:3] -; SI-NEXT: s_and_b32 s2, s5, s27 +; SI-NEXT: s_andn2_b64 s[26:27], s[4:5], s[2:3] +; SI-NEXT: s_and_b32 s2, s5, s24 ; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] ; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s25, 0 +; SI-NEXT: v_mov_b32_e32 v0, s27 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s7, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s25, 51 +; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s7, 51 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] -; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v0, s26 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[2:3] ; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1] ; SI-NEXT: s_bfe_u32 s2, s11, 0xb0014 ; SI-NEXT: v_mov_b32_e32 v6, s5 -; SI-NEXT: s_add_i32 s6, s2, s23 +; SI-NEXT: s_add_i32 s7, s2, s23 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; SI-NEXT: v_bfi_b32 v6, s24, v18, v6 -; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6 +; SI-NEXT: v_bfi_b32 v6, s6, v8, v6 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s7 ; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc ; SI-NEXT: v_mov_b32_e32 v4, 0 ; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[2:3] -; SI-NEXT: s_and_b32 s2, s11, s27 +; SI-NEXT: s_and_b32 s2, s11, s24 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] ; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: v_mov_b32_e32 v4, s5 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s6, 0 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s7, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; SI-NEXT: v_mov_b32_e32 v5, s11 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s6, 51 +; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s7, 51 ; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[2:3] ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc @@ -469,131 +469,131 @@ ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[2:3] ; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5] ; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014 -; SI-NEXT: v_mov_b32_e32 v8, s11 -; SI-NEXT: s_add_i32 s6, s2, s23 +; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: s_add_i32 s7, s2, s23 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; SI-NEXT: v_bfi_b32 v8, s24, v18, v8 -; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6 -; SI-NEXT: v_cndmask_b32_e32 v7, 0, v8, vcc +; SI-NEXT: v_bfi_b32 v9, s6, v8, v9 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s7 +; SI-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc ; SI-NEXT: v_mov_b32_e32 v6, 0 ; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[2:3] -; SI-NEXT: s_and_b32 s2, s9, s27 +; SI-NEXT: s_and_b32 s2, s9, s24 ; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] ; SI-NEXT: 
v_mov_b32_e32 v5, s2 ; SI-NEXT: v_mov_b32_e32 v4, s5 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s6, 0 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s7, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s6, 51 +; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s7, 51 ; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[2:3] ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; SI-NEXT: v_mov_b32_e32 v8, s8 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[2:3] -; SI-NEXT: v_add_f64 v[8:9], s[8:9], -v[4:5] +; SI-NEXT: v_mov_b32_e32 v9, s8 +; SI-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[2:3] +; SI-NEXT: v_add_f64 v[9:10], s[8:9], -v[4:5] ; SI-NEXT: s_bfe_u32 s2, s15, 0xb0014 -; SI-NEXT: v_mov_b32_e32 v10, s9 -; SI-NEXT: s_add_i32 s6, s2, s23 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[8:9]|, 0.5 -; SI-NEXT: v_bfi_b32 v10, s24, v18, v10 -; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6 -; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc -; SI-NEXT: v_mov_b32_e32 v8, 0 +; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: s_add_i32 s7, s2, s23 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[9:10]|, 0.5 +; SI-NEXT: v_bfi_b32 v11, s6, v8, v11 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s7 +; SI-NEXT: v_cndmask_b32_e32 v10, 0, v11, vcc +; SI-NEXT: v_mov_b32_e32 v9, 0 ; SI-NEXT: s_andn2_b64 s[4:5], s[14:15], s[2:3] -; SI-NEXT: s_and_b32 s2, s15, s27 -; SI-NEXT: v_add_f64 v[4:5], v[4:5], v[8:9] -; SI-NEXT: v_mov_b32_e32 v9, s2 -; SI-NEXT: v_mov_b32_e32 v8, s5 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s6, 0 -; SI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; SI-NEXT: v_mov_b32_e32 v9, s15 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s6, 51 -; SI-NEXT: v_cndmask_b32_e64 v13, v8, v9, s[2:3] -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc -; SI-NEXT: v_mov_b32_e32 v9, s14 -; SI-NEXT: v_cndmask_b32_e64 v12, v8, v9, s[2:3] +; SI-NEXT: s_and_b32 s2, s15, s24 +; SI-NEXT: v_add_f64 v[4:5], v[4:5], v[9:10] +; SI-NEXT: v_mov_b32_e32 v10, s2 +; SI-NEXT: v_mov_b32_e32 v9, s5 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s7, 0 +; SI-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc +; SI-NEXT: v_mov_b32_e32 v10, s15 +; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s7, 51 +; SI-NEXT: v_cndmask_b32_e64 v14, v9, v10, s[2:3] +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: v_cndmask_b32_e64 v9, v9, 0, vcc +; SI-NEXT: v_mov_b32_e32 v10, s14 +; SI-NEXT: v_cndmask_b32_e64 v13, v9, v10, s[2:3] ; SI-NEXT: s_bfe_u32 s2, s13, 0xb0014 -; SI-NEXT: s_add_i32 s8, s2, s23 -; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s8 +; SI-NEXT: s_add_i32 s7, s2, s23 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s7 ; SI-NEXT: s_andn2_b64 s[4:5], s[12:13], s[2:3] ; SI-NEXT: s_bfe_u32 s2, s19, 0xb0014 -; SI-NEXT: s_add_i32 s10, s2, s23 -; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s10 -; SI-NEXT: v_mov_b32_e32 v8, s15 -; SI-NEXT: s_andn2_b64 s[6:7], s[18:19], s[2:3] -; SI-NEXT: s_and_b32 s2, s19, s27 -; SI-NEXT: v_bfi_b32 v19, s24, v18, v8 -; SI-NEXT: v_mov_b32_e32 v9, s2 -; SI-NEXT: v_mov_b32_e32 v8, s7 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s10, 0 -; SI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; SI-NEXT: v_mov_b32_e32 v9, s19 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s10, 51 -; SI-NEXT: v_cndmask_b32_e64 v9, v8, v9, s[2:3] -; SI-NEXT: v_mov_b32_e32 v8, s6 -; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[2:3] +; SI-NEXT: s_add_i32 s11, s2, s23 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s11 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: s_andn2_b64 s[8:9], s[18:19], s[2:3] +; SI-NEXT: s_and_b32 s2, s19, s24 +; SI-NEXT: 
v_bfi_b32 v19, s6, v8, v9 +; SI-NEXT: v_mov_b32_e32 v10, s2 +; SI-NEXT: v_mov_b32_e32 v9, s9 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s11, 0 +; SI-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc +; SI-NEXT: v_mov_b32_e32 v10, s19 +; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s11, 51 +; SI-NEXT: v_cndmask_b32_e64 v10, v9, v10, s[2:3] +; SI-NEXT: v_mov_b32_e32 v9, s8 +; SI-NEXT: v_cndmask_b32_e64 v9, v9, 0, vcc +; SI-NEXT: v_mov_b32_e32 v11, s18 +; SI-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[2:3] ; SI-NEXT: s_bfe_u32 s2, s17, 0xb0014 -; SI-NEXT: s_add_i32 s10, s2, s23 -; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s10 -; SI-NEXT: s_andn2_b64 s[6:7], s[16:17], s[2:3] -; SI-NEXT: s_and_b32 s2, s17, s27 -; SI-NEXT: v_mov_b32_e32 v11, s2 -; SI-NEXT: v_mov_b32_e32 v10, s7 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s10, 0 -; SI-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; SI-NEXT: v_mov_b32_e32 v11, s17 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s10, 51 -; SI-NEXT: v_cndmask_b32_e64 v15, v10, v11, s[2:3] -; SI-NEXT: v_mov_b32_e32 v10, s6 -; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc -; SI-NEXT: v_mov_b32_e32 v11, s16 -; SI-NEXT: v_cndmask_b32_e64 v14, v10, v11, s[2:3] -; SI-NEXT: v_add_f64 v[10:11], s[16:17], -v[14:15] -; SI-NEXT: v_mov_b32_e32 v17, s19 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5 -; SI-NEXT: v_add_f64 v[10:11], s[18:19], -v[8:9] -; SI-NEXT: v_mov_b32_e32 v16, s17 -; SI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[10:11]|, 0.5 -; SI-NEXT: v_bfi_b32 v17, s24, v18, v17 -; SI-NEXT: v_cndmask_b32_e64 v11, 0, v17, s[2:3] -; SI-NEXT: v_mov_b32_e32 v10, 0 -; SI-NEXT: v_bfi_b32 v16, s24, v18, v16 -; SI-NEXT: v_add_f64 v[10:11], v[8:9], v[10:11] -; SI-NEXT: v_cndmask_b32_e32 v9, 0, v16, vcc -; SI-NEXT: v_mov_b32_e32 v8, 0 -; SI-NEXT: s_and_b32 s9, s13, s27 -; SI-NEXT: v_add_f64 v[8:9], v[14:15], v[8:9] -; SI-NEXT: v_mov_b32_e32 v14, s5 -; SI-NEXT: v_mov_b32_e32 v15, s9 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s8, 0 -; SI-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; SI-NEXT: s_add_i32 s11, s2, s23 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s11 +; SI-NEXT: s_andn2_b64 s[8:9], s[16:17], s[2:3] +; SI-NEXT: s_and_b32 s2, s17, s24 +; SI-NEXT: v_mov_b32_e32 v12, s2 +; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s11, 0 +; SI-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc +; SI-NEXT: v_mov_b32_e32 v12, s17 +; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s11, 51 +; SI-NEXT: v_cndmask_b32_e64 v16, v11, v12, s[2:3] +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_cndmask_b32_e64 v11, v11, 0, vcc +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_cndmask_b32_e64 v15, v11, v12, s[2:3] +; SI-NEXT: v_add_f64 v[11:12], s[16:17], -v[15:16] +; SI-NEXT: v_mov_b32_e32 v18, s19 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[11:12]|, 0.5 +; SI-NEXT: v_add_f64 v[11:12], s[18:19], -v[9:10] +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[11:12]|, 0.5 +; SI-NEXT: v_bfi_b32 v18, s6, v8, v18 +; SI-NEXT: v_cndmask_b32_e64 v12, 0, v18, s[2:3] +; SI-NEXT: v_mov_b32_e32 v11, 0 +; SI-NEXT: v_bfi_b32 v17, s6, v8, v17 +; SI-NEXT: v_add_f64 v[11:12], v[9:10], v[11:12] +; SI-NEXT: v_cndmask_b32_e32 v10, 0, v17, vcc +; SI-NEXT: v_mov_b32_e32 v9, 0 +; SI-NEXT: s_and_b32 s10, s13, s24 +; SI-NEXT: v_add_f64 v[9:10], v[15:16], v[9:10] +; SI-NEXT: v_mov_b32_e32 v15, s5 +; SI-NEXT: v_mov_b32_e32 v16, s10 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s7, 0 +; SI-NEXT: v_cndmask_b32_e32 v15, v15, v16, vcc +; SI-NEXT: v_mov_b32_e32 v16, s13 +; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s7, 51 +; SI-NEXT: v_cndmask_b32_e64 v18, v15, v16, s[2:3] +; SI-NEXT: v_mov_b32_e32 v15, s4 +; SI-NEXT: 
v_cndmask_b32_e64 v15, v15, 0, vcc +; SI-NEXT: v_mov_b32_e32 v16, s12 +; SI-NEXT: v_cndmask_b32_e64 v17, v15, v16, s[2:3] ; SI-NEXT: v_mov_b32_e32 v15, s13 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s8, 51 -; SI-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[2:3] -; SI-NEXT: v_mov_b32_e32 v14, s4 -; SI-NEXT: v_cndmask_b32_e64 v14, v14, 0, vcc -; SI-NEXT: v_mov_b32_e32 v15, s12 -; SI-NEXT: v_cndmask_b32_e64 v16, v14, v15, s[2:3] -; SI-NEXT: v_mov_b32_e32 v14, s13 -; SI-NEXT: v_bfi_b32 v18, s24, v18, v14 -; SI-NEXT: v_add_f64 v[14:15], s[12:13], -v[16:17] +; SI-NEXT: v_bfi_b32 v8, s6, v8, v15 +; SI-NEXT: v_add_f64 v[15:16], s[12:13], -v[17:18] ; SI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 -; SI-NEXT: v_add_f64 v[14:15], s[14:15], -v[12:13] +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[15:16]|, 0.5 +; SI-NEXT: v_add_f64 v[15:16], s[14:15], -v[13:14] ; SI-NEXT: s_mov_b32 s23, 0xf000 -; SI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 -; SI-NEXT: v_mov_b32_e32 v14, 0 -; SI-NEXT: v_cndmask_b32_e64 v15, 0, v19, s[0:1] -; SI-NEXT: v_add_f64 v[14:15], v[12:13], v[14:15] -; SI-NEXT: v_cndmask_b32_e32 v13, 0, v18, vcc -; SI-NEXT: v_mov_b32_e32 v12, 0 -; SI-NEXT: v_add_f64 v[12:13], v[16:17], v[12:13] +; SI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[15:16]|, 0.5 +; SI-NEXT: v_mov_b32_e32 v15, 0 +; SI-NEXT: v_cndmask_b32_e64 v16, 0, v19, s[0:1] +; SI-NEXT: v_add_f64 v[15:16], v[13:14], v[15:16] +; SI-NEXT: v_cndmask_b32_e32 v14, 0, v8, vcc +; SI-NEXT: v_mov_b32_e32 v13, 0 +; SI-NEXT: v_add_f64 v[13:14], v[17:18], v[13:14] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:48 -; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:32 +; SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[20:23], 0 offset:48 +; SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[20:23], 0 offset:32 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 ; SI-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/load-constant-i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -1669,52 +1669,52 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 +; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, 0xffff ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s1, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s0, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, s12 +; GCN-NOHSA-SI-NEXT: 
s_lshr_b32 s13, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s10, 16 ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s12 +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s12 ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s12 ; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, s12 +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s12 +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, s12 +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32: @@ -1887,51 +1887,51 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 +; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s12, s1, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s13, s0, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s1, s1 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s0, s0 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s14, s3, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s2, 
16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s3, s3 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s2, s2 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s16, s5, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s17, s4, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s12, s5, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s13, s4, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s7, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s19, s6, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s14, s7, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s6, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s6, s6 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s16, s9, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s17, s8, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s9, s9 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s11, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s19, s10, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s11, s11 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s8, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s14 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s12 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i32: @@ -2697,87 +2697,87 @@ ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[4:19], s[6:7], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s16, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s17, s1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 
s[0:15], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s35, s19, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s36, s18, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s19, s19 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s18, s18 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s17, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s34, s16, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s17, s17 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s16, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s30, s15, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s14, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s35, s15, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s36, s14, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s15, s15 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s28, s13, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s29, s12, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s13, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s34, s12, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s13, s13 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s30 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s26, s11, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s27, s10, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s30, s11, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s10, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s28 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s24, s9, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s25, s8, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s28, s9, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s29, s8, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s9, s9 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s26 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s22, s7, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s23, s6, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s30 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s26, s7, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s27, s6, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6 ; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s24 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s20, s5, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s21, s4, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s28 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s24, s5, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s25, s4, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s26 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s22, s3, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s23, s2, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s3, s3 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s2, s2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s24 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s20, s1, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s21, s0, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s1, s1 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s0, s0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s22 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_sextload_v32i16_to_v32i32: @@ -3062,18 +3062,27 @@ ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s37, 0xffff +; GCN-HSA-NEXT: s_mov_b32 s53, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GCN-HSA-NEXT: s_load_dwordx16 s[36:51], s[18:19], 0x10 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s20, s1, 16 +; GCN-HSA-NEXT: s_and_b32 s59, s40, s53 +; GCN-HSA-NEXT: s_and_b32 s60, s43, s53 +; GCN-HSA-NEXT: s_and_b32 s61, s42, s53 +; GCN-HSA-NEXT: s_and_b32 s62, s45, s53 +; GCN-HSA-NEXT: s_and_b32 s63, s44, s53 +; GCN-HSA-NEXT: s_and_b32 s64, s47, s53 +; GCN-HSA-NEXT: s_and_b32 s65, s46, s53 +; GCN-HSA-NEXT: s_and_b32 s66, s49, s53 +; GCN-HSA-NEXT: s_and_b32 s67, s48, s53 +; GCN-HSA-NEXT: s_and_b32 s68, s51, s53 ; GCN-HSA-NEXT: s_lshr_b32 s21, s0, 16 ; GCN-HSA-NEXT: s_lshr_b32 s22, s3, 16 ; GCN-HSA-NEXT: s_lshr_b32 s23, s2, 16 ; GCN-HSA-NEXT: s_lshr_b32 s24, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s25, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s26, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s27, s6, 16 ; 
GCN-HSA-NEXT: s_lshr_b32 s28, s9, 16 ; GCN-HSA-NEXT: s_lshr_b32 s29, s8, 16 ; GCN-HSA-NEXT: s_lshr_b32 s30, s11, 16 @@ -3081,196 +3090,188 @@ ; GCN-HSA-NEXT: s_lshr_b32 s33, s13, 16 ; GCN-HSA-NEXT: s_lshr_b32 s34, s12, 16 ; GCN-HSA-NEXT: s_lshr_b32 s35, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s36, s14, 16 -; GCN-HSA-NEXT: s_and_b32 s38, s1, s37 -; GCN-HSA-NEXT: s_and_b32 s39, s0, s37 -; GCN-HSA-NEXT: s_and_b32 s40, s3, s37 -; GCN-HSA-NEXT: s_and_b32 s41, s2, s37 -; GCN-HSA-NEXT: s_and_b32 s42, s5, s37 -; GCN-HSA-NEXT: s_and_b32 s43, s4, s37 -; GCN-HSA-NEXT: s_and_b32 s44, s7, s37 -; GCN-HSA-NEXT: s_and_b32 s45, s6, s37 -; GCN-HSA-NEXT: s_and_b32 s46, s9, s37 -; GCN-HSA-NEXT: s_and_b32 s47, s8, s37 -; GCN-HSA-NEXT: s_and_b32 s48, s11, s37 -; GCN-HSA-NEXT: s_and_b32 s49, s10, s37 -; GCN-HSA-NEXT: s_and_b32 s50, s13, s37 -; GCN-HSA-NEXT: s_and_b32 s51, s12, s37 -; GCN-HSA-NEXT: s_and_b32 s52, s15, s37 -; GCN-HSA-NEXT: s_and_b32 s53, s14, s37 -; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 -; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_and_b32 s57, s4, s37 -; GCN-HSA-NEXT: s_and_b32 s58, s7, s37 -; GCN-HSA-NEXT: s_and_b32 s59, s6, s37 -; GCN-HSA-NEXT: s_and_b32 s60, s9, s37 -; GCN-HSA-NEXT: s_and_b32 s61, s8, s37 -; GCN-HSA-NEXT: s_and_b32 s62, s11, s37 -; GCN-HSA-NEXT: s_and_b32 s63, s10, s37 -; GCN-HSA-NEXT: s_and_b32 s64, s13, s37 -; GCN-HSA-NEXT: s_and_b32 s65, s12, s37 -; GCN-HSA-NEXT: s_and_b32 s66, s15, s37 -; GCN-HSA-NEXT: s_and_b32 s54, s3, s37 -; GCN-HSA-NEXT: s_and_b32 s55, s2, s37 -; GCN-HSA-NEXT: s_and_b32 s56, s5, s37 -; GCN-HSA-NEXT: s_lshr_b32 s5, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s7, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s9, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s13, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s15, s15, 16 -; GCN-HSA-NEXT: s_and_b32 s18, s1, s37 -; GCN-HSA-NEXT: s_and_b32 s19, s0, s37 -; GCN-HSA-NEXT: s_and_b32 s37, s14, s37 -; GCN-HSA-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s67, s1, 16 -; GCN-HSA-NEXT: s_lshr_b32 s68, s0, 16 -; GCN-HSA-NEXT: s_lshr_b32 s3, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s65 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s64 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_lshr_b32 s52, s14, 16 
+; GCN-HSA-NEXT: s_and_b32 s18, s37, s53 +; GCN-HSA-NEXT: s_and_b32 s19, s36, s53 +; GCN-HSA-NEXT: s_and_b32 s56, s39, s53 +; GCN-HSA-NEXT: s_and_b32 s57, s38, s53 +; GCN-HSA-NEXT: s_and_b32 s58, s41, s53 +; GCN-HSA-NEXT: s_lshr_b32 s41, s41, 16 +; GCN-HSA-NEXT: s_lshr_b32 s43, s43, 16 +; GCN-HSA-NEXT: s_lshr_b32 s42, s42, 16 +; GCN-HSA-NEXT: s_lshr_b32 s45, s45, 16 +; GCN-HSA-NEXT: s_lshr_b32 s44, s44, 16 +; GCN-HSA-NEXT: s_lshr_b32 s47, s47, 16 +; GCN-HSA-NEXT: s_lshr_b32 s46, s46, 16 +; GCN-HSA-NEXT: s_lshr_b32 s49, s49, 16 +; GCN-HSA-NEXT: s_lshr_b32 s48, s48, 16 +; GCN-HSA-NEXT: s_lshr_b32 s51, s51, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s27, s6, 16 +; GCN-HSA-NEXT: s_and_b32 s1, s1, s53 +; GCN-HSA-NEXT: s_and_b32 s0, s0, s53 +; GCN-HSA-NEXT: s_and_b32 s3, s3, s53 +; GCN-HSA-NEXT: s_and_b32 s2, s2, s53 +; GCN-HSA-NEXT: s_and_b32 s5, s5, s53 +; GCN-HSA-NEXT: s_and_b32 s4, s4, s53 +; GCN-HSA-NEXT: s_and_b32 s54, s7, s53 +; GCN-HSA-NEXT: s_and_b32 s55, s6, s53 +; GCN-HSA-NEXT: s_and_b32 s9, s9, s53 +; GCN-HSA-NEXT: s_and_b32 s8, s8, s53 +; GCN-HSA-NEXT: s_and_b32 s11, s11, s53 +; GCN-HSA-NEXT: s_and_b32 s10, s10, s53 +; GCN-HSA-NEXT: s_and_b32 s13, s13, s53 +; GCN-HSA-NEXT: s_and_b32 s12, s12, s53 +; GCN-HSA-NEXT: s_and_b32 s15, s15, s53 +; GCN-HSA-NEXT: s_and_b32 s14, s14, s53 +; GCN-HSA-NEXT: s_and_b32 s53, s50, s53 +; GCN-HSA-NEXT: s_lshr_b32 s50, s50, 16 +; GCN-HSA-NEXT: s_lshr_b32 s37, s37, 16 +; GCN-HSA-NEXT: s_lshr_b32 s36, s36, 16 +; GCN-HSA-NEXT: s_lshr_b32 s39, s39, 16 +; GCN-HSA-NEXT: s_lshr_b32 s38, s38, 16 +; GCN-HSA-NEXT: s_lshr_b32 s40, s40, 16 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xe0 +; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xa0 +; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s67 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s66 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s49 +; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s59 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s7 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x80 +; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s7 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v34, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s61 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s60 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s43 +; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s66 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s68 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s51 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s63 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s62 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s61 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s60 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s65 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s64 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s63 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s62 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s58 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s56 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s68 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s36 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s67 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s34 ; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s33 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] ; 
GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s49 +; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s47 +; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s45 +; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s55 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s54 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s26 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s43 +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s41 +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -3279,64 +3280,64 @@ ; GCN-NOHSA-VI-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s20, 0xffff +; 
GCN-NOHSA-VI-NEXT: s_mov_b32 s26, 0xffff ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x40 ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[36:51], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s70, s15, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s70, s15, s26 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s5, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s4, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s7, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s6, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s64, s9, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s65, s8, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s66, s11, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s10, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s13, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s69, s12, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s64, s9, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s65, s8, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s66, s11, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s10, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s13, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s69, s12, s26 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s15, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s37, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s37, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s36, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s36, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s39, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s39, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s38, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s38, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s37, s26 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s36, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s36, s26 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s39, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s39, s26 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s38, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s38, s26 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s41, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s41, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s41, s26 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s40, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s40, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s40, s26 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s43, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s33, s43, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s33, s43, s26 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s42, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s42, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s42, s26 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s45, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s37, s45, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s37, s45, s26 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s44, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s39, s44, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s39, s44, s26 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s47, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s41, s47, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s41, s47, s26 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s46, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s43, s46, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s43, s46, s26 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s49, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s45, s49, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s45, s49, s26 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s48, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s47, s48, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s47, s48, s26 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s51, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s49, s51, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s49, s51, s26 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s50, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s50, s50, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s53, s1, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s55, s0, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s57, s3, s20 -; GCN-NOHSA-VI-NEXT: 
s_and_b32 s59, s2, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s14, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s50, s50, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s53, s1, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s55, s0, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s57, s3, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s59, s2, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s14, s26 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s1, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s0, 16 @@ -3348,7 +3349,7 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 @@ -3434,14 +3435,14 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s22 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s18 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -3463,19 +3464,19 @@ ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T56.XYZW, T58.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T52.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T53.XYZW, T55.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T37.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T48.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T38.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T40.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T46.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T39.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T43.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T36.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T41.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T43.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T36.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 22: ; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 0, #1 -; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 48, #1 -; EG-NEXT: VTX_READ_128 T38.XYZW, T35.X, 32, #1 -; EG-NEXT: VTX_READ_128 T39.XYZW, T35.X, 16, #1 +; EG-NEXT: VTX_READ_128 T39.XYZW, T35.X, 48, #1 +; EG-NEXT: VTX_READ_128 T40.XYZW, T35.X, 32, #1 +; EG-NEXT: VTX_READ_128 T41.XYZW, T35.X, 16, #1 ; EG-NEXT: Fetch clause starting at 30: ; EG-NEXT: VTX_READ_128 T49.XYZW, T35.X, 112, #1 ; EG-NEXT: VTX_READ_128 
T50.XYZW, T35.X, 96, #1 @@ -3484,74 +3485,74 @@ ; EG-NEXT: ALU clause starting at 38: ; EG-NEXT: MOV * T35.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 39: -; EG-NEXT: LSHR * T40.W, T36.Y, literal.x, +; EG-NEXT: LSHR * T37.W, T36.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T40.Z, T36.Y, literal.x, +; EG-NEXT: AND_INT * T37.Z, T36.Y, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHR T40.Y, T36.X, literal.x, -; EG-NEXT: LSHR * T41.W, T36.W, literal.x, +; EG-NEXT: LSHR T37.Y, T36.X, literal.x, +; EG-NEXT: LSHR * T38.W, T36.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T40.X, T36.X, literal.x, -; EG-NEXT: AND_INT T41.Z, T36.W, literal.x, +; EG-NEXT: AND_INT T37.X, T36.X, literal.x, +; EG-NEXT: AND_INT T38.Z, T36.W, literal.x, ; EG-NEXT: LSHR * T36.X, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) -; EG-NEXT: LSHR T41.Y, T36.Z, literal.x, -; EG-NEXT: LSHR * T42.W, T39.Y, literal.x, +; EG-NEXT: LSHR T38.Y, T36.Z, literal.x, +; EG-NEXT: LSHR * T42.W, T41.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T41.X, T36.Z, literal.x, -; EG-NEXT: AND_INT T42.Z, T39.Y, literal.x, +; EG-NEXT: AND_INT T38.X, T36.Z, literal.x, +; EG-NEXT: AND_INT T42.Z, T41.Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) ; EG-NEXT: LSHR T43.X, PV.W, literal.x, -; EG-NEXT: LSHR T42.Y, T39.X, literal.y, -; EG-NEXT: LSHR T44.W, T39.W, literal.y, -; EG-NEXT: AND_INT * T42.X, T39.X, literal.z, +; EG-NEXT: LSHR T42.Y, T41.X, literal.y, +; EG-NEXT: LSHR T44.W, T41.W, literal.y, +; EG-NEXT: AND_INT * T42.X, T41.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: AND_INT T44.Z, T39.W, literal.x, +; EG-NEXT: AND_INT T44.Z, T41.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) -; EG-NEXT: LSHR T39.X, PV.W, literal.x, -; EG-NEXT: LSHR T44.Y, T39.Z, literal.y, -; EG-NEXT: LSHR T45.W, T38.Y, literal.y, -; EG-NEXT: AND_INT * T44.X, T39.Z, literal.z, +; EG-NEXT: LSHR T41.X, PV.W, literal.x, +; EG-NEXT: LSHR T44.Y, T41.Z, literal.y, +; EG-NEXT: LSHR T45.W, T40.Y, literal.y, +; EG-NEXT: AND_INT * T44.X, T41.Z, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: AND_INT T45.Z, T38.Y, literal.x, +; EG-NEXT: AND_INT T45.Z, T40.Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44) ; EG-NEXT: LSHR T46.X, PV.W, literal.x, -; EG-NEXT: LSHR T45.Y, T38.X, literal.y, -; EG-NEXT: LSHR T47.W, T38.W, literal.y, -; EG-NEXT: AND_INT * T45.X, T38.X, literal.z, +; EG-NEXT: LSHR T45.Y, T40.X, literal.y, +; EG-NEXT: LSHR T47.W, T40.W, literal.y, +; EG-NEXT: AND_INT * T45.X, T40.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: AND_INT T47.Z, T38.W, literal.x, +; EG-NEXT: AND_INT T47.Z, T40.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44) -; EG-NEXT: LSHR T38.X, PV.W, literal.x, -; EG-NEXT: LSHR T47.Y, T38.Z, literal.y, -; EG-NEXT: AND_INT * T47.X, T38.Z, literal.z, +; EG-NEXT: LSHR T40.X, PV.W, literal.x, +; EG-NEXT: LSHR T47.Y, T40.Z, literal.y, +; EG-NEXT: AND_INT * T47.X, T40.Z, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 
0(0.000000e+00) ; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, -; EG-NEXT: LSHR * T35.W, T37.Y, literal.y, +; EG-NEXT: LSHR * T35.W, T39.Y, literal.y, ; EG-NEXT: 80(1.121039e-43), 16(2.242078e-44) ; EG-NEXT: LSHR T48.X, PV.W, literal.x, -; EG-NEXT: AND_INT * T35.Z, T37.Y, literal.y, +; EG-NEXT: AND_INT * T35.Z, T39.Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) ; EG-NEXT: ALU clause starting at 95: -; EG-NEXT: LSHR T35.Y, T37.X, literal.x, -; EG-NEXT: LSHR * T53.W, T37.W, literal.x, +; EG-NEXT: LSHR T35.Y, T39.X, literal.x, +; EG-NEXT: LSHR * T53.W, T39.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T35.X, T37.X, literal.x, -; EG-NEXT: AND_INT T53.Z, T37.W, literal.x, +; EG-NEXT: AND_INT T35.X, T39.X, literal.x, +; EG-NEXT: AND_INT T53.Z, T39.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43) -; EG-NEXT: LSHR T37.X, PV.W, literal.x, -; EG-NEXT: LSHR T53.Y, T37.Z, literal.y, +; EG-NEXT: LSHR T39.X, PV.W, literal.x, +; EG-NEXT: LSHR T53.Y, T39.Z, literal.y, ; EG-NEXT: LSHR T54.W, T52.Y, literal.y, -; EG-NEXT: AND_INT * T53.X, T37.Z, literal.z, +; EG-NEXT: AND_INT * T53.X, T39.Z, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: AND_INT T54.Z, T52.Y, literal.x, @@ -3638,101 +3639,103 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x10 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s5, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s4, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s1, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s0, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s35, s1 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s38, s0 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s3, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s40, s2, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s41, s3 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s42, s2 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s5, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s44, s4, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s22, s7, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s6, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s7, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s6, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s6, s6 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s24, s9, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s8, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s47, s9, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s48, s8, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s9, s9 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s8, s8 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s26, s11, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s10, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s49, s11, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s50, s10, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s11, s11 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s28, s13, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s12, 16 
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s51, s13, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s52, s12, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s13, s13 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s12, s12 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s30, s15, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s14, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s53, s15, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s54, s14, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s15, s15 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s14, s14 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s17, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s16, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s55, s17, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s16, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s17, s17 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s16, s16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s19, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s52, s18, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s19, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s18, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s19, s19 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s18, s18 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s53, s37, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s54, s36, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s37, s37 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s36, s36 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s55, s39, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s38, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s39, s39 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s38, s38 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s41, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s40, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s41, s41 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s40, s40 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s59, s42, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s60, s43 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s42, s42 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s45, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s62, s44, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s45, s45 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s44, s44 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s47, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s64, s46, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s47, s47 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s46, s46 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s65, s49, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s66, s48, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s49, s49 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s48, s48 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s67, s51, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s68, s50, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s51, s51 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s50, s50 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s43, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s59, s21, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s60, s20, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s21, s21 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s20, s20 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s22, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s62, s23 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s22, s22 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s25, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s64, s24, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s25, s25 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s24, s24 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s65, s27, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s66, s26, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s27, s27 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s26, s26 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s67, s29, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s68, s28, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s29, s29 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s28, s28 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s69, s31, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s70, s30, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s31, s31 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s30, s30 +; 
GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s23, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s36 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s37 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s50 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s68 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s51 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s67 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s66 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s49 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s46 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s47 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s44 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s45 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s61 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s42 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s59 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s43 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s41 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s57 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s70 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s69 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s68 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s67 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s66 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s65 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s63 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s61 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s62 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s60 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s59 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 @@ -3740,64 +3743,64 @@ ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s58 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s57 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s55 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s52 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s51 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s50 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s49 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s48 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s47 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s46 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s45 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s44 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s43 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v0, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -3805,58 +3808,51 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 -; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_ashr_i32 s20, s5, 16 -; GCN-HSA-NEXT: s_ashr_i32 s21, s4, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s22, s5 -; GCN-HSA-NEXT: s_sext_i32_i16 s23, s4 -; GCN-HSA-NEXT: s_ashr_i32 s24, s7, 16 -; GCN-HSA-NEXT: s_ashr_i32 s25, s6, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s26, s7 -; GCN-HSA-NEXT: s_sext_i32_i16 s27, s6 -; GCN-HSA-NEXT: s_ashr_i32 s28, s9, 16 -; GCN-HSA-NEXT: s_ashr_i32 s29, s8, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s30, s9 -; GCN-HSA-NEXT: s_sext_i32_i16 s31, s8 -; GCN-HSA-NEXT: s_ashr_i32 s33, s11, 16 -; GCN-HSA-NEXT: s_ashr_i32 s34, s10, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s35, s11 -; GCN-HSA-NEXT: s_sext_i32_i16 s36, s10 -; GCN-HSA-NEXT: s_ashr_i32 s37, s13, 16 -; GCN-HSA-NEXT: s_ashr_i32 s38, s12, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s39, s13 -; GCN-HSA-NEXT: s_sext_i32_i16 s40, s12 -; GCN-HSA-NEXT: s_ashr_i32 s41, s15, 16 -; GCN-HSA-NEXT: s_ashr_i32 s42, s14, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s43, s15 -; GCN-HSA-NEXT: s_sext_i32_i16 s44, s14 -; GCN-HSA-NEXT: s_ashr_i32 s45, s17, 16 -; GCN-HSA-NEXT: s_ashr_i32 s46, s16, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s47, s17 -; GCN-HSA-NEXT: s_sext_i32_i16 s48, s16 -; GCN-HSA-NEXT: s_ashr_i32 s49, s19, 16 -; GCN-HSA-NEXT: s_ashr_i32 s50, s18, 16 -; GCN-HSA-NEXT: s_sext_i32_i16 s51, s19 -; GCN-HSA-NEXT: s_sext_i32_i16 s52, s18 -; GCN-HSA-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x10 +; GCN-HSA-NEXT: s_load_dwordx16 s[16:31], s[2:3], 0x0 +; GCN-HSA-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x10 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_ashr_i32 s57, s9, 16 -; GCN-HSA-NEXT: s_ashr_i32 s59, s11, 16 -; GCN-HSA-NEXT: s_ashr_i32 s60, s10, 16 -; GCN-HSA-NEXT: s_ashr_i32 s61, s13, 16 -; GCN-HSA-NEXT: s_ashr_i32 s62, s12, 16 -; GCN-HSA-NEXT: s_ashr_i32 s63, s15, 16 -; GCN-HSA-NEXT: s_ashr_i32 s64, s14, 16 -; GCN-HSA-NEXT: s_ashr_i32 s65, s17, 16 -; GCN-HSA-NEXT: s_ashr_i32 s66, s16, 16 -; GCN-HSA-NEXT: s_ashr_i32 s67, s19, 16 -; GCN-HSA-NEXT: s_ashr_i32 s68, s18, 16 -; GCN-HSA-NEXT: s_ashr_i32 s53, s5, 16 -; GCN-HSA-NEXT: s_ashr_i32 s54, s4, 16 -; GCN-HSA-NEXT: s_ashr_i32 s55, s7, 16 -; GCN-HSA-NEXT: s_ashr_i32 s56, s6, 16 -; GCN-HSA-NEXT: s_ashr_i32 s58, s8, 16 +; GCN-HSA-NEXT: s_ashr_i32 s33, s29, 16 +; GCN-HSA-NEXT: s_ashr_i32 s57, s41, 16 +; GCN-HSA-NEXT: s_ashr_i32 s59, s43, 16 +; GCN-HSA-NEXT: s_ashr_i32 s60, s42, 16 +; GCN-HSA-NEXT: s_ashr_i32 s61, s45, 16 +; GCN-HSA-NEXT: s_ashr_i32 s62, s44, 16 +; GCN-HSA-NEXT: s_ashr_i32 s63, s47, 16 +; GCN-HSA-NEXT: s_ashr_i32 s64, s46, 16 +; GCN-HSA-NEXT: s_ashr_i32 s65, s49, 16 +; GCN-HSA-NEXT: s_ashr_i32 s66, s48, 16 +; GCN-HSA-NEXT: s_ashr_i32 s67, s51, 16 +; GCN-HSA-NEXT: s_ashr_i32 s68, s50, 16 +; GCN-HSA-NEXT: s_ashr_i32 s34, s28, 16 +; GCN-HSA-NEXT: s_ashr_i32 s35, s31, 16 +; GCN-HSA-NEXT: s_ashr_i32 s52, s30, 16 +; GCN-HSA-NEXT: 
s_ashr_i32 s53, s37, 16 +; GCN-HSA-NEXT: s_ashr_i32 s54, s36, 16 +; GCN-HSA-NEXT: s_ashr_i32 s55, s39, 16 +; GCN-HSA-NEXT: s_ashr_i32 s56, s38, 16 +; GCN-HSA-NEXT: s_ashr_i32 s58, s40, 16 +; GCN-HSA-NEXT: s_ashr_i32 s4, s17, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s6, s17 +; GCN-HSA-NEXT: s_ashr_i32 s5, s16, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s7, s16 +; GCN-HSA-NEXT: s_ashr_i32 s8, s19, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s10, s19 +; GCN-HSA-NEXT: s_ashr_i32 s9, s18, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s11, s18 +; GCN-HSA-NEXT: s_ashr_i32 s12, s21, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s14, s21 +; GCN-HSA-NEXT: s_ashr_i32 s13, s20, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s15, s20 +; GCN-HSA-NEXT: s_ashr_i32 s16, s23, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s18, s23 +; GCN-HSA-NEXT: s_ashr_i32 s17, s22, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s19, s22 +; GCN-HSA-NEXT: s_ashr_i32 s20, s25, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s22, s25 +; GCN-HSA-NEXT: s_ashr_i32 s25, s26, 16 +; GCN-HSA-NEXT: s_ashr_i32 s21, s24, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s23, s24 +; GCN-HSA-NEXT: s_ashr_i32 s24, s27, 16 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 @@ -3880,13 +3876,13 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s17, s17 -; GCN-HSA-NEXT: s_sext_i32_i16 s16, s16 +; GCN-HSA-NEXT: s_sext_i32_i16 s49, s49 +; GCN-HSA-NEXT: s_sext_i32_i16 s48, s48 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s49 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] @@ -3895,134 +3891,140 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v35, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 -; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 +; GCN-HSA-NEXT: s_sext_i32_i16 s43, s43 +; GCN-HSA-NEXT: s_sext_i32_i16 s42, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v34, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s60 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: s_sext_i32_i16 s19, s19 +; GCN-HSA-NEXT: s_sext_i32_i16 s51, s51 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_sext_i32_i16 s18, s18 +; GCN-HSA-NEXT: s_sext_i32_i16 s50, s50 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 -; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 -; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 -; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 -; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 -; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 +; GCN-HSA-NEXT: s_sext_i32_i16 s36, s36 +; GCN-HSA-NEXT: s_sext_i32_i16 s40, s40 +; GCN-HSA-NEXT: s_sext_i32_i16 s45, s45 +; GCN-HSA-NEXT: s_sext_i32_i16 s44, s44 +; GCN-HSA-NEXT: s_sext_i32_i16 s47, s47 +; GCN-HSA-NEXT: s_sext_i32_i16 s46, s46 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: 
s_sext_i32_i16 s7, s7 -; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 -; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: s_sext_i32_i16 s39, s39 +; GCN-HSA-NEXT: s_sext_i32_i16 s38, s38 +; GCN-HSA-NEXT: s_sext_i32_i16 s41, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s14 +; GCN-HSA-NEXT: s_sext_i32_i16 s37, s37 +; GCN-HSA-NEXT: s_sext_i32_i16 s29, s29 +; GCN-HSA-NEXT: s_sext_i32_i16 s28, s28 +; GCN-HSA-NEXT: s_sext_i32_i16 s31, s31 +; GCN-HSA-NEXT: s_sext_i32_i16 s30, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s44 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s45 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s41 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s54 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s34 ; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s33 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_sext_i32_i16 s27, s27 +; GCN-HSA-NEXT: s_sext_i32_i16 s26, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], 
v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s39 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s20 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s12 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x40 -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[36:51], s[18:19], 0x0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x40 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s69, s15, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s70, s14, 16 @@ -4030,22 +4032,6 @@ ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s14 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s13, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s68, s12, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s18, s37, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s20, s37 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s22, s39, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s24, s39 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s26, s41, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s28, s41 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s30, s43, 16 -; 
GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s33, s43 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s35, s45, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s37, s45 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s39, s47, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s41, s47 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s43, s49, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s45, s49 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s47, s51, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s49, s51 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s51, s1, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s53, s1 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s52, s0, 16 @@ -4058,8 +4044,8 @@ ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s12 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s17 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s36 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 @@ -4101,84 +4087,93 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s19, s36, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s49, s31, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s21, s36 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s50, s30, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s58 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s56 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s57 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s23, s38, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s25, s38 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s27, s40, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s29, s40 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s42, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s34, s42 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s36, s44, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s38, s44 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s40, s46, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s42, s46 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s44, s48, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s46, s48 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s48, s50, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s50, s50 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s31, s31 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s30, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s52 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s53 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s47, s29, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s48, s28, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s29, s29 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s28, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s45, s27, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s46, s26, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s27, s27 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s26, s26 +; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s49 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s44 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s45 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s42 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s40 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s41 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s43, s25, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s44, s24, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s25, s25 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s24, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s41, s23, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s42, s22, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s23, s23 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s22, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s30 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s39, s21, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s40, s20, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s21, s21 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s20, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s26 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s35, s19, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s38, s18, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s19, s19 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s18, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s40 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s39 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s22 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s17, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s34, s16, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s17, s17 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s16, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -5109,40 +5104,40 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff +; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s0, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s1, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s3, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s5, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s10, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s8, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s9, s2 +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, s6 +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, s6 +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s6 +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s9, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i64: @@ -5851,184 +5846,185 @@ ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GCN-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s2, s11 -; GCN-HSA-NEXT: s_mov_b32 s12, s9 -; GCN-HSA-NEXT: s_mov_b32 s14, s7 -; GCN-HSA-NEXT: s_mov_b32 s16, s5 -; GCN-HSA-NEXT: s_lshr_b32 s18, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s20, s8, 16 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[10:11], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 -; GCN-HSA-NEXT: s_lshr_b32 s22, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s24, s4, 16 -; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[4:5], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[6:7], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[8:9], 0x100000 +; GCN-HSA-NEXT: s_mov_b32 s6, s15 +; GCN-HSA-NEXT: s_mov_b32 s16, s13 +; GCN-HSA-NEXT: s_mov_b32 s18, s11 +; GCN-HSA-NEXT: s_mov_b32 s20, s9 +; GCN-HSA-NEXT: s_lshr_b32 s22, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s24, s12, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 +; GCN-HSA-NEXT: s_ashr_i64 s[14:15], s[14:15], 48 +; GCN-HSA-NEXT: s_lshr_b32 s26, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s8, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[10:11], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_ashr_i64 s[12:13], s[12:13], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[8:9], s[8:9], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[4:5], s[4:5], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[6:7], s[6:7], 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[24:25], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[22:23], 0x100000 +; GCN-HSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[26:27], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 -; GCN-HSA-NEXT: s_add_u32 s22, s0, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s23, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s8 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s9 -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s8 +; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s12 +; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x50 
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 +; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s13 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 +; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s23 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 +; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s11 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s11, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s9 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[6:7], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s9, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s7 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s7, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s5 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s5, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s7 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, s3 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 
v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s7, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[2:3], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s3, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, s5 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s5, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:64 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, s1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s1, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[0:1], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s21 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_sextload_v16i16_to_v16i64: @@ -6126,109 +6122,111 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v32i16_to_v32i64: ; 
GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, 0xffff ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s17, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s19, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s4, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s6, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s8, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s10, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s33, s12, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s34, s14, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s16, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s18, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s17, s17, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s19, s2 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s18, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s16, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s1, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s15, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s0, s18 +; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s2, s18 +; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s4, s18 +; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s6, s18 +; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s8, s18 +; GCN-NOHSA-SI-NEXT: s_and_b32 s33, s10, s18 +; GCN-NOHSA-SI-NEXT: s_and_b32 s34, s12, s18 +; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s14, s18 +; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s1, s18 +; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s3, s18 +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s18 +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s18 +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, s18 +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s18 +; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, s18 +; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, s18 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s14, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s12, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s10, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s8, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s6, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s0, 16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: 
s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s33 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v0, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -6657,26 +6655,26 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s11 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s7 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s28, s5 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s3 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s34, s1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s8, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s0, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s11 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s46, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s38, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s8, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[18:19], 0x100000 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s54, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s56, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s58, s0, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[2:3], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[56:57], s[4:5], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[58:59], s[6:7], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[6:7], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[60:61], s[8:9], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[62:63], s[10:11], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[64:65], s[12:13], 0x100000 @@ -6691,24 +6689,24 @@ ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 48 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s50 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s51 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s21 +; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v4, s48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s49 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s2 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s3 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[28:29], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[26:27], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[24:25], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[22:23], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[34:35], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[30:31], 0x100000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s21 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[46:47], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[44:45], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[40:41], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[42:43], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s37 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s16 @@ -6724,28 +6722,28 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[52:53], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[50:51], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[48:49], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[46:47], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[42:43], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[40:41], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[38:39], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[36:37], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[58:59], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[56:57], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[54:55], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[52:53], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[34:35], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[28:29], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s39 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s70 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s71 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s41 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s68 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s69 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 
offset:16 @@ -6758,14 +6756,14 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s63 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s60 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s61 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s59 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s57 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s55 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 @@ -6799,139 +6797,139 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s38, s15 -; GCN-HSA-NEXT: s_mov_b32 s40, s13 -; GCN-HSA-NEXT: s_mov_b32 s42, s11 -; GCN-HSA-NEXT: s_mov_b32 s44, s9 -; GCN-HSA-NEXT: s_mov_b32 s46, s7 -; GCN-HSA-NEXT: s_mov_b32 s48, s5 -; GCN-HSA-NEXT: s_mov_b32 s50, s3 -; GCN-HSA-NEXT: s_mov_b32 s52, s1 -; GCN-HSA-NEXT: s_lshr_b32 s54, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s56, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s58, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s60, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s62, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s64, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s66, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s68, s0, 16 +; GCN-HSA-NEXT: s_mov_b32 s42, s15 +; GCN-HSA-NEXT: s_mov_b32 s44, s13 +; GCN-HSA-NEXT: s_mov_b32 s46, s11 +; GCN-HSA-NEXT: s_mov_b32 s48, s9 +; GCN-HSA-NEXT: s_mov_b32 s50, s7 +; GCN-HSA-NEXT: s_mov_b32 s52, s5 +; GCN-HSA-NEXT: s_mov_b32 s54, s3 +; GCN-HSA-NEXT: s_mov_b32 s56, s1 +; GCN-HSA-NEXT: s_lshr_b32 s58, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s60, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s62, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s64, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s66, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s68, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s70, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s72, s0, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-HSA-NEXT: s_ashr_i64 s[36:37], s[0:1], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[0:1], s[14:15], 48 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[70:71], s[2:3], 48 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[38:39], 0x100000 +; GCN-HSA-NEXT: s_ashr_i64 s[38:39], s[2:3], 48 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[42:43], 0x100000 ; GCN-HSA-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[80:81], s[12:13], 48 +; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[10:11], 
0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[12:13], s[12:13], 48 ; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[72:73], s[4:5], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[40:41], s[4:5], 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[68:69], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[66:67], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[62:63], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[60:61], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[58:59], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[56:57], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[38:39], s[54:55], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 -; GCN-HSA-NEXT: s_add_u32 s54, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s55, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s12 -; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 -; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s12 -; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s12 -; GCN-HSA-NEXT: s_add_u32 s12, s16, 0x90 -; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s12 -; GCN-HSA-NEXT: s_add_u32 s12, s16, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s12 -; GCN-HSA-NEXT: s_add_u32 s12, s16, 0x50 -; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s12 -; GCN-HSA-NEXT: s_add_u32 s12, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s41 -; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s12 -; GCN-HSA-NEXT: s_add_u32 s12, s16, 16 -; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, s12 -; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s74 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s75 -; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s12 -; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s55 +; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[72:73], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[70:71], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[68:69], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[64:65], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[60:61], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[58:59], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[56:57], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 +; GCN-HSA-NEXT: 
s_add_u32 s56, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s57, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s44 +; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s45 +; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s45 +; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s45 +; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s45 +; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57 +; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s40 +; GCN-HSA-NEXT: s_add_u32 s40, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41 +; GCN-HSA-NEXT: s_addc_u32 s41, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s80 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81 +; GCN-HSA-NEXT: s_add_u32 s38, s16, 48 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 +; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s39 +; GCN-HSA-NEXT: s_add_u32 s38, s16, 16 +; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s78 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s79 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s45 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s14 +; GCN-HSA-NEXT: s_add_u32 s14, s16, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s49 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s76 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s77 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s72 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s73 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s70 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s71 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s12 +; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s74 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s75 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s45 +; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s42 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v5, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s14 ; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s12 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] @@ -7001,146 +6999,151 @@ ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s0, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s26, s1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s1, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s2, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s34, s3 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s3, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s4, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, s5 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s5, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s46, s7 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s7, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s52, s9 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s9, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s58, s11 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s11, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s64, s13 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s13, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s70, s15 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s72, s15, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s30, s1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s1, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s6, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s10, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s64, s11 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s80, s15 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s82, s15, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s0, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s44, s5 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s50, s7 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s7, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[34:35], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[48:49], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[64:65], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[64:65], s[82:83], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s8, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s56, s9 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s9, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[30:31], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[46:47], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[62:63], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[62:63], s[80:81], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s78, s14, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[68:69], s[14:15], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[24:25], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[28:29], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[34:35], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[38:39], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[42:43], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], 
s[48:49], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[54:55], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[60:61], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[66:67], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[66:67], s[72:73], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[26:27], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[30:31], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[36:37], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[40:41], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[46:47], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[52:53], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[58:59], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[64:65], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[64:65], s[70:71], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s2, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s38, s3 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s3, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[76:77], s[14:15], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s4, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[44:45], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[58:59], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s72, s13 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s74, s13, 16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s65 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s66 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s17 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[78:79], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s62 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s63 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[62:63], s[12:13], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s69 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[42:43], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[56:57], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[68:69], s[12:13], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s70, s12, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[40:41], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[54:55], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[72:73], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[74:75], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s76 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s77 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s58 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s58 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s59 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s60 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[38:39], 0x100000 +; GCN-NOHSA-VI-NEXT: 
s_bfe_i64 s[38:39], s[52:53], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s11, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[70:71], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s62 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s63 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[36:37], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[50:51], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[66:67], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s69 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s52 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s61 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s47 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s43 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s40 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s41 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s42 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s40 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s39 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 Index: llvm/test/CodeGen/AMDGPU/load-global-i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -2123,18 +2123,18 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 -; GCN-NOHSA-SI-NEXT: 
s_mov_b32 s10, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff -; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 @@ -2144,20 +2144,20 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, s2, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s2, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, s2, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, s2, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, s6, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s6, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, s6, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, s6, v2 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s2, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s2, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s2, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s2, v6 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s6, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s6, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s6, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s6, v6 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32: @@ -2362,17 +2362,17 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { ; GCN-NOHSA-SI-LABEL: global_sextload_v16i16_to_v16i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: 
buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v1 @@ -2392,10 +2392,10 @@ ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 16, v6 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v7, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32: @@ -3591,24 +3591,24 @@ ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_movk_i32 s14, 0x50 -; GCN-HSA-NEXT: s_movk_i32 s15, 0x60 -; GCN-HSA-NEXT: s_movk_i32 s16, 0x70 -; GCN-HSA-NEXT: s_mov_b32 s17, 0xffff +; GCN-HSA-NEXT: s_movk_i32 s6, 0x50 +; GCN-HSA-NEXT: s_movk_i32 s7, 0x60 +; GCN-HSA-NEXT: s_movk_i32 s8, 0x70 +; GCN-HSA-NEXT: s_mov_b32 s9, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_add_u32 s4, s2, s14 +; GCN-HSA-NEXT: s_add_u32 s4, s2, s6 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s2, s15 +; GCN-HSA-NEXT: s_add_u32 s4, s2, s7 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s2, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s2, s8 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[2:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -3616,19 +3616,19 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: s_add_u32 s6, s2, 48 -; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 +; GCN-HSA-NEXT: s_add_u32 s10, s2, 48 +; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 64 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GCN-HSA-NEXT: flat_load_dwordx4 
v[20:23], v[20:21] @@ -3638,61 +3638,61 @@ ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xd0 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s12, s0, 0xa0 +; GCN-HSA-NEXT: s_add_u32 s12, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s14, s0, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s16, s0, 0xa0 +; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v26, s17, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v24, s17, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GCN-HSA-NEXT: s_add_u32 s12, s0, 0xb0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] -; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v26, s9, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v24, s9, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GCN-HSA-NEXT: s_add_u32 s16, s0, 0xb0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[24:27] +; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v26, s9, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v24, s9, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[24:27] +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s9, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v4, s9, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v26, s17, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v24, s17, v2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] -; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v5, s9, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v3, s9, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; GCN-HSA-NEXT: s_waitcnt vmcnt(9) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_waitcnt 
vmcnt(9) +; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v9 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v11 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s5 @@ -3700,13 +3700,13 @@ ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v33 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v32 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v33 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v32 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v33 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v32 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v35 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v34 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s17, v35 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s17, v34 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s9, v35 +; GCN-HSA-NEXT: v_and_b32_e32 v4, s9, v34 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 @@ -3716,51 +3716,51 @@ ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s15 +; GCN-HSA-NEXT: s_add_u32 s2, s0, s7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v29 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; GCN-HSA-NEXT: v_and_b32_e32 v10, s17, v29 -; GCN-HSA-NEXT: v_and_b32_e32 v8, s17, v28 +; GCN-HSA-NEXT: v_and_b32_e32 v10, s9, v29 +; GCN-HSA-NEXT: v_and_b32_e32 v8, s9, v28 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v31 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v31 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v30 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v31 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v30 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s16 +; GCN-HSA-NEXT: s_add_u32 s2, s0, s8 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v21 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v20 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v21 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v20 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v23 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s17, v23 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s17, v22 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s9, v23 +; GCN-HSA-NEXT: v_and_b32_e32 v4, s9, v22 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: 
v_lshrrev_b32_e32 v3, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v15 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v14 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s17, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s9, v13 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s17, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v4, s9, v12 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v17 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v14, s17, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v12, s17, v16 +; GCN-HSA-NEXT: v_and_b32_e32 v14, s9, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v12, s9, v16 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s14 +; GCN-HSA-NEXT: s_add_u32 s2, s0, s6 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v19 @@ -3769,8 +3769,8 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v18 -; GCN-HSA-NEXT: v_and_b32_e32 v10, s17, v19 -; GCN-HSA-NEXT: v_and_b32_e32 v8, s17, v18 +; GCN-HSA-NEXT: v_and_b32_e32 v10, s9, v19 +; GCN-HSA-NEXT: v_and_b32_e32 v8, s9, v18 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 @@ -3798,79 +3798,79 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, 0xffff ; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, s0, v15 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, s0, v14 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v28, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s0, v19 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s0, v18 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: 
buffer_store_dword v29, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v30, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v31, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[56:59], off, s[8:11], 0 offset:112 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, s0, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v12 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, s0, v12 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v19 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, s0, v19 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, s0, v18 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, s0, v17 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s0, v16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, s0, v23 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, s0, v22 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, s0, v23 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, s0, v22 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, s0, v21 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, s0, v20 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 16, v27 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, s0, v27 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v26 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, s0, v26 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v27 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, s0, v27 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, s0, v26 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, s0, v25 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, s0, v24 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v46, s0, v11 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, s0, v10 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, s0, v9 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s0, v8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, s0, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, s0, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, s0, v5 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s0, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, s0, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v52, s0, v2 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, s0, v1 -; 
GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s0, v0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 16, v31 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, s0, v31 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v30 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, s0, v30 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, s0, v29 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, s0, v28 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v15 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v46, s0, v15 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, s0, v14 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, s0, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s0, v12 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, s0, v11 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, s0, v10 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, s0, v9 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s0, v8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, s0, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v52, s0, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, s0, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s0, v4 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v61, 16, v59 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v60, s0, v59 @@ -3889,13 +3889,13 @@ ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:176 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload @@ -3919,20 +3919,20 @@ ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T59.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T52.XYZW, T58.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T56.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T55.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T55.X, 0 ; EG-NEXT: 
MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T53.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T48.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T48.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T47.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T46.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T46.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T44.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T43.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T41.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 22: ; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 0, #1 -; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 48, #1 -; EG-NEXT: VTX_READ_128 T38.XYZW, T35.X, 32, #1 -; EG-NEXT: VTX_READ_128 T39.XYZW, T35.X, 16, #1 +; EG-NEXT: VTX_READ_128 T38.XYZW, T35.X, 48, #1 +; EG-NEXT: VTX_READ_128 T39.XYZW, T35.X, 32, #1 +; EG-NEXT: VTX_READ_128 T40.XYZW, T35.X, 16, #1 ; EG-NEXT: Fetch clause starting at 30: ; EG-NEXT: VTX_READ_128 T49.XYZW, T35.X, 112, #1 ; EG-NEXT: VTX_READ_128 T50.XYZW, T35.X, 96, #1 @@ -3941,75 +3941,75 @@ ; EG-NEXT: ALU clause starting at 38: ; EG-NEXT: MOV * T35.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 39: -; EG-NEXT: LSHR * T40.W, T36.W, literal.x, +; EG-NEXT: LSHR * T37.W, T36.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T40.Z, T36.W, literal.x, +; EG-NEXT: AND_INT * T37.Z, T36.W, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHR T40.Y, T36.Z, literal.x, +; EG-NEXT: LSHR T37.Y, T36.Z, literal.x, ; EG-NEXT: LSHR * T36.W, T36.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T40.X, T36.Z, literal.x, +; EG-NEXT: AND_INT T37.X, T36.Z, literal.x, ; EG-NEXT: AND_INT T36.Z, T36.Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) ; EG-NEXT: LSHR T41.X, PV.W, literal.x, ; EG-NEXT: LSHR T36.Y, T36.X, literal.y, -; EG-NEXT: LSHR T42.W, T39.W, literal.y, +; EG-NEXT: LSHR T42.W, T40.W, literal.y, ; EG-NEXT: AND_INT * T36.X, T36.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: AND_INT * T42.Z, T39.W, literal.x, +; EG-NEXT: AND_INT * T42.Z, T40.W, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: LSHR T43.X, KC0[2].Y, literal.x, -; EG-NEXT: LSHR T42.Y, T39.Z, literal.y, -; EG-NEXT: LSHR T39.W, T39.Y, literal.y, -; EG-NEXT: AND_INT * T42.X, T39.Z, literal.z, +; EG-NEXT: LSHR T42.Y, T40.Z, literal.y, +; EG-NEXT: LSHR T40.W, T40.Y, literal.y, +; EG-NEXT: AND_INT * T42.X, T40.Z, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: AND_INT T39.Z, T39.Y, literal.x, +; EG-NEXT: AND_INT T40.Z, T40.Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44) ; EG-NEXT: LSHR T44.X, PV.W, literal.x, -; EG-NEXT: LSHR T39.Y, T39.X, literal.y, -; EG-NEXT: LSHR T45.W, T38.W, literal.y, -; EG-NEXT: AND_INT * T39.X, T39.X, literal.z, +; EG-NEXT: LSHR T40.Y, T40.X, literal.y, +; EG-NEXT: LSHR T45.W, T39.W, literal.y, +; EG-NEXT: AND_INT * T40.X, T40.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: AND_INT T45.Z, T38.W, literal.x, +; EG-NEXT: AND_INT T45.Z, T39.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44) ; EG-NEXT: LSHR 
T46.X, PV.W, literal.x, -; EG-NEXT: LSHR T45.Y, T38.Z, literal.y, -; EG-NEXT: LSHR T38.W, T38.Y, literal.y, -; EG-NEXT: AND_INT * T45.X, T38.Z, literal.z, +; EG-NEXT: LSHR T45.Y, T39.Z, literal.y, +; EG-NEXT: LSHR T39.W, T39.Y, literal.y, +; EG-NEXT: AND_INT * T45.X, T39.Z, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: AND_INT T38.Z, T38.Y, literal.x, +; EG-NEXT: AND_INT T39.Z, T39.Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43) ; EG-NEXT: LSHR T47.X, PV.W, literal.x, -; EG-NEXT: LSHR T38.Y, T38.X, literal.y, -; EG-NEXT: AND_INT * T38.X, T38.X, literal.z, +; EG-NEXT: LSHR T39.Y, T39.X, literal.y, +; EG-NEXT: AND_INT * T39.X, T39.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, -; EG-NEXT: LSHR * T35.W, T37.W, literal.y, +; EG-NEXT: LSHR * T35.W, T38.W, literal.y, ; EG-NEXT: 64(8.968310e-44), 16(2.242078e-44) ; EG-NEXT: LSHR T48.X, PV.W, literal.x, -; EG-NEXT: AND_INT * T35.Z, T37.W, literal.y, +; EG-NEXT: AND_INT * T35.Z, T38.W, literal.y, ; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) ; EG-NEXT: ALU clause starting at 96: -; EG-NEXT: LSHR T35.Y, T37.Z, literal.x, -; EG-NEXT: LSHR * T37.W, T37.Y, literal.x, +; EG-NEXT: LSHR T35.Y, T38.Z, literal.x, +; EG-NEXT: LSHR * T38.W, T38.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T35.X, T37.Z, literal.x, -; EG-NEXT: AND_INT T37.Z, T37.Y, literal.x, +; EG-NEXT: AND_INT T35.X, T38.Z, literal.x, +; EG-NEXT: AND_INT T38.Z, T38.Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43) ; EG-NEXT: LSHR T53.X, PV.W, literal.x, -; EG-NEXT: LSHR T37.Y, T37.X, literal.y, +; EG-NEXT: LSHR T38.Y, T38.X, literal.y, ; EG-NEXT: LSHR T54.W, T52.W, literal.y, -; EG-NEXT: AND_INT * T37.X, T37.X, literal.z, +; EG-NEXT: AND_INT * T38.X, T38.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: AND_INT T54.Z, T52.W, literal.x, @@ -4279,102 +4279,102 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[36:39], off, s[8:11], 0 offset:48 ; 
GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v19 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v18 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v19, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v18, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v11 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v10 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v11, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v10, 0, 16 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v39, 16, v17 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v37, 16, v16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v38, v17, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v36, v16, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 16, v23 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 16, v22 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v23, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v22, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v43, 16, v21 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v41, 16, v20 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v42, v21, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v40, v20, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 16, v27 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 16, v26 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v27, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v26, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v47, 16, v25 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v45, 16, v24 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v46, v25, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v44, v24, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 16, v31 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v30 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v31, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v30, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v51, 16, v29 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v49, 16, v28 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v50, v29, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v48, v28, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 16, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 16, v14 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v15, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v14, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v55, 16, v13 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v53, 16, v12 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v54, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v52, v12, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 16, v11 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 16, v10 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v14, v11, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v10, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v59, 16, v9 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v57, 16, v8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v58, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v56, v8, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v7 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v6 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v7, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v63, 16, v5 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v61, 16, v4 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v62, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v60, v4, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v35 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 
16, v34 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v35, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v34, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v9, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v8, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v31 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v30 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v31, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v30, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v43, 16, v29 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v41, 16, v28 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v42, v29, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v40, v28, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 16, v35 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 16, v34 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v35, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v34, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v47, 16, v33 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v45, 16, v32 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v46, v33, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v44, v32, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v39 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v38 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v39, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v38, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v51, 16, v37 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v49, 16, v36 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v50, v37, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v48, v36, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v39, 16, v27 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v37, 16, v26 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v38, v27, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v36, v26, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v55, 16, v25 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v53, 16, v24 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v54, v25, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v52, v24, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v23 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v22 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v23, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v22, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v59, 16, v21 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v57, 16, v20 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v58, v21, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v56, v20, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 16, v19 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v20, 16, v18 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v19, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v18, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v63, 16, v17 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v61, 16, v16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v62, v17, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v60, v16, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 16, v15 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 16, v14 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v15, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v14, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v33 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v32 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v33, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v32, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v13 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v12 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v13, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v12, 0, 16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 
v[4:7], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload @@ -4386,28 +4386,28 @@ ; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_movk_i32 s8, 0x70 -; GCN-HSA-NEXT: s_movk_i32 s9, 0x60 -; GCN-HSA-NEXT: s_movk_i32 s10, 0x50 +; GCN-HSA-NEXT: s_movk_i32 s9, 0x70 +; GCN-HSA-NEXT: s_movk_i32 s10, 0x60 +; GCN-HSA-NEXT: s_movk_i32 s8, 0x50 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_add_u32 s4, s2, s8 +; GCN-HSA-NEXT: s_add_u32 s4, s2, s9 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s2, s9 +; GCN-HSA-NEXT: s_add_u32 s4, s2, s10 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; GCN-HSA-NEXT: s_add_u32 s4, s2, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-HSA-NEXT: s_add_u32 s4, s2, s10 -; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], 
v[12:13] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 64 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s2, 48 @@ -4433,66 +4433,70 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v0 -; GCN-HSA-NEXT: v_bfe_i32 v26, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v24, v0, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v13 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v12 +; GCN-HSA-NEXT: v_bfe_i32 v26, v13, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v24, v12, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[24:27] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v3 -; GCN-HSA-NEXT: v_bfe_i32 v26, v3, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v2 -; GCN-HSA-NEXT: v_bfe_i32 v24, v2, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v15 +; GCN-HSA-NEXT: v_bfe_i32 v26, v15, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v14 +; GCN-HSA-NEXT: v_bfe_i32 v24, v14, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[24:27] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v4, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v8 +; GCN-HSA-NEXT: v_bfe_i32 v14, v9, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v7 -; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v6 -; GCN-HSA-NEXT: v_bfe_i32 v0, v6, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11 +; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10 +; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v12 -; GCN-HSA-NEXT: v_bfe_i32 v2, v13, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v12, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v5 +; GCN-HSA-NEXT: v_bfe_i32 v10, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v7 +; GCN-HSA-NEXT: v_bfe_i32 v14, v7, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v12, v6, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v2 +; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v8 -; GCN-HSA-NEXT: v_bfe_i32 v2, v9, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v8, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v15 -; GCN-HSA-NEXT: v_bfe_i32 v6, v15, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v14 -; GCN-HSA-NEXT: v_bfe_i32 v4, v14, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -4501,11 +4505,7 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11 -; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10 -; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_waitcnt vmcnt(11) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v17 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v16 @@ -4515,7 +4515,7 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s9 +; GCN-HSA-NEXT: s_add_u32 s2, s0, s10 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v19 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v18 ; GCN-HSA-NEXT: v_bfe_i32 v2, v19, 0, 16 @@ -4524,7 +4524,7 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s8 +; GCN-HSA-NEXT: s_add_u32 s2, s0, s9 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 @@ -4532,7 +4532,7 @@ ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s10 +; GCN-HSA-NEXT: s_add_u32 s2, s0, s8 ; GCN-HSA-NEXT: s_waitcnt vmcnt(12) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v23 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v22 @@ -4594,81 +4594,86 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:96 -; 
GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(6) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v59, 16, v1 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v57, 16, v0 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v51, 16, v9 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v59, 16, v13 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(5) +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v55, 16, v17 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v54, v17, 0, 16 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 16, v15 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v15, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 16, v14 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v14, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v28, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v11 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v11, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v10 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v29, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v30, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v31, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v9 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v27 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v27, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v26 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v26, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v39, 16, v25 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v38, v25, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v37, 16, v24 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v36, v24, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 16, v31 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v31, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 16, v30 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 
v24, v30, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v43, 16, v29 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v42, v29, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v41, 16, v28 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v40, v28, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 16, v35 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v35, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 16, v34 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v34, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v47, 16, v33 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v46, v33, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v45, 16, v32 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v44, v32, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 16, v23 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v22 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v22, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v23, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v51, 16, v21 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v50, v21, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v49, 16, v20 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v48, v20, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 16, v19 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v19, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 16, v18 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v18, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v53, 16, v16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v52, v16, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 16, v15 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v14 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v14, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v57, 16, v12 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v58, v13, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v56, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[60:63], off, s[8:11], 0 offset:112 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v50, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v58, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v56, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 16, v19 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v19, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 16, v18 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v18, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v39, 16, v17 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v38, v17, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v37, 16, v16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v36, v16, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v23 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v23, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v22 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v22, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v43, 16, v21 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v42, v21, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v41, 16, v20 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v40, v20, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 16, v27 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v27, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 16, v26 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v26, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v47, 16, v25 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v46, v25, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v45, 16, v24 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v44, v24, 0, 16 -; GCN-NOHSA-VI-NEXT: 
v_ashrrev_i32_e32 v27, 16, v11 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 16, v10 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v10, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v49, 16, v8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v48, v8, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v3 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v2 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 16, v13 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v12 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v12, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 16, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v7, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 16, v6 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v55, 16, v5 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v54, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v53, 16, v4 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v52, v4, 0, 16 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v61 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v60 @@ -4681,18 +4686,23 @@ ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:240 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:208 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:176 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:144 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: 
buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload @@ -4914,9 +4924,9 @@ ; CM-NEXT: ALU 82, @57, KC0[CB0:0-32], KC1[] ; CM-NEXT: ALU 72, @140, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T65, T66.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T36, T37.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T36.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T64, T56.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T55.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T37, T55.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T63, T54.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T45, T53.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T62, T52.X @@ -4932,17 +4942,17 @@ ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: Fetch clause starting at 24: -; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 16, #1 -; CM-NEXT: VTX_READ_128 T37.XYZW, T35.X, 0, #1 +; CM-NEXT: VTX_READ_128 T35.XYZW, T37.X, 16, #1 +; CM-NEXT: VTX_READ_128 T36.XYZW, T37.X, 0, #1 ; CM-NEXT: Fetch clause starting at 28: -; CM-NEXT: VTX_READ_128 T41.XYZW, T35.X, 112, #1 -; CM-NEXT: VTX_READ_128 T42.XYZW, T35.X, 96, #1 -; CM-NEXT: VTX_READ_128 T43.XYZW, T35.X, 80, #1 -; CM-NEXT: VTX_READ_128 T44.XYZW, T35.X, 64, #1 -; CM-NEXT: VTX_READ_128 T45.XYZW, T35.X, 48, #1 -; CM-NEXT: VTX_READ_128 T35.XYZW, T35.X, 32, #1 +; CM-NEXT: VTX_READ_128 T41.XYZW, T37.X, 112, #1 +; CM-NEXT: VTX_READ_128 T42.XYZW, T37.X, 96, #1 +; CM-NEXT: VTX_READ_128 T43.XYZW, T37.X, 80, #1 +; CM-NEXT: VTX_READ_128 T44.XYZW, T37.X, 64, #1 +; CM-NEXT: VTX_READ_128 T45.XYZW, T37.X, 48, #1 +; CM-NEXT: VTX_READ_128 T37.XYZW, T37.X, 32, #1 ; CM-NEXT: ALU clause starting at 40: -; CM-NEXT: MOV * T35.X, KC0[2].Z, +; CM-NEXT: MOV * T37.X, KC0[2].Z, ; CM-NEXT: ALU clause starting at 41: ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; CM-NEXT: 224(3.138909e-43), 0(0.000000e+00) @@ -4950,34 +4960,34 @@ ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; CM-NEXT: 2(2.802597e-45), 240(3.363116e-43) ; CM-NEXT: LSHR T39.X, PV.W, literal.x, -; CM-NEXT: LSHR T0.Y, T37.Z, literal.y, -; CM-NEXT: LSHR T0.Z, T37.W, literal.y, +; CM-NEXT: LSHR T0.Y, T36.Z, literal.y, +; CM-NEXT: LSHR T0.Z, T36.W, literal.y, ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; CM-NEXT: 192(2.690493e-43), 0(0.000000e+00) ; CM-NEXT: LSHR T40.X, PV.W, literal.x, -; CM-NEXT: LSHR T1.Y, T37.Y, literal.y, -; CM-NEXT: LSHR T1.Z, T36.Z, literal.y, -; CM-NEXT: LSHR * T0.W, T36.W, literal.y, +; CM-NEXT: LSHR T1.Y, T36.Y, literal.y, +; CM-NEXT: LSHR T1.Z, T35.Z, literal.y, +; CM-NEXT: LSHR * T0.W, T35.W, literal.y, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; CM-NEXT: ALU clause starting at 57: -; CM-NEXT: LSHR T2.Z, T36.X, literal.x, +; CM-NEXT: LSHR T2.Z, T35.X, literal.x, ; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; CM-NEXT: 16(2.242078e-44), 208(2.914701e-43) ; CM-NEXT: LSHR T46.X, PV.W, literal.x, -; CM-NEXT: LSHR T2.Y, T36.Y, literal.y, -; CM-NEXT: LSHR T3.Z, T35.Z, literal.y, +; CM-NEXT: LSHR T2.Y, T35.Y, literal.y, +; CM-NEXT: LSHR T3.Z, T37.Z, literal.y, ; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; CM-NEXT: 160(2.242078e-43), 0(0.000000e+00) ; CM-NEXT: LSHR T47.X, PV.W, literal.x, -; CM-NEXT: LSHR T3.Y, T35.W, literal.y, -; CM-NEXT: LSHR T4.Z, T35.X, literal.y, +; CM-NEXT: LSHR T3.Y, T37.W, literal.y, +; CM-NEXT: LSHR T4.Z, T37.X, literal.y, ; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; CM-NEXT: 176(2.466285e-43), 0(0.000000e+00) ; CM-NEXT: LSHR T48.X, PV.W, literal.x, 
-; CM-NEXT: LSHR T4.Y, T35.Y, literal.y, +; CM-NEXT: LSHR T4.Y, T37.Y, literal.y, ; CM-NEXT: LSHR T5.Z, T45.Z, literal.y, ; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) @@ -5077,41 +5087,41 @@ ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; CM-NEXT: BFE_INT T62.X, T45.Z, 0.0, literal.x, ; CM-NEXT: BFE_INT T61.Y, T7.Z, 0.0, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: BFE_INT T45.Z, T35.Y, 0.0, literal.x, +; CM-NEXT: BFE_INT T45.Z, T37.Y, 0.0, literal.x, ; CM-NEXT: BFE_INT * T44.W, T6.Y, 0.0, literal.x, BS:VEC_120/SCL_212 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T45.X, T35.X, 0.0, literal.x, +; CM-NEXT: BFE_INT T45.X, T37.X, 0.0, literal.x, ; CM-NEXT: BFE_INT T44.Y, T6.Z, 0.0, literal.x, -; CM-NEXT: BFE_INT T63.Z, T35.W, 0.0, literal.x, +; CM-NEXT: BFE_INT T63.Z, T37.W, 0.0, literal.x, ; CM-NEXT: BFE_INT * T62.W, T5.Y, 0.0, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T63.X, T35.Z, 0.0, literal.x, +; CM-NEXT: BFE_INT T63.X, T37.Z, 0.0, literal.x, ; CM-NEXT: BFE_INT T62.Y, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: BFE_INT T35.Z, T36.Y, 0.0, literal.x, +; CM-NEXT: BFE_INT T37.Z, T35.Y, 0.0, literal.x, ; CM-NEXT: BFE_INT * T45.W, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T35.X, T36.X, 0.0, literal.x, +; CM-NEXT: BFE_INT T37.X, T35.X, 0.0, literal.x, ; CM-NEXT: BFE_INT T45.Y, T4.Z, 0.0, literal.x, -; CM-NEXT: BFE_INT T64.Z, T36.W, 0.0, literal.x, +; CM-NEXT: BFE_INT T64.Z, T35.W, 0.0, literal.x, ; CM-NEXT: BFE_INT * T63.W, T3.Y, 0.0, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T64.X, T36.Z, 0.0, literal.x, +; CM-NEXT: BFE_INT T64.X, T35.Z, 0.0, literal.x, ; CM-NEXT: BFE_INT T63.Y, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: BFE_INT T36.Z, T37.Y, 0.0, literal.x, -; CM-NEXT: BFE_INT * T35.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212 +; CM-NEXT: BFE_INT T35.Z, T36.Y, 0.0, literal.x, +; CM-NEXT: BFE_INT * T37.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T36.X, T37.X, 0.0, literal.x, -; CM-NEXT: BFE_INT T35.Y, T2.Z, 0.0, literal.x, -; CM-NEXT: BFE_INT T65.Z, T37.W, 0.0, literal.x, +; CM-NEXT: BFE_INT T35.X, T36.X, 0.0, literal.x, +; CM-NEXT: BFE_INT T37.Y, T2.Z, 0.0, literal.x, +; CM-NEXT: BFE_INT T65.Z, T36.W, 0.0, literal.x, ; CM-NEXT: BFE_INT * T64.W, T0.W, 0.0, literal.x, BS:VEC_120/SCL_212 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T65.X, T37.Z, 0.0, literal.x, +; CM-NEXT: BFE_INT T65.X, T36.Z, 0.0, literal.x, ; CM-NEXT: BFE_INT T64.Y, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: LSHR T1.Z, T37.X, literal.x, -; CM-NEXT: BFE_INT * T36.W, T1.Y, 0.0, literal.x, +; CM-NEXT: LSHR T1.Z, T36.X, literal.x, +; CM-NEXT: BFE_INT * T35.W, T1.Y, 0.0, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T37.X, KC0[2].Y, literal.x, -; CM-NEXT: BFE_INT T36.Y, PV.Z, 0.0, literal.y, +; CM-NEXT: LSHR T36.X, KC0[2].Y, literal.x, +; CM-NEXT: BFE_INT T35.Y, PV.Z, 0.0, literal.y, ; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.y, ; CM-NEXT: BFE_INT * T65.W, T0.Z, 0.0, literal.y, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) @@ -6042,15 +6052,15 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; 
GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, 0xffff -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, 0 @@ -6060,8 +6070,8 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 @@ -6071,10 +6081,10 @@ ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, s12, v2 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s12, v1 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s12, v3 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64: @@ -6934,10 +6944,10 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[2:3] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 @@ -6955,63 +6965,63 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[0:1], 48 +; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[4:5], 48 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[2:3], 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 +; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[6:7], 48 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; 
GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v6 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; GCN-HSA-NEXT: v_bfe_i32 v7, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v5, v6, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[5:8] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v6, v6, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_bfe_i32 v0, v5, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[4:5], 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, v7 +; GCN-HSA-NEXT: s_waitcnt vmcnt(4) +; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[0:1], 48 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, v3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v10, v8, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v6, 0, 16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v0, v4, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v11, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[6:7], 48 -; GCN-HSA-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v11, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[2:3], 48 +; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i64: @@ -7567,21 +7577,20 @@ ; 
GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[1:4], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[5:8], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v57, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s0, v1 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s0, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, s0, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, s0, v36 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 @@ -7590,13 +7599,12 @@ ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, s0, v37 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v37, 0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, s0, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s0, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s0, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, s0, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, s0, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, s0, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s0, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v17, s0, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v21, s0, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v25, s0, v8 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, s0, v7 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, s0, v32 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v32 @@ -7617,45 +7625,47 @@ ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:208 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, v37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, 0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v31 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v37 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v37 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NOHSA-VI-NEXT: 
buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v35, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v33, v37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v42, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v40, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, v37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v37 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v37 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v37 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v32i16_to_v32i64: @@ -8001,45 +8011,45 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v16, v3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v15 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v16, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[2:3], 48 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[14:15], 48 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[0:1], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[12:13], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v13, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v3 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[6:7], 48 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v13, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[2:3], 48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[4:5], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[0:1], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v7 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[14:15], 48 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[6:7], 48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[12:13], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v13, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[4:5], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v5, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v11 @@ -8053,48 +8063,49 @@ ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v9, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v1, 0, 16 +; 
GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v14, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v12 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v12, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v10 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v8, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v3, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v8, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v10, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v7, v7, 0, 16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v12, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v14, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v4, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v4, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 @@ -8103,7 +8114,7 @@ ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64: @@ -8113,11 +8124,11 @@ ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[2:3] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 @@ -8135,8 +8146,8 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_bfe_i32 v16, v1, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[0:1], 48 +; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[4:5], 48 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 @@ -8152,93 +8163,93 @@ ; GCN-HSA-NEXT: s_add_u32 s10, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 -; GCN-HSA-NEXT: v_bfe_i32 v16, v1, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 +; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 -; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[2:3], 48 +; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[6:7], 48 ; GCN-HSA-NEXT: s_add_u32 s14, s0, 32 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: v_bfe_i32 v18, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v2, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s14 +; GCN-HSA-NEXT: v_bfe_i32 v18, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s14 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[1:2], v[16:19] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[16:19] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: v_bfe_i32 v2, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5 ; 
GCN-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-HSA-NEXT: v_bfe_i32 v0, v5, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[4:5], 48 +; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[0:1], 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, v7 -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[6:7], 48 +; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[2:3], 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s9 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_bfe_i32 v0, v9, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[8:9], 48 +; GCN-HSA-NEXT: v_bfe_i32 v3, v9, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[8:9], 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, v11 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[3:6] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, v11 -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[10:11], 48 +; GCN-HSA-NEXT: v_bfe_i32 v3, v1, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[10:11], 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[3:6] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s13 ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_bfe_i32 v0, v13, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[12:13], 48 +; GCN-HSA-NEXT: v_bfe_i32 v3, v13, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[12:13], 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s12 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, v15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[3:6] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, v15 +; GCN-HSA-NEXT: v_bfe_i32 v3, v1, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s11 -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[14:15], 48 +; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[14:15], 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s10 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v0, v6, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[3:6] +; GCN-HSA-NEXT: v_bfe_i32 v1, v2, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v3, v7, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 ; 
GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v10 -; GCN-HSA-NEXT: v_bfe_i32 v0, v4, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[1:4] +; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v10 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -8292,8 +8303,8 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) @@ -8310,7 +8321,7 @@ ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v14, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v12, 0, 16 @@ -8336,7 +8347,7 @@ ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v5, 0, 16 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v8 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v11, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v8, 0, 16 @@ -8344,58 +8355,58 @@ ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v9, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v8, 0, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, v7 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v8, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, v3 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v2, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v9, 0, 16 -; 
GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v4, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v0, 0, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v7, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v5, v4, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: 
buffer_store_dwordx4 v[6:9], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; Index: llvm/test/CodeGen/AMDGPU/sdiv64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -5,21 +5,21 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GCN-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s12, s3, 31 -; GCN-NEXT: s_add_u32 s2, s2, s12 -; GCN-NEXT: s_mov_b32 s13, s12 -; GCN-NEXT: s_addc_u32 s3, s3, s12 -; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GCN-NEXT: s_sub_u32 s4, 0, s2 -; GCN-NEXT: s_subb_u32 s5, 0, s3 +; GCN-NEXT: s_ashr_i32 s2, s5, 31 +; GCN-NEXT: s_add_u32 s4, s4, s2 +; GCN-NEXT: s_mov_b32 s3, s2 +; GCN-NEXT: s_addc_u32 s5, s5, s2 +; GCN-NEXT: s_xor_b64 s[12:13], s[4:5], s[2:3] +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 +; GCN-NEXT: s_sub_u32 s4, 0, s12 +; GCN-NEXT: s_subb_u32 s5, 0, s13 ; GCN-NEXT: s_ashr_i32 s14, s11, 31 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 @@ -96,23 +96,23 @@ ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_mul_lo_u32 v2, s12, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s12, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s13, v0 +; GCN-NEXT: v_mov_b32_e32 v5, s13 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mul_lo_u32 v3, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v3, s12, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s11, v2 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 ; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 +; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s12, v3 ; GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 ; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 ; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] @@ -122,16 +122,16 @@ ; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v6, s11 ; GCN-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 +; GCN-NEXT: 
v_cmp_eq_u32_e32 vcc, s13, v2 ; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: s_xor_b64 s[0:1], s[14:15], s[12:13] +; GCN-NEXT: s_xor_b64 s[0:1], s[14:15], s[2:3] ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 ; GCN-NEXT: v_xor_b32_e32 v1, s1, v1 Index: llvm/test/CodeGen/AMDGPU/shl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/shl.ll +++ llvm/test/CodeGen/AMDGPU/shl.ll @@ -971,30 +971,30 @@ ; ; VI-LABEL: shl_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s19, 0xf000 -; VI-NEXT: s_mov_b32 s18, -1 +; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s16, s8 -; VI-NEXT: s_mov_b32 s17, s9 -; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; VI-NEXT: s_load_dwordx8 s[8:15], s[10:11], 0x20 +; VI-NEXT: s_mov_b32 s0, s12 +; VI-NEXT: s_mov_b32 s1, s13 +; VI-NEXT: s_load_dwordx8 s[4:11], s[14:15], 0x0 +; VI-NEXT: s_load_dwordx8 s[12:19], s[14:15], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshl_b64 s[10:11], s[10:11], s18 +; VI-NEXT: s_lshl_b64 s[8:9], s[8:9], s16 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s14 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s12 -; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; VI-NEXT: s_nop 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 -; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: shl_v4i64: Index: llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll +++ llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll @@ -1,6 +1,5 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck %s -; FIXME: The wide loads and bundles introduce so much spilling. 
define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(float addrspace(4)* %wei_ptr, float addrspace(1)* %out_ptr, float addrspace(1)* %in) { ; CHECK-LABEL: excess_soft_clause_reg_pressure: ; CHECK: BB0_1: ; %for.cond28.preheader @@ -14,78 +13,15 @@ ; CHECK-NEXT: s_load_dwordx16 ; CHECK-NEXT: s_load_dwordx16 -; CHECK: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: s_load_dwordx16 - -; CHECK: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: v_writelane_b32 - -; CHECK: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 +; CHECK-NOT: v_writelane_b32 +; CHECK-NOT: v_readlane_b32 ; CHECK: s_load_dwordx16 ; CHECK: s_load_dwordx16 ; CHECK: s_load_dwordx16 -; CHECK: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 -; CHECK-NEXT: v_readlane_b32 +; CHECK-NOT: v_writelane_b32 +; CHECK-NOT: v_readlane_b32 entry: %i = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() %i1 = bitcast i8 addrspace(4)* %i to i64 addrspace(4)* Index: llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir +++ llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir @@ -38,40 +38,38 @@ ; CHECK: undef %62.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec ; CHECK: SI_SPILL_V128_SAVE %62, %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5) ; CHECK: undef %67.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec - ; CHECK: undef %71.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec - ; CHECK: SI_SPILL_V128_SAVE %71, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5) - ; CHECK: undef %76.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, 
implicit $exec - ; CHECK: SI_SPILL_V128_SAVE %76, %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5) - ; CHECK: undef %81.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec - ; CHECK: SI_SPILL_V128_SAVE %81, %stack.6, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.6, align 4, addrspace 5) - ; CHECK: undef %86.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec - ; CHECK: undef %90.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec - ; CHECK: SI_SPILL_V128_SAVE %90, %stack.7, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.7, align 4, addrspace 5) + ; CHECK: SI_SPILL_V128_SAVE %67, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5) + ; CHECK: undef %72.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE %72, %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5) + ; CHECK: undef %77.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE %77, %stack.6, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.6, align 4, addrspace 5) + ; CHECK: undef %82.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE %82, %stack.7, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.7, align 4, addrspace 5) + ; CHECK: undef %87.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec + ; CHECK: undef %91.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec ; CHECK: undef %95.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec ; CHECK: SI_SPILL_V128_SAVE %95, %stack.8, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.8, align 4, addrspace 5) - ; CHECK: undef %100.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec - ; CHECK: SI_SPILL_V128_SAVE %100, %stack.9, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.9, align 4, addrspace 5) - ; CHECK: undef %105.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec - ; CHECK: undef %109.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec - ; CHECK: undef %113.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec - ; CHECK: undef %117.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec - ; CHECK: SI_SPILL_V128_SAVE %117, %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5) + ; CHECK: undef %19.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec + ; CHECK: undef %153.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE %153, %stack.14, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.14, align 4, addrspace 5) + ; CHECK: undef %102.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec + ; CHECK: undef %106.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE %106, %stack.9, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.9, align 4, addrspace 5) 
+ ; CHECK: undef %111.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 64, 0, 0, 0, implicit $exec :: (load (s128), align 64, addrspace 1) - ; CHECK: undef %122.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec - ; CHECK: undef %126.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec - ; CHECK: undef %130.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec - ; CHECK: SI_SPILL_V128_SAVE %130, %stack.11, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.11, align 4, addrspace 5) - ; CHECK: undef %135.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec - ; CHECK: SI_SPILL_V128_SAVE %135, %stack.12, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.12, align 4, addrspace 5) + ; CHECK: undef %115.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec + ; CHECK: undef %119.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec + ; CHECK: undef %123.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec + ; CHECK: undef %127.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE %127, %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5) ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 80, 0, 0, 0, implicit $exec :: (load (s128), addrspace 1) - ; CHECK: undef %140.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec - ; CHECK: undef %144.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec - ; CHECK: SI_SPILL_V128_SAVE %144, %stack.13, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.13, align 4, addrspace 5) - ; CHECK: undef %149.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec - ; CHECK: SI_SPILL_V128_SAVE %149, %stack.14, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.14, align 4, addrspace 5) - ; CHECK: undef %154.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec + ; CHECK: undef %138.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec + ; CHECK: undef %142.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec + ; CHECK: undef %146.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec + ; CHECK: undef %150.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE %150, %stack.13, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.13, align 4, addrspace 5) ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 96, 0, 0, 0, implicit $exec :: (load (s128), align 32, addrspace 1) - ; CHECK: undef %158.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec + ; CHECK: undef %156.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec ; CHECK: undef %36.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec ; CHECK: undef %37.sub2:vreg_128 = 
V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec ; CHECK: undef %38.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec @@ -93,59 +91,61 @@ ; CHECK: [[SI_SPILL_V128_RESTORE3:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5) ; CHECK: [[SI_SPILL_V128_RESTORE3]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE3]], %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5) - ; CHECK: undef %68.sub2:vreg_128 = COPY %67.sub2 - ; CHECK: %68.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec ; CHECK: [[SI_SPILL_V128_RESTORE4:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) - ; CHECK: [[SI_SPILL_V128_RESTORE4]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec + ; CHECK: [[SI_SPILL_V128_RESTORE4]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE4]], %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5) ; CHECK: [[SI_SPILL_V128_RESTORE5:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5) - ; CHECK: [[SI_SPILL_V128_RESTORE5]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec + ; CHECK: [[SI_SPILL_V128_RESTORE5]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE5]], %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5) ; CHECK: [[SI_SPILL_V128_RESTORE6:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.6, align 4, addrspace 5) - ; CHECK: [[SI_SPILL_V128_RESTORE6]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec + ; CHECK: [[SI_SPILL_V128_RESTORE6]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE6]], %stack.6, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.6, align 4, addrspace 5) - ; CHECK: undef %87.sub2:vreg_128 = COPY %86.sub2 - ; CHECK: %87.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec ; CHECK: [[SI_SPILL_V128_RESTORE7:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.7, align 4, addrspace 5) - ; CHECK: [[SI_SPILL_V128_RESTORE7]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec + ; CHECK: [[SI_SPILL_V128_RESTORE7]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE7]], %stack.7, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.7, align 4, addrspace 5) - ; CHECK: [[SI_SPILL_V128_RESTORE8:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.8, align 4, addrspace 5) - ; CHECK: 
[[SI_SPILL_V128_RESTORE8]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec - ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE8]], %stack.8, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.8, align 4, addrspace 5) - ; CHECK: [[SI_SPILL_V128_RESTORE9:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.9, align 4, addrspace 5) - ; CHECK: [[SI_SPILL_V128_RESTORE9]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec - ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE9]], %stack.9, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.9, align 4, addrspace 5) - ; CHECK: undef %106.sub2:vreg_128 = COPY %105.sub2 - ; CHECK: %106.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec - ; CHECK: undef %110.sub2:vreg_128 = COPY %109.sub2 - ; CHECK: %110.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec - ; CHECK: undef %114.sub2:vreg_128 = COPY %113.sub2 - ; CHECK: %114.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec - ; CHECK: [[SI_SPILL_V128_RESTORE10:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5) - ; CHECK: [[SI_SPILL_V128_RESTORE10]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec - ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE10]], %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5) - ; CHECK: undef %123.sub2:vreg_128 = COPY %122.sub2 - ; CHECK: %123.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec - ; CHECK: undef %127.sub2:vreg_128 = COPY %126.sub2 - ; CHECK: %127.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec - ; CHECK: [[SI_SPILL_V128_RESTORE11:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.11, align 4, addrspace 5) - ; CHECK: [[SI_SPILL_V128_RESTORE11]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec - ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE11]], %stack.11, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.11, align 4, addrspace 5) - ; CHECK: [[SI_SPILL_V128_RESTORE12:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.12, align 4, addrspace 5) - ; CHECK: [[SI_SPILL_V128_RESTORE12]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec - ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE12]], %stack.12, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.12, align 4, addrspace 5) - ; CHECK: undef %141.sub2:vreg_128 = COPY %140.sub2 - ; CHECK: %141.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec - ; CHECK: [[SI_SPILL_V128_RESTORE13:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.13, align 4, addrspace 5) - ; CHECK: [[SI_SPILL_V128_RESTORE13]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec - ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE13]], %stack.13, $sgpr32, 0, implicit $exec :: (store (s128) into 
%stack.13, align 4, addrspace 5) - ; CHECK: [[SI_SPILL_V128_RESTORE14:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.14, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.14, align 4, addrspace 5) - ; CHECK: [[SI_SPILL_V128_RESTORE14]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec - ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE14]], %stack.14, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.14, align 4, addrspace 5) - ; CHECK: undef %155.sub2:vreg_128 = COPY %154.sub2 - ; CHECK: %155.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec - ; CHECK: undef %159.sub2:vreg_128 = COPY %158.sub2 - ; CHECK: %159.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec + ; CHECK: undef %131.sub2:vreg_128 = COPY %87.sub2 + ; CHECK: SI_SPILL_V128_SAVE %131, %stack.11, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.11, align 4, addrspace 5) + ; CHECK: [[SI_SPILL_V128_RESTORE8:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.11, align 4, addrspace 5) + ; CHECK: [[SI_SPILL_V128_RESTORE8]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE8]], %stack.11, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.11, align 4, addrspace 5) + ; CHECK: undef %134.sub2:vreg_128 = COPY %91.sub2 + ; CHECK: SI_SPILL_V128_SAVE %134, %stack.12, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.12, align 4, addrspace 5) + ; CHECK: [[SI_SPILL_V128_RESTORE9:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.12, align 4, addrspace 5) + ; CHECK: [[SI_SPILL_V128_RESTORE9]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE9]], %stack.12, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.12, align 4, addrspace 5) + ; CHECK: [[SI_SPILL_V128_RESTORE10:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.8, align 4, addrspace 5) + ; CHECK: [[SI_SPILL_V128_RESTORE10]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE10]], %stack.8, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.8, align 4, addrspace 5) + ; CHECK: %19.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec + ; CHECK: [[SI_SPILL_V128_RESTORE11:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.14, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.14, align 4, addrspace 5) + ; CHECK: [[SI_SPILL_V128_RESTORE11]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE11]], %stack.14, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.14, align 4, addrspace 5) + ; CHECK: undef %103.sub2:vreg_128 = COPY %102.sub2 + ; CHECK: %103.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec + ; CHECK: [[SI_SPILL_V128_RESTORE12:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.9, align 4, addrspace 5) + ; CHECK: [[SI_SPILL_V128_RESTORE12]].sub0:vreg_128 = V_AND_B32_e32 
[[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE12]], %stack.9, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.9, align 4, addrspace 5) + ; CHECK: undef %112.sub2:vreg_128 = COPY %111.sub2 + ; CHECK: %112.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec + ; CHECK: undef %116.sub2:vreg_128 = COPY %115.sub2 + ; CHECK: %116.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec + ; CHECK: undef %120.sub2:vreg_128 = COPY %119.sub2 + ; CHECK: %120.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec + ; CHECK: undef %124.sub2:vreg_128 = COPY %123.sub2 + ; CHECK: %124.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec + ; CHECK: [[SI_SPILL_V128_RESTORE13:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5) + ; CHECK: [[SI_SPILL_V128_RESTORE13]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE13]], %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5) + ; CHECK: undef %139.sub2:vreg_128 = COPY %138.sub2 + ; CHECK: %139.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec + ; CHECK: undef %143.sub2:vreg_128 = COPY %142.sub2 + ; CHECK: %143.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec + ; CHECK: undef %147.sub2:vreg_128 = COPY %146.sub2 + ; CHECK: %147.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec + ; CHECK: [[SI_SPILL_V128_RESTORE14:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.13, align 4, addrspace 5) + ; CHECK: [[SI_SPILL_V128_RESTORE14]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec + ; CHECK: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE14]], %stack.13, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.13, align 4, addrspace 5) + ; CHECK: %156.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec ; CHECK: %36.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec ; CHECK: %37.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec ; CHECK: %38.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec @@ -174,139 +174,136 @@ ; CHECK: %36.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %36.sub3:vreg_128 = COPY %43.sub1 ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %36, %2, 0, 384, 0, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) - ; CHECK: undef %157.sub0:vreg_128 = COPY %159.sub0 { - ; CHECK: internal %157.sub2:vreg_128 = COPY %159.sub2 + ; CHECK: undef %157.sub0:vreg_128 = COPY %156.sub0 { + ; CHECK: internal %157.sub2:vreg_128 = COPY %156.sub2 ; CHECK: } ; CHECK: %157.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %157.sub3:vreg_128 = COPY %43.sub1 ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %157, %2, 0, 400, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK: undef %153.sub0:vreg_128 = COPY %155.sub0 { - ; CHECK: internal %153.sub2:vreg_128 = COPY %155.sub2 + ; CHECK: 
[[SI_SPILL_V128_RESTORE15:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.13, align 4, addrspace 5) + ; CHECK: undef %149.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE15]].sub0 { + ; CHECK: internal %149.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE15]].sub2 ; CHECK: } - ; CHECK: %153.sub1:vreg_128 = COPY %43.sub1 - ; CHECK: %153.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %153, %2, 0, 352, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) - ; CHECK: [[SI_SPILL_V128_RESTORE15:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.14, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.14, align 4, addrspace 5) - ; CHECK: undef %148.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE15]].sub0 { - ; CHECK: internal %148.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE15]].sub2 + ; CHECK: %149.sub1:vreg_128 = COPY %43.sub1 + ; CHECK: %149.sub3:vreg_128 = COPY %43.sub1 + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %149, %2, 0, 352, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + ; CHECK: undef %145.sub0:vreg_128 = COPY %147.sub0 { + ; CHECK: internal %145.sub2:vreg_128 = COPY %147.sub2 ; CHECK: } - ; CHECK: %148.sub1:vreg_128 = COPY %43.sub1 - ; CHECK: %148.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %148, %2, 0, 368, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK: [[SI_SPILL_V128_RESTORE16:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.13, align 4, addrspace 5) - ; CHECK: undef %143.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE16]].sub0 { - ; CHECK: internal %143.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE16]].sub2 + ; CHECK: %145.sub1:vreg_128 = COPY %43.sub1 + ; CHECK: %145.sub3:vreg_128 = COPY %43.sub1 + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %145, %2, 0, 368, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK: undef %141.sub0:vreg_128 = COPY %143.sub0 { + ; CHECK: internal %141.sub2:vreg_128 = COPY %143.sub2 ; CHECK: } - ; CHECK: %143.sub1:vreg_128 = COPY %43.sub1 - ; CHECK: %143.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %143, %2, 0, 320, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1) - ; CHECK: undef %139.sub0:vreg_128 = COPY %141.sub0 { - ; CHECK: internal %139.sub2:vreg_128 = COPY %141.sub2 + ; CHECK: %141.sub1:vreg_128 = COPY %43.sub1 + ; CHECK: %141.sub3:vreg_128 = COPY %43.sub1 + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %141, %2, 0, 320, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1) + ; CHECK: undef %137.sub0:vreg_128 = COPY %139.sub0 { + ; CHECK: internal %137.sub2:vreg_128 = COPY %139.sub2 ; CHECK: } - ; CHECK: %139.sub1:vreg_128 = COPY %43.sub1 - ; CHECK: %139.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %139, %2, 0, 336, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK: [[SI_SPILL_V128_RESTORE17:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.12, align 4, addrspace 5) - ; CHECK: undef %134.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE17]].sub0 { - ; CHECK: internal %134.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE17]].sub2 + ; CHECK: %137.sub1:vreg_128 = COPY %43.sub1 + ; CHECK: %137.sub3:vreg_128 = COPY %43.sub1 + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %137, %2, 0, 336, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK: [[SI_SPILL_V128_RESTORE16:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE 
%stack.10, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5) + ; CHECK: undef %126.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE16]].sub0 { + ; CHECK: internal %126.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE16]].sub2 ; CHECK: } - ; CHECK: %134.sub1:vreg_128 = COPY %43.sub1 - ; CHECK: %134.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %134, %2, 0, 288, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) - ; CHECK: [[SI_SPILL_V128_RESTORE18:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.11, align 4, addrspace 5) - ; CHECK: undef %129.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE18]].sub0 { - ; CHECK: internal %129.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE18]].sub2 + ; CHECK: %126.sub1:vreg_128 = COPY %43.sub1 + ; CHECK: %126.sub3:vreg_128 = COPY %43.sub1 + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %126, %2, 0, 288, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + ; CHECK: undef %122.sub0:vreg_128 = COPY %124.sub0 { + ; CHECK: internal %122.sub2:vreg_128 = COPY %124.sub2 ; CHECK: } - ; CHECK: %129.sub1:vreg_128 = COPY %43.sub1 - ; CHECK: %129.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %129, %2, 0, 304, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK: undef %125.sub0:vreg_128 = COPY %127.sub0 { - ; CHECK: internal %125.sub2:vreg_128 = COPY %127.sub2 + ; CHECK: %122.sub1:vreg_128 = COPY %43.sub1 + ; CHECK: %122.sub3:vreg_128 = COPY %43.sub1 + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %122, %2, 0, 304, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK: undef %118.sub0:vreg_128 = COPY %120.sub0 { + ; CHECK: internal %118.sub2:vreg_128 = COPY %120.sub2 ; CHECK: } - ; CHECK: %125.sub1:vreg_128 = COPY %43.sub1 - ; CHECK: %125.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %125, %2, 0, 256, 0, 0, 0, implicit $exec :: (store (s128), align 256, addrspace 1) - ; CHECK: undef %121.sub0:vreg_128 = COPY %123.sub0 { - ; CHECK: internal %121.sub2:vreg_128 = COPY %123.sub2 + ; CHECK: %118.sub1:vreg_128 = COPY %43.sub1 + ; CHECK: %118.sub3:vreg_128 = COPY %43.sub1 + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %118, %2, 0, 256, 0, 0, 0, implicit $exec :: (store (s128), align 256, addrspace 1) + ; CHECK: undef %114.sub0:vreg_128 = COPY %116.sub0 { + ; CHECK: internal %114.sub2:vreg_128 = COPY %116.sub2 ; CHECK: } - ; CHECK: %121.sub1:vreg_128 = COPY %43.sub1 - ; CHECK: %121.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %121, %2, 0, 272, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK: [[SI_SPILL_V128_RESTORE19:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5) - ; CHECK: undef %116.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE19]].sub0 { - ; CHECK: internal %116.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE19]].sub2 + ; CHECK: %114.sub1:vreg_128 = COPY %43.sub1 + ; CHECK: %114.sub3:vreg_128 = COPY %43.sub1 + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %114, %2, 0, 272, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK: undef %110.sub0:vreg_128 = COPY %112.sub0 { + ; CHECK: internal %110.sub2:vreg_128 = COPY %112.sub2 ; CHECK: } - ; CHECK: %116.sub1:vreg_128 = COPY %43.sub1 - ; CHECK: %116.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %116, %2, 0, 224, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) - ; CHECK: undef %112.sub0:vreg_128 = 
COPY %114.sub0 { - ; CHECK: internal %112.sub2:vreg_128 = COPY %114.sub2 + ; CHECK: %110.sub1:vreg_128 = COPY %43.sub1 + ; CHECK: %110.sub3:vreg_128 = COPY %43.sub1 + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %110, %2, 0, 224, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + ; CHECK: [[SI_SPILL_V128_RESTORE17:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.9, align 4, addrspace 5) + ; CHECK: undef %105.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE17]].sub0 { + ; CHECK: internal %105.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE17]].sub2 ; CHECK: } - ; CHECK: %112.sub1:vreg_128 = COPY %43.sub1 - ; CHECK: %112.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %112, %2, 0, 240, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK: undef %108.sub0:vreg_128 = COPY %110.sub0 { - ; CHECK: internal %108.sub2:vreg_128 = COPY %110.sub2 + ; CHECK: %105.sub1:vreg_128 = COPY %43.sub1 + ; CHECK: %105.sub3:vreg_128 = COPY %43.sub1 + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %105, %2, 0, 240, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK: undef %101.sub0:vreg_128 = COPY %103.sub0 { + ; CHECK: internal %101.sub2:vreg_128 = COPY %103.sub2 ; CHECK: } - ; CHECK: %108.sub1:vreg_128 = COPY %43.sub1 - ; CHECK: %108.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %108, %2, 0, 192, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1) - ; CHECK: undef %104.sub0:vreg_128 = COPY %106.sub0 { - ; CHECK: internal %104.sub2:vreg_128 = COPY %106.sub2 - ; CHECK: } - ; CHECK: %104.sub1:vreg_128 = COPY %43.sub1 - ; CHECK: %104.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %104, %2, 0, 208, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK: [[SI_SPILL_V128_RESTORE20:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.9, align 4, addrspace 5) - ; CHECK: undef %99.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE20]].sub0 { - ; CHECK: internal %99.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE20]].sub2 + ; CHECK: %101.sub1:vreg_128 = COPY %43.sub1 + ; CHECK: %101.sub3:vreg_128 = COPY %43.sub1 + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %101, %2, 0, 192, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1) + ; CHECK: [[SI_SPILL_V128_RESTORE18:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.14, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.14, align 4, addrspace 5) + ; CHECK: undef %99.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE18]].sub0 { + ; CHECK: internal %99.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE18]].sub2 ; CHECK: } ; CHECK: %99.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %99.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %99, %2, 0, 160, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) - ; CHECK: [[SI_SPILL_V128_RESTORE21:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.8, align 4, addrspace 5) - ; CHECK: undef %94.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE21]].sub0 { - ; CHECK: internal %94.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE21]].sub2 + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %99, %2, 0, 208, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK: %19.sub1:vreg_128 = COPY %43.sub1 + ; CHECK: %19.sub3:vreg_128 = COPY %43.sub1 + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %19, %2, 0, 160, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + ; CHECK: 
[[SI_SPILL_V128_RESTORE19:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.8, align 4, addrspace 5) + ; CHECK: undef %94.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE19]].sub0 { + ; CHECK: internal %94.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE19]].sub2 ; CHECK: } ; CHECK: %94.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %94.sub3:vreg_128 = COPY %43.sub1 ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %94, %2, 0, 176, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK: [[SI_SPILL_V128_RESTORE22:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.7, align 4, addrspace 5) - ; CHECK: undef %89.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE22]].sub0 { - ; CHECK: internal %89.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE22]].sub2 + ; CHECK: [[SI_SPILL_V128_RESTORE20:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.12, align 4, addrspace 5) + ; CHECK: undef %90.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE20]].sub0 { + ; CHECK: internal %90.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE20]].sub2 ; CHECK: } - ; CHECK: %89.sub1:vreg_128 = COPY %43.sub1 - ; CHECK: %89.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %89, %2, 0, 128, 0, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) - ; CHECK: undef %85.sub0:vreg_128 = COPY %87.sub0 { - ; CHECK: internal %85.sub2:vreg_128 = COPY %87.sub2 + ; CHECK: %90.sub1:vreg_128 = COPY %43.sub1 + ; CHECK: %90.sub3:vreg_128 = COPY %43.sub1 + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %90, %2, 0, 128, 0, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) + ; CHECK: [[SI_SPILL_V128_RESTORE21:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.11, align 4, addrspace 5) + ; CHECK: undef %86.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE21]].sub0 { + ; CHECK: internal %86.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE21]].sub2 ; CHECK: } - ; CHECK: %85.sub1:vreg_128 = COPY %43.sub1 - ; CHECK: %85.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %85, %2, 0, 144, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK: %86.sub1:vreg_128 = COPY %43.sub1 + ; CHECK: %86.sub3:vreg_128 = COPY %43.sub1 + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %86, %2, 0, 144, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK: [[SI_SPILL_V128_RESTORE22:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.7, align 4, addrspace 5) + ; CHECK: undef %81.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE22]].sub0 { + ; CHECK: internal %81.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE22]].sub2 + ; CHECK: } + ; CHECK: %81.sub1:vreg_128 = COPY %43.sub1 + ; CHECK: %81.sub3:vreg_128 = COPY %43.sub1 + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %81, %2, 0, 96, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) ; CHECK: [[SI_SPILL_V128_RESTORE23:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.6, align 4, addrspace 5) - ; CHECK: undef %80.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE23]].sub0 { - ; CHECK: internal %80.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE23]].sub2 + ; CHECK: undef %76.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE23]].sub0 { + ; CHECK: internal %76.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE23]].sub2 ; CHECK: } - ; CHECK: %80.sub1:vreg_128 = COPY %43.sub1 - ; CHECK: 
%80.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %80, %2, 0, 96, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + ; CHECK: %76.sub1:vreg_128 = COPY %43.sub1 + ; CHECK: %76.sub3:vreg_128 = COPY %43.sub1 + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %76, %2, 0, 112, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK: [[SI_SPILL_V128_RESTORE24:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5) - ; CHECK: undef %75.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE24]].sub0 { - ; CHECK: internal %75.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE24]].sub2 + ; CHECK: undef %71.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE24]].sub0 { + ; CHECK: internal %71.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE24]].sub2 ; CHECK: } - ; CHECK: %75.sub1:vreg_128 = COPY %43.sub1 - ; CHECK: %75.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %75, %2, 0, 112, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK: %71.sub1:vreg_128 = COPY %43.sub1 + ; CHECK: %71.sub3:vreg_128 = COPY %43.sub1 + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %71, %2, 0, 64, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1) ; CHECK: [[SI_SPILL_V128_RESTORE25:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) - ; CHECK: undef %70.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE25]].sub0 { - ; CHECK: internal %70.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE25]].sub2 - ; CHECK: } - ; CHECK: %70.sub1:vreg_128 = COPY %43.sub1 - ; CHECK: %70.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %70, %2, 0, 64, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1) - ; CHECK: undef %66.sub0:vreg_128 = COPY %68.sub0 { - ; CHECK: internal %66.sub2:vreg_128 = COPY %68.sub2 + ; CHECK: undef %66.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE25]].sub0 { + ; CHECK: internal %66.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE25]].sub2 ; CHECK: } ; CHECK: %66.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %66.sub3:vreg_128 = COPY %43.sub1 Index: llvm/test/CodeGen/AMDGPU/srl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/srl.ll +++ llvm/test/CodeGen/AMDGPU/srl.ll @@ -295,30 +295,30 @@ ; ; VI-LABEL: lshr_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s19, 0xf000 -; VI-NEXT: s_mov_b32 s18, -1 +; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s16, s8 -; VI-NEXT: s_mov_b32 s17, s9 -; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; VI-NEXT: s_load_dwordx8 s[8:15], s[10:11], 0x20 +; VI-NEXT: s_mov_b32 s0, s12 +; VI-NEXT: s_mov_b32 s1, s13 +; VI-NEXT: s_load_dwordx8 s[4:11], s[14:15], 0x0 +; VI-NEXT: s_load_dwordx8 s[12:19], s[14:15], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 +; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s16 ; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 -; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 -; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s8 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; VI-NEXT: s_nop 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, 
s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 -; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: lshr_v4i64: Index: llvm/test/CodeGen/ARM/fptosi-sat-scalar.ll =================================================================== --- llvm/test/CodeGen/ARM/fptosi-sat-scalar.ll +++ llvm/test/CodeGen/ARM/fptosi-sat-scalar.ll @@ -1668,42 +1668,42 @@ ; VFP2-NEXT: mov r4, r1 ; VFP2-NEXT: mov r5, r0 ; VFP2-NEXT: bl __fixdfti -; VFP2-NEXT: vldr d0, .LCPI18_0 -; VFP2-NEXT: vmov d1, r5, r4 -; VFP2-NEXT: vldr d2, .LCPI18_1 -; VFP2-NEXT: vcmp.f64 d1, d0 +; VFP2-NEXT: vldr d2, .LCPI18_0 +; VFP2-NEXT: vmov d0, r5, r4 +; VFP2-NEXT: vldr d1, .LCPI18_1 +; VFP2-NEXT: vcmp.f64 d0, d2 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d2 +; VFP2-NEXT: vcmp.f64 d0, d1 ; VFP2-NEXT: movlt r0, #0 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d1 +; VFP2-NEXT: vcmp.f64 d0, d0 ; VFP2-NEXT: mvngt r0, #0 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d0 +; VFP2-NEXT: vcmp.f64 d0, d2 ; VFP2-NEXT: movvs r0, #0 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d2 +; VFP2-NEXT: vcmp.f64 d0, d1 ; VFP2-NEXT: movlt r1, #0 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d1 +; VFP2-NEXT: vcmp.f64 d0, d0 ; VFP2-NEXT: mvngt r1, #0 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d0 +; VFP2-NEXT: vcmp.f64 d0, d2 ; VFP2-NEXT: movvs r1, #0 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d2 +; VFP2-NEXT: vcmp.f64 d0, d1 ; VFP2-NEXT: movlt r2, #0 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d1 +; VFP2-NEXT: vcmp.f64 d0, d0 ; VFP2-NEXT: mvngt r2, #0 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d0 +; VFP2-NEXT: vcmp.f64 d0, d2 ; VFP2-NEXT: movvs r2, #0 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d2 +; VFP2-NEXT: vcmp.f64 d0, d1 ; VFP2-NEXT: mvnlt r3, #7 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d1 +; VFP2-NEXT: vcmp.f64 d0, d0 ; VFP2-NEXT: movgt r3, #7 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr ; VFP2-NEXT: movvs r3, #0 @@ -1848,42 +1848,42 @@ ; VFP2-NEXT: mov r4, r1 ; VFP2-NEXT: mov r5, r0 ; VFP2-NEXT: bl __fixdfti -; VFP2-NEXT: vldr d0, .LCPI19_0 -; VFP2-NEXT: vmov d1, r5, r4 -; VFP2-NEXT: vldr d2, .LCPI19_1 -; VFP2-NEXT: vcmp.f64 d1, d0 +; VFP2-NEXT: vldr d2, .LCPI19_0 +; VFP2-NEXT: vmov d0, r5, r4 +; VFP2-NEXT: vldr d1, .LCPI19_1 +; VFP2-NEXT: vcmp.f64 d0, d2 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d2 +; VFP2-NEXT: vcmp.f64 d0, d1 ; VFP2-NEXT: movlt r0, #0 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d1 +; VFP2-NEXT: vcmp.f64 d0, d0 ; VFP2-NEXT: mvngt r0, #0 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d0 +; VFP2-NEXT: vcmp.f64 d0, d2 ; VFP2-NEXT: movvs r0, #0 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d2 +; VFP2-NEXT: vcmp.f64 d0, d1 ; VFP2-NEXT: movlt r1, #0 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d1 +; VFP2-NEXT: vcmp.f64 d0, d0 ; VFP2-NEXT: mvngt r1, #0 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d0 +; VFP2-NEXT: vcmp.f64 d0, d2 ; VFP2-NEXT: movvs r1, #0 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d2 +; 
VFP2-NEXT: vcmp.f64 d0, d1 ; VFP2-NEXT: movlt r2, #0 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d1 +; VFP2-NEXT: vcmp.f64 d0, d0 ; VFP2-NEXT: mvngt r2, #0 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d0 +; VFP2-NEXT: vcmp.f64 d0, d2 ; VFP2-NEXT: movvs r2, #0 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d2 +; VFP2-NEXT: vcmp.f64 d0, d1 ; VFP2-NEXT: movlt r3, #-2147483648 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr -; VFP2-NEXT: vcmp.f64 d1, d1 +; VFP2-NEXT: vcmp.f64 d0, d0 ; VFP2-NEXT: mvngt r3, #-2147483648 ; VFP2-NEXT: vmrs APSR_nzcv, fpscr ; VFP2-NEXT: movvs r3, #0 Index: llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll =================================================================== --- llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll +++ llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll @@ -370,40 +370,40 @@ ; ARM7: @ %bb.0: ; ARM7-NEXT: push {r4, r5, r6, r7, r11, lr} ; ARM7-NEXT: vpush {d8, d9} -; ARM7-NEXT: mov r5, r0 +; ARM7-NEXT: mov r6, r0 ; ARM7-NEXT: and r0, r3, #1 -; ARM7-NEXT: mov r4, r1 +; ARM7-NEXT: mov r5, r1 ; ARM7-NEXT: rsb r1, r0, #0 ; ARM7-NEXT: mov r0, r2 ; ARM7-NEXT: mov r2, #9 ; ARM7-NEXT: mov r3, #0 ; ARM7-NEXT: bl __moddi3 -; ARM7-NEXT: mov r6, r0 -; ARM7-NEXT: and r0, r4, #1 -; ARM7-NEXT: mov r7, r1 +; ARM7-NEXT: mov r7, r0 +; ARM7-NEXT: and r0, r5, #1 +; ARM7-NEXT: mov r4, r1 ; ARM7-NEXT: rsb r1, r0, #0 -; ARM7-NEXT: mov r0, r5 +; ARM7-NEXT: mov r0, r6 ; ARM7-NEXT: mov r2, #9 ; ARM7-NEXT: mov r3, #0 ; ARM7-NEXT: bl __moddi3 ; ARM7-NEXT: vmov.32 d8[0], r0 ; ARM7-NEXT: ldr r0, [sp, #44] ; ARM7-NEXT: ldr r2, [sp, #40] -; ARM7-NEXT: mov r4, r1 +; ARM7-NEXT: mov r5, r1 ; ARM7-NEXT: and r0, r0, #1 ; ARM7-NEXT: mvn r3, #0 ; ARM7-NEXT: rsb r1, r0, #0 -; ARM7-NEXT: vmov.32 d9[0], r6 +; ARM7-NEXT: vmov.32 d9[0], r7 ; ARM7-NEXT: mov r0, r2 ; ARM7-NEXT: mvn r2, #8 ; ARM7-NEXT: bl __moddi3 ; ARM7-NEXT: vmov.32 d16[0], r0 ; ARM7-NEXT: adr r0, .LCPI3_0 -; ARM7-NEXT: vmov.32 d9[1], r7 +; ARM7-NEXT: vmov.32 d9[1], r4 ; ARM7-NEXT: vld1.64 {d18, d19}, [r0:128] ; ARM7-NEXT: adr r0, .LCPI3_1 ; ARM7-NEXT: vmov.32 d16[1], r1 -; ARM7-NEXT: vmov.32 d8[1], r4 +; ARM7-NEXT: vmov.32 d8[1], r5 ; ARM7-NEXT: vand q8, q8, q9 ; ARM7-NEXT: vld1.64 {d20, d21}, [r0:128] ; ARM7-NEXT: adr r0, .LCPI3_2 @@ -446,40 +446,40 @@ ; ARM8: @ %bb.0: ; ARM8-NEXT: push {r4, r5, r6, r7, r11, lr} ; ARM8-NEXT: vpush {d8, d9} -; ARM8-NEXT: mov r5, r0 +; ARM8-NEXT: mov r6, r0 ; ARM8-NEXT: and r0, r3, #1 -; ARM8-NEXT: mov r4, r1 +; ARM8-NEXT: mov r5, r1 ; ARM8-NEXT: rsb r1, r0, #0 ; ARM8-NEXT: mov r0, r2 ; ARM8-NEXT: mov r2, #9 ; ARM8-NEXT: mov r3, #0 ; ARM8-NEXT: bl __moddi3 -; ARM8-NEXT: mov r6, r0 -; ARM8-NEXT: and r0, r4, #1 -; ARM8-NEXT: mov r7, r1 +; ARM8-NEXT: mov r7, r0 +; ARM8-NEXT: and r0, r5, #1 +; ARM8-NEXT: mov r4, r1 ; ARM8-NEXT: rsb r1, r0, #0 -; ARM8-NEXT: mov r0, r5 +; ARM8-NEXT: mov r0, r6 ; ARM8-NEXT: mov r2, #9 ; ARM8-NEXT: mov r3, #0 ; ARM8-NEXT: bl __moddi3 ; ARM8-NEXT: vmov.32 d8[0], r0 ; ARM8-NEXT: ldr r0, [sp, #44] ; ARM8-NEXT: ldr r2, [sp, #40] -; ARM8-NEXT: mov r4, r1 +; ARM8-NEXT: mov r5, r1 ; ARM8-NEXT: and r0, r0, #1 ; ARM8-NEXT: mvn r3, #0 ; ARM8-NEXT: rsb r1, r0, #0 -; ARM8-NEXT: vmov.32 d9[0], r6 +; ARM8-NEXT: vmov.32 d9[0], r7 ; ARM8-NEXT: mov r0, r2 ; ARM8-NEXT: mvn r2, #8 ; ARM8-NEXT: bl __moddi3 ; ARM8-NEXT: vmov.32 d16[0], r0 ; ARM8-NEXT: adr r0, .LCPI3_0 -; ARM8-NEXT: vmov.32 d9[1], r7 +; ARM8-NEXT: vmov.32 d9[1], r4 ; ARM8-NEXT: vld1.64 {d18, d19}, [r0:128] ; ARM8-NEXT: adr r0, .LCPI3_1 ; ARM8-NEXT: vmov.32 d16[1], r1 -; 
ARM8-NEXT: vmov.32 d8[1], r4 +; ARM8-NEXT: vmov.32 d8[1], r5 ; ARM8-NEXT: vand q8, q8, q9 ; ARM8-NEXT: vld1.64 {d20, d21}, [r0:128] ; ARM8-NEXT: adr r0, .LCPI3_2 @@ -522,40 +522,40 @@ ; NEON7: @ %bb.0: ; NEON7-NEXT: push {r4, r5, r6, r7, r11, lr} ; NEON7-NEXT: vpush {d8, d9} -; NEON7-NEXT: mov r5, r0 +; NEON7-NEXT: mov r6, r0 ; NEON7-NEXT: and r0, r3, #1 -; NEON7-NEXT: mov r4, r1 +; NEON7-NEXT: mov r5, r1 ; NEON7-NEXT: rsb r1, r0, #0 ; NEON7-NEXT: mov r0, r2 ; NEON7-NEXT: mov r2, #9 ; NEON7-NEXT: mov r3, #0 ; NEON7-NEXT: bl __moddi3 -; NEON7-NEXT: mov r6, r0 -; NEON7-NEXT: and r0, r4, #1 -; NEON7-NEXT: mov r7, r1 +; NEON7-NEXT: mov r7, r0 +; NEON7-NEXT: and r0, r5, #1 +; NEON7-NEXT: mov r4, r1 ; NEON7-NEXT: rsb r1, r0, #0 -; NEON7-NEXT: mov r0, r5 +; NEON7-NEXT: mov r0, r6 ; NEON7-NEXT: mov r2, #9 ; NEON7-NEXT: mov r3, #0 ; NEON7-NEXT: bl __moddi3 ; NEON7-NEXT: vmov.32 d8[0], r0 ; NEON7-NEXT: ldr r0, [sp, #44] ; NEON7-NEXT: ldr r2, [sp, #40] -; NEON7-NEXT: mov r4, r1 +; NEON7-NEXT: mov r5, r1 ; NEON7-NEXT: and r0, r0, #1 ; NEON7-NEXT: mvn r3, #0 ; NEON7-NEXT: rsb r1, r0, #0 -; NEON7-NEXT: vmov.32 d9[0], r6 +; NEON7-NEXT: vmov.32 d9[0], r7 ; NEON7-NEXT: mov r0, r2 ; NEON7-NEXT: mvn r2, #8 ; NEON7-NEXT: bl __moddi3 ; NEON7-NEXT: vmov.32 d16[0], r0 ; NEON7-NEXT: adr r0, .LCPI3_0 -; NEON7-NEXT: vmov.32 d9[1], r7 +; NEON7-NEXT: vmov.32 d9[1], r4 ; NEON7-NEXT: vld1.64 {d18, d19}, [r0:128] ; NEON7-NEXT: adr r0, .LCPI3_1 ; NEON7-NEXT: vmov.32 d16[1], r1 -; NEON7-NEXT: vmov.32 d8[1], r4 +; NEON7-NEXT: vmov.32 d8[1], r5 ; NEON7-NEXT: vand q8, q8, q9 ; NEON7-NEXT: vld1.64 {d20, d21}, [r0:128] ; NEON7-NEXT: adr r0, .LCPI3_2 @@ -598,40 +598,40 @@ ; NEON8: @ %bb.0: ; NEON8-NEXT: push {r4, r5, r6, r7, r11, lr} ; NEON8-NEXT: vpush {d8, d9} -; NEON8-NEXT: mov r5, r0 +; NEON8-NEXT: mov r6, r0 ; NEON8-NEXT: and r0, r3, #1 -; NEON8-NEXT: mov r4, r1 +; NEON8-NEXT: mov r5, r1 ; NEON8-NEXT: rsb r1, r0, #0 ; NEON8-NEXT: mov r0, r2 ; NEON8-NEXT: mov r2, #9 ; NEON8-NEXT: mov r3, #0 ; NEON8-NEXT: bl __moddi3 -; NEON8-NEXT: mov r6, r0 -; NEON8-NEXT: and r0, r4, #1 -; NEON8-NEXT: mov r7, r1 +; NEON8-NEXT: mov r7, r0 +; NEON8-NEXT: and r0, r5, #1 +; NEON8-NEXT: mov r4, r1 ; NEON8-NEXT: rsb r1, r0, #0 -; NEON8-NEXT: mov r0, r5 +; NEON8-NEXT: mov r0, r6 ; NEON8-NEXT: mov r2, #9 ; NEON8-NEXT: mov r3, #0 ; NEON8-NEXT: bl __moddi3 ; NEON8-NEXT: vmov.32 d8[0], r0 ; NEON8-NEXT: ldr r0, [sp, #44] ; NEON8-NEXT: ldr r2, [sp, #40] -; NEON8-NEXT: mov r4, r1 +; NEON8-NEXT: mov r5, r1 ; NEON8-NEXT: and r0, r0, #1 ; NEON8-NEXT: mvn r3, #0 ; NEON8-NEXT: rsb r1, r0, #0 -; NEON8-NEXT: vmov.32 d9[0], r6 +; NEON8-NEXT: vmov.32 d9[0], r7 ; NEON8-NEXT: mov r0, r2 ; NEON8-NEXT: mvn r2, #8 ; NEON8-NEXT: bl __moddi3 ; NEON8-NEXT: vmov.32 d16[0], r0 ; NEON8-NEXT: adr r0, .LCPI3_0 -; NEON8-NEXT: vmov.32 d9[1], r7 +; NEON8-NEXT: vmov.32 d9[1], r4 ; NEON8-NEXT: vld1.64 {d18, d19}, [r0:128] ; NEON8-NEXT: adr r0, .LCPI3_1 ; NEON8-NEXT: vmov.32 d16[1], r1 -; NEON8-NEXT: vmov.32 d8[1], r4 +; NEON8-NEXT: vmov.32 d8[1], r5 ; NEON8-NEXT: vand q8, q8, q9 ; NEON8-NEXT: vld1.64 {d20, d21}, [r0:128] ; NEON8-NEXT: adr r0, .LCPI3_2 Index: llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll =================================================================== --- llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll +++ llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll @@ -9,7 +9,7 @@ ; ARMV6-NEXT: sub sp, sp, #28 ; ARMV6-NEXT: ldr r7, [sp, #72] ; ARMV6-NEXT: mov r6, r0 -; ARMV6-NEXT: str r0, [sp, #8] @ 4-byte Spill +; ARMV6-NEXT: str r0, [sp, #8] 
@ 4-byte Spill ; ARMV6-NEXT: ldr r4, [sp, #84] ; ARMV6-NEXT: umull r1, r0, r2, r7 ; ARMV6-NEXT: mov lr, r7 @@ -17,16 +17,16 @@ ; ARMV6-NEXT: str r1, [r6] ; ARMV6-NEXT: ldr r6, [sp, #80] ; ARMV6-NEXT: umull r1, r7, r3, r6 -; ARMV6-NEXT: str r7, [sp, #12] @ 4-byte Spill +; ARMV6-NEXT: str r7, [sp, #12] @ 4-byte Spill ; ARMV6-NEXT: add r1, r5, r1 ; ARMV6-NEXT: umull r7, r5, r6, r2 ; ARMV6-NEXT: mov r6, lr -; ARMV6-NEXT: str r7, [sp, #16] @ 4-byte Spill +; ARMV6-NEXT: str r7, [sp, #16] @ 4-byte Spill ; ARMV6-NEXT: mov r7, #0 ; ARMV6-NEXT: adds r1, r5, r1 -; ARMV6-NEXT: str r1, [sp, #4] @ 4-byte Spill +; ARMV6-NEXT: str r1, [sp, #4] @ 4-byte Spill ; ARMV6-NEXT: adc r1, r7, #0 -; ARMV6-NEXT: str r1, [sp, #24] @ 4-byte Spill +; ARMV6-NEXT: str r1, [sp, #24] @ 4-byte Spill ; ARMV6-NEXT: ldr r1, [sp, #64] ; ARMV6-NEXT: ldr r7, [sp, #76] ; ARMV6-NEXT: ldr r5, [sp, #64] @@ -40,15 +40,15 @@ ; ARMV6-NEXT: adds r12, lr, r12 ; ARMV6-NEXT: umull r2, lr, r2, r7 ; ARMV6-NEXT: adc r6, r6, #0 -; ARMV6-NEXT: str r6, [sp, #20] @ 4-byte Spill -; ARMV6-NEXT: ldr r6, [sp, #16] @ 4-byte Reload +; ARMV6-NEXT: str r6, [sp, #20] @ 4-byte Spill +; ARMV6-NEXT: ldr r6, [sp, #16] @ 4-byte Reload ; ARMV6-NEXT: adds r11, r11, r6 -; ARMV6-NEXT: ldr r6, [sp, #4] @ 4-byte Reload +; ARMV6-NEXT: ldr r6, [sp, #4] @ 4-byte Reload ; ARMV6-NEXT: adc r6, r12, r6 ; ARMV6-NEXT: mov r12, #0 ; ARMV6-NEXT: umlal r0, r12, r3, r5 -; ARMV6-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; ARMV6-NEXT: str r6, [sp, #16] @ 4-byte Spill +; ARMV6-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; ARMV6-NEXT: str r6, [sp, #16] @ 4-byte Spill ; ARMV6-NEXT: ldr r6, [sp, #64] ; ARMV6-NEXT: adds r0, r2, r0 ; ARMV6-NEXT: str r0, [r5, #4] @@ -62,7 +62,7 @@ ; ARMV6-NEXT: orrs r12, r6, r4 ; ARMV6-NEXT: movne r12, #1 ; ARMV6-NEXT: cmp r9, #0 -; ARMV6-NEXT: ldr r6, [sp, #12] @ 4-byte Reload +; ARMV6-NEXT: ldr r6, [sp, #12] @ 4-byte Reload ; ARMV6-NEXT: movne r9, #1 ; ARMV6-NEXT: cmp r8, #0 ; ARMV6-NEXT: movne r8, #1 @@ -81,17 +81,17 @@ ; ARMV6-NEXT: adds r0, r0, r11 ; ARMV6-NEXT: str r0, [r5, #8] ; ARMV6-NEXT: and r1, r1, r7 -; ARMV6-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; ARMV6-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; ARMV6-NEXT: orr r1, r1, r8 ; ARMV6-NEXT: orr r1, r1, r9 ; ARMV6-NEXT: adcs r0, r2, r0 ; ARMV6-NEXT: str r0, [r5, #12] ; ARMV6-NEXT: and r0, r4, r3 -; ARMV6-NEXT: ldr r2, [sp, #24] @ 4-byte Reload +; ARMV6-NEXT: ldr r2, [sp, #24] @ 4-byte Reload ; ARMV6-NEXT: orr r0, r0, r10 ; ARMV6-NEXT: orr r0, r0, r6 ; ARMV6-NEXT: orr r0, r0, r2 -; ARMV6-NEXT: ldr r2, [sp, #20] @ 4-byte Reload +; ARMV6-NEXT: ldr r2, [sp, #20] @ 4-byte Reload ; ARMV6-NEXT: orr r1, r1, r2 ; ARMV6-NEXT: and r2, lr, r12 ; ARMV6-NEXT: orr r1, r2, r1 @@ -115,51 +115,51 @@ ; ARMV7-NEXT: ldr r9, [sp, #76] ; ARMV7-NEXT: umull r4, lr, r5, r1 ; ARMV7-NEXT: umull r0, r7, r2, r10 -; ARMV7-NEXT: str r4, [sp, #24] @ 4-byte Spill +; ARMV7-NEXT: str r4, [sp, #24] @ 4-byte Spill ; ARMV7-NEXT: ldr r4, [sp, #88] ; ARMV7-NEXT: umull r1, r6, r1, r10 -; ARMV7-NEXT: str r0, [sp, #32] @ 4-byte Spill +; ARMV7-NEXT: str r0, [sp, #32] @ 4-byte Spill ; ARMV7-NEXT: umull r11, r0, r2, r5 -; ARMV7-NEXT: str r6, [sp, #20] @ 4-byte Spill -; ARMV7-NEXT: str r1, [sp, #28] @ 4-byte Spill +; ARMV7-NEXT: str r6, [sp, #20] @ 4-byte Spill +; ARMV7-NEXT: str r1, [sp, #28] @ 4-byte Spill ; ARMV7-NEXT: umull r6, r12, r3, r4 ; ARMV7-NEXT: ldr r1, [sp, #92] -; ARMV7-NEXT: str r0, [sp, #8] @ 4-byte Spill +; ARMV7-NEXT: str r0, [sp, #8] @ 4-byte Spill ; ARMV7-NEXT: mov r0, #0 ; ARMV7-NEXT: umlal r7, r0, r3, r10 -; ARMV7-NEXT: str r6, 
[sp, #16] @ 4-byte Spill +; ARMV7-NEXT: str r6, [sp, #16] @ 4-byte Spill ; ARMV7-NEXT: umull r6, r1, r1, r2 ; ARMV7-NEXT: umull r2, r4, r4, r2 -; ARMV7-NEXT: str r6, [sp, #4] @ 4-byte Spill -; ARMV7-NEXT: str r2, [sp, #12] @ 4-byte Spill +; ARMV7-NEXT: str r6, [sp, #4] @ 4-byte Spill +; ARMV7-NEXT: str r2, [sp, #12] @ 4-byte Spill ; ARMV7-NEXT: adds r2, r11, r7 -; ARMV7-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; ARMV7-NEXT: ldr r7, [sp, #8] @ 4-byte Reload ; ARMV7-NEXT: mov r11, #0 -; ARMV7-NEXT: str r4, [sp] @ 4-byte Spill +; ARMV7-NEXT: str r4, [sp] @ 4-byte Spill ; ARMV7-NEXT: umull r6, r4, r9, r10 ; ARMV7-NEXT: adcs r9, r0, r7 -; ARMV7-NEXT: ldr r0, [sp, #32] @ 4-byte Reload +; ARMV7-NEXT: ldr r0, [sp, #32] @ 4-byte Reload ; ARMV7-NEXT: adc r10, r11, #0 ; ARMV7-NEXT: stm r8, {r0, r2} -; ARMV7-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; ARMV7-NEXT: ldr r0, [sp, #24] @ 4-byte Reload ; ARMV7-NEXT: umlal r9, r10, r3, r5 -; ARMV7-NEXT: ldr r2, [sp, #20] @ 4-byte Reload +; ARMV7-NEXT: ldr r2, [sp, #20] @ 4-byte Reload ; ARMV7-NEXT: add r0, r6, r0 -; ARMV7-NEXT: adds r2, r2, r0 -; ARMV7-NEXT: ldr r6, [sp, #4] @ 4-byte Reload -; ARMV7-NEXT: adc r0, r11, #0 -; ARMV7-NEXT: str r0, [sp, #32] @ 4-byte Spill -; ARMV7-NEXT: ldr r0, [sp, #16] @ 4-byte Reload -; ARMV7-NEXT: ldr r7, [sp, #28] @ 4-byte Reload -; ARMV7-NEXT: add r0, r6, r0 -; ARMV7-NEXT: ldr r6, [sp] @ 4-byte Reload -; ARMV7-NEXT: adds r0, r6, r0 -; ARMV7-NEXT: ldr r6, [sp, #12] @ 4-byte Reload +; ARMV7-NEXT: adds r0, r2, r0 +; ARMV7-NEXT: ldr r6, [sp, #4] @ 4-byte Reload +; ARMV7-NEXT: adc r2, r11, #0 +; ARMV7-NEXT: str r2, [sp, #32] @ 4-byte Spill +; ARMV7-NEXT: ldr r2, [sp, #16] @ 4-byte Reload +; ARMV7-NEXT: ldr r7, [sp, #28] @ 4-byte Reload +; ARMV7-NEXT: add r2, r6, r2 +; ARMV7-NEXT: ldr r6, [sp] @ 4-byte Reload +; ARMV7-NEXT: adds r2, r6, r2 +; ARMV7-NEXT: ldr r6, [sp, #12] @ 4-byte Reload ; ARMV7-NEXT: adc r11, r11, #0 ; ARMV7-NEXT: adds r7, r7, r6 ; ARMV7-NEXT: ldr r6, [sp, #92] -; ARMV7-NEXT: adc r0, r2, r0 -; ARMV7-NEXT: str r0, [sp, #28] @ 4-byte Spill +; ARMV7-NEXT: adc r0, r0, r2 +; ARMV7-NEXT: str r0, [sp, #28] @ 4-byte Spill ; ARMV7-NEXT: ldr r0, [sp, #92] ; ARMV7-NEXT: cmp r3, #0 ; ARMV7-NEXT: movwne r3, #1 @@ -195,11 +195,11 @@ ; ARMV7-NEXT: adds r7, r9, r7 ; ARMV7-NEXT: str r7, [r8, #8] ; ARMV7-NEXT: and r2, r2, r3 -; ARMV7-NEXT: ldr r7, [sp, #28] @ 4-byte Reload +; ARMV7-NEXT: ldr r7, [sp, #28] @ 4-byte Reload ; ARMV7-NEXT: orr r0, r0, r11 ; ARMV7-NEXT: adcs r7, r10, r7 ; ARMV7-NEXT: str r7, [r8, #12] -; ARMV7-NEXT: ldr r7, [sp, #32] @ 4-byte Reload +; ARMV7-NEXT: ldr r7, [sp, #32] @ 4-byte Reload ; ARMV7-NEXT: orr r1, r1, r7 ; ARMV7-NEXT: orr r1, r2, r1 ; ARMV7-NEXT: orr r0, r1, r0 Index: llvm/test/CodeGen/Hexagon/reg-scavengebug-2.ll =================================================================== --- llvm/test/CodeGen/Hexagon/reg-scavengebug-2.ll +++ llvm/test/CodeGen/Hexagon/reg-scavengebug-2.ll @@ -1,5 +1,5 @@ ; RUN: llc -O3 -march=hexagon < %s | FileCheck %s -; CHECK: v{{[0-9]+}} = vmem(r{{[0-9]+}}+#0) +; CHECK: v{{[0-9]+}}.cur = vmem(r{{[0-9]+}}+#0) target triple = "hexagon" Index: llvm/test/CodeGen/Mips/cconv/vector.ll =================================================================== --- llvm/test/CodeGen/Mips/cconv/vector.ll +++ llvm/test/CodeGen/Mips/cconv/vector.ll @@ -1027,85 +1027,85 @@ ; MIPS64-NEXT: sll $10, $10, 0 ; MIPS64-NEXT: addu $9, $10, $9 ; MIPS64-NEXT: addu $2, $8, $2 -; MIPS64-NEXT: sll $1, $1, 8 +; MIPS64-NEXT: sll $8, $1, 8 ; MIPS64-NEXT: andi $3, $3, 255 -; MIPS64-NEXT: sll $8, $12, 8 +; 
MIPS64-NEXT: sll $1, $12, 8 ; MIPS64-NEXT: sll $10, $11, 0 ; MIPS64-NEXT: dsrl $11, $5, 32 ; MIPS64-NEXT: sll $11, $11, 0 ; MIPS64-NEXT: addu $10, $11, $10 ; MIPS64-NEXT: andi $10, $10, 255 -; MIPS64-NEXT: or $8, $10, $8 -; MIPS64-NEXT: sll $10, $6, 0 -; MIPS64-NEXT: or $1, $3, $1 +; MIPS64-NEXT: or $10, $10, $1 +; MIPS64-NEXT: sll $1, $6, 0 +; MIPS64-NEXT: or $8, $3, $8 ; MIPS64-NEXT: sll $2, $2, 8 -; MIPS64-NEXT: andi $3, $9, 255 -; MIPS64-NEXT: dsrl $9, $6, 40 -; MIPS64-NEXT: srl $11, $10, 24 +; MIPS64-NEXT: andi $9, $9, 255 +; MIPS64-NEXT: dsrl $11, $6, 40 +; MIPS64-NEXT: srl $3, $1, 24 ; MIPS64-NEXT: sll $12, $4, 0 ; MIPS64-NEXT: srl $13, $12, 24 -; MIPS64-NEXT: srl $14, $10, 16 +; MIPS64-NEXT: srl $14, $1, 16 ; MIPS64-NEXT: srl $15, $12, 16 -; MIPS64-NEXT: andi $8, $8, 65535 +; MIPS64-NEXT: andi $10, $10, 65535 ; MIPS64-NEXT: addu $14, $15, $14 -; MIPS64-NEXT: addu $11, $13, $11 -; MIPS64-NEXT: sll $7, $7, 0 -; MIPS64-NEXT: or $2, $3, $2 -; MIPS64-NEXT: sll $1, $1, 16 -; MIPS64-NEXT: sll $3, $9, 0 +; MIPS64-NEXT: addu $13, $13, $3 +; MIPS64-NEXT: sll $3, $7, 0 +; MIPS64-NEXT: or $2, $9, $2 +; MIPS64-NEXT: sll $7, $8, 16 +; MIPS64-NEXT: sll $8, $11, 0 ; MIPS64-NEXT: dsrl $9, $4, 40 ; MIPS64-NEXT: sll $9, $9, 0 -; MIPS64-NEXT: addu $3, $9, $3 +; MIPS64-NEXT: addu $8, $9, $8 ; MIPS64-NEXT: dsrl $6, $6, 32 -; MIPS64-NEXT: srl $9, $7, 24 +; MIPS64-NEXT: srl $9, $3, 24 ; MIPS64-NEXT: sll $5, $5, 0 -; MIPS64-NEXT: srl $13, $5, 24 -; MIPS64-NEXT: or $1, $8, $1 -; MIPS64-NEXT: addu $8, $13, $9 -; MIPS64-NEXT: sll $9, $11, 8 +; MIPS64-NEXT: srl $11, $5, 24 +; MIPS64-NEXT: or $7, $10, $7 +; MIPS64-NEXT: addu $9, $11, $9 +; MIPS64-NEXT: sll $10, $13, 8 ; MIPS64-NEXT: andi $11, $14, 255 ; MIPS64-NEXT: sll $2, $2, 16 -; MIPS64-NEXT: sll $3, $3, 8 +; MIPS64-NEXT: sll $8, $8, 8 ; MIPS64-NEXT: sll $6, $6, 0 ; MIPS64-NEXT: dsrl $4, $4, 32 ; MIPS64-NEXT: sll $4, $4, 0 ; MIPS64-NEXT: addu $4, $4, $6 ; MIPS64-NEXT: andi $4, $4, 255 -; MIPS64-NEXT: or $3, $4, $3 -; MIPS64-NEXT: andi $3, $3, 65535 -; MIPS64-NEXT: or $2, $3, $2 -; MIPS64-NEXT: or $3, $11, $9 -; MIPS64-NEXT: addu $4, $12, $10 -; MIPS64-NEXT: sll $6, $8, 8 -; MIPS64-NEXT: srl $8, $7, 16 -; MIPS64-NEXT: srl $9, $5, 16 -; MIPS64-NEXT: addu $8, $9, $8 -; MIPS64-NEXT: andi $8, $8, 255 -; MIPS64-NEXT: or $6, $8, $6 -; MIPS64-NEXT: addu $8, $5, $7 +; MIPS64-NEXT: or $4, $4, $8 +; MIPS64-NEXT: andi $4, $4, 65535 +; MIPS64-NEXT: or $2, $4, $2 +; MIPS64-NEXT: or $4, $11, $10 +; MIPS64-NEXT: addu $6, $12, $1 +; MIPS64-NEXT: sll $8, $9, 8 +; MIPS64-NEXT: srl $9, $3, 16 +; MIPS64-NEXT: srl $10, $5, 16 +; MIPS64-NEXT: addu $9, $10, $9 +; MIPS64-NEXT: andi $9, $9, 255 +; MIPS64-NEXT: or $8, $9, $8 +; MIPS64-NEXT: addu $9, $5, $3 ; MIPS64-NEXT: dsll $2, $2, 32 -; MIPS64-NEXT: sll $3, $3, 16 -; MIPS64-NEXT: andi $4, $4, 255 -; MIPS64-NEXT: srl $9, $10, 8 +; MIPS64-NEXT: sll $4, $4, 16 +; MIPS64-NEXT: andi $6, $6, 255 +; MIPS64-NEXT: srl $1, $1, 8 ; MIPS64-NEXT: srl $10, $12, 8 -; MIPS64-NEXT: addu $9, $10, $9 -; MIPS64-NEXT: sll $9, $9, 8 -; MIPS64-NEXT: or $4, $4, $9 -; MIPS64-NEXT: andi $4, $4, 65535 -; MIPS64-NEXT: or $3, $4, $3 -; MIPS64-NEXT: dsll $3, $3, 32 -; MIPS64-NEXT: dsrl $3, $3, 32 -; MIPS64-NEXT: or $2, $3, $2 +; MIPS64-NEXT: addu $1, $10, $1 +; MIPS64-NEXT: sll $1, $1, 8 +; MIPS64-NEXT: or $1, $6, $1 +; MIPS64-NEXT: andi $1, $1, 65535 +; MIPS64-NEXT: or $1, $1, $4 ; MIPS64-NEXT: dsll $1, $1, 32 -; MIPS64-NEXT: sll $3, $6, 16 -; MIPS64-NEXT: andi $4, $8, 255 -; MIPS64-NEXT: srl $6, $7, 8 +; MIPS64-NEXT: dsrl $1, $1, 32 +; MIPS64-NEXT: or $2, 
$1, $2 +; MIPS64-NEXT: dsll $1, $7, 32 +; MIPS64-NEXT: sll $4, $8, 16 +; MIPS64-NEXT: andi $6, $9, 255 +; MIPS64-NEXT: srl $3, $3, 8 ; MIPS64-NEXT: srl $5, $5, 8 -; MIPS64-NEXT: addu $5, $5, $6 -; MIPS64-NEXT: sll $5, $5, 8 -; MIPS64-NEXT: or $4, $4, $5 -; MIPS64-NEXT: andi $4, $4, 65535 -; MIPS64-NEXT: or $3, $4, $3 +; MIPS64-NEXT: addu $3, $5, $3 +; MIPS64-NEXT: sll $3, $3, 8 +; MIPS64-NEXT: or $3, $6, $3 +; MIPS64-NEXT: andi $3, $3, 65535 +; MIPS64-NEXT: or $3, $3, $4 ; MIPS64-NEXT: dsll $3, $3, 32 ; MIPS64-NEXT: dsrl $3, $3, 32 ; MIPS64-NEXT: or $3, $3, $1 Index: llvm/test/CodeGen/PowerPC/ppc-fpclass.ll =================================================================== --- llvm/test/CodeGen/PowerPC/ppc-fpclass.ll +++ llvm/test/CodeGen/PowerPC/ppc-fpclass.ll @@ -252,7 +252,7 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: stwu 1, -48(1) ; CHECK-NEXT: stfd 3, 32(1) -; CHECK-NEXT: lis 3, 32752 +; CHECK-NEXT: lis 4, 32752 ; CHECK-NEXT: lwz 8, 32(1) ; CHECK-NEXT: stfd 4, 40(1) ; CHECK-NEXT: lwz 9, 44(1) @@ -262,33 +262,33 @@ ; CHECK-NEXT: clrlwi. 9, 9, 1 ; CHECK-NEXT: stfd 1, 16(1) ; CHECK-NEXT: cmplwi 5, 9, 0 -; CHECK-NEXT: lwz 5, 20(1) +; CHECK-NEXT: lwz 6, 20(1) ; CHECK-NEXT: crandc 24, 1, 22 ; CHECK-NEXT: stfd 2, 24(1) ; CHECK-NEXT: cmpwi 8, 0 -; CHECK-NEXT: lwz 4, 16(1) -; CHECK-NEXT: cmplw 7, 10, 3 +; CHECK-NEXT: lwz 5, 16(1) +; CHECK-NEXT: cmplw 7, 10, 4 ; CHECK-NEXT: lwz 7, 28(1) ; CHECK-NEXT: xoris 10, 10, 32752 ; CHECK-NEXT: crandc 20, 22, 2 ; CHECK-NEXT: cmplwi 10, 0 -; CHECK-NEXT: lwz 6, 24(1) +; CHECK-NEXT: lwz 3, 24(1) ; CHECK-NEXT: crandc 21, 29, 2 -; CHECK-NEXT: cmplw 7, 5, 3 -; CHECK-NEXT: xoris 3, 5, 32752 +; CHECK-NEXT: cmplw 7, 6, 4 +; CHECK-NEXT: xoris 4, 6, 32752 ; CHECK-NEXT: crandc 22, 2, 6 -; CHECK-NEXT: cmplwi 3, 0 -; CHECK-NEXT: cmpwi 1, 4, 0 +; CHECK-NEXT: cmplwi 4, 0 +; CHECK-NEXT: cmpwi 1, 5, 0 ; CHECK-NEXT: crandc 23, 29, 2 ; CHECK-NEXT: crandc 25, 2, 6 -; CHECK-NEXT: clrlwi. 3, 7, 1 -; CHECK-NEXT: cmplwi 1, 3, 0 +; CHECK-NEXT: clrlwi. 4, 7, 1 +; CHECK-NEXT: cmplwi 1, 4, 0 ; CHECK-NEXT: crandc 26, 1, 6 -; CHECK-NEXT: cmpwi 6, 0 -; CHECK-NEXT: or 4, 8, 9 +; CHECK-NEXT: cmpwi 3, 0 +; CHECK-NEXT: or 5, 8, 9 ; CHECK-NEXT: crandc 27, 6, 2 -; CHECK-NEXT: cmplwi 4, 0 -; CHECK-NEXT: or 3, 6, 3 +; CHECK-NEXT: cmplwi 5, 0 +; CHECK-NEXT: or 3, 3, 4 ; CHECK-NEXT: cror 20, 20, 24 ; CHECK-NEXT: cror 21, 22, 21 ; CHECK-NEXT: cmplwi 1, 3, 0 @@ -407,7 +407,7 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: stwu 1, -48(1) ; CHECK-NEXT: stfd 3, 32(1) -; CHECK-NEXT: lis 3, 32752 +; CHECK-NEXT: lis 4, 32752 ; CHECK-NEXT: lwz 8, 32(1) ; CHECK-NEXT: stfd 4, 40(1) ; CHECK-NEXT: lwz 9, 44(1) @@ -417,33 +417,33 @@ ; CHECK-NEXT: clrlwi. 
9, 9, 1 ; CHECK-NEXT: stfd 1, 16(1) ; CHECK-NEXT: cmplwi 5, 9, 0 -; CHECK-NEXT: lwz 5, 20(1) +; CHECK-NEXT: lwz 6, 20(1) ; CHECK-NEXT: crandc 24, 1, 22 ; CHECK-NEXT: stfd 2, 24(1) ; CHECK-NEXT: cmpwi 8, 0 -; CHECK-NEXT: lwz 4, 16(1) -; CHECK-NEXT: cmplw 7, 10, 3 +; CHECK-NEXT: lwz 5, 16(1) +; CHECK-NEXT: cmplw 7, 10, 4 ; CHECK-NEXT: lwz 7, 28(1) ; CHECK-NEXT: xoris 10, 10, 32752 ; CHECK-NEXT: crandc 20, 22, 2 ; CHECK-NEXT: cmplwi 10, 0 -; CHECK-NEXT: lwz 6, 24(1) +; CHECK-NEXT: lwz 3, 24(1) ; CHECK-NEXT: crandc 21, 29, 2 -; CHECK-NEXT: cmplw 7, 5, 3 -; CHECK-NEXT: xoris 3, 5, 32752 +; CHECK-NEXT: cmplw 7, 6, 4 +; CHECK-NEXT: xoris 4, 6, 32752 ; CHECK-NEXT: crandc 22, 2, 6 -; CHECK-NEXT: cmplwi 3, 0 -; CHECK-NEXT: cmpwi 1, 4, 0 +; CHECK-NEXT: cmplwi 4, 0 +; CHECK-NEXT: cmpwi 1, 5, 0 ; CHECK-NEXT: crandc 23, 29, 2 ; CHECK-NEXT: crandc 25, 2, 6 -; CHECK-NEXT: clrlwi. 3, 7, 1 -; CHECK-NEXT: cmplwi 1, 3, 0 +; CHECK-NEXT: clrlwi. 4, 7, 1 +; CHECK-NEXT: cmplwi 1, 4, 0 ; CHECK-NEXT: crandc 26, 1, 6 -; CHECK-NEXT: cmpwi 6, 0 -; CHECK-NEXT: or 4, 8, 9 +; CHECK-NEXT: cmpwi 3, 0 +; CHECK-NEXT: or 5, 8, 9 ; CHECK-NEXT: crandc 27, 6, 2 -; CHECK-NEXT: cmplwi 4, 0 -; CHECK-NEXT: or 3, 6, 3 +; CHECK-NEXT: cmplwi 5, 0 +; CHECK-NEXT: or 3, 3, 4 ; CHECK-NEXT: cror 20, 20, 24 ; CHECK-NEXT: cror 21, 22, 21 ; CHECK-NEXT: cmplwi 1, 3, 0 Index: llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll +++ llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll @@ -647,23 +647,23 @@ ; ; P8BE-LABEL: combine_srem_sdiv: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, -21386 +; P8BE-NEXT: mfvsrd r5, v2 +; P8BE-NEXT: lis r4, -21386 ; P8BE-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; P8BE-NEXT: addis r30, r2, .LCPI2_0@toc@ha -; P8BE-NEXT: ori r3, r3, 37253 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r6, r4, 48, 48 -; P8BE-NEXT: rldicl r7, r4, 32, 48 -; P8BE-NEXT: extsh r8, r5 +; P8BE-NEXT: ori r4, r4, 37253 +; P8BE-NEXT: clrldi r3, r5, 48 +; P8BE-NEXT: rldicl r6, r5, 48, 48 +; P8BE-NEXT: rldicl r7, r5, 32, 48 +; P8BE-NEXT: extsh r8, r3 ; P8BE-NEXT: extsh r9, r6 ; P8BE-NEXT: extsh r10, r7 -; P8BE-NEXT: mulhw r11, r8, r3 -; P8BE-NEXT: mulhw r12, r9, r3 -; P8BE-NEXT: rldicl r4, r4, 16, 48 -; P8BE-NEXT: mulhw r0, r10, r3 -; P8BE-NEXT: extsh r4, r4 -; P8BE-NEXT: mulhw r3, r4, r3 +; P8BE-NEXT: mulhw r11, r8, r4 +; P8BE-NEXT: mulhw r12, r9, r4 +; P8BE-NEXT: rldicl r5, r5, 16, 48 +; P8BE-NEXT: mulhw r0, r10, r4 +; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: mulhw r4, r5, r4 ; P8BE-NEXT: add r8, r11, r8 ; P8BE-NEXT: add r9, r12, r9 ; P8BE-NEXT: srwi r11, r8, 31 @@ -674,7 +674,7 @@ ; P8BE-NEXT: srawi r12, r9, 6 ; P8BE-NEXT: srwi r9, r9, 31 ; P8BE-NEXT: add r8, r8, r11 -; P8BE-NEXT: add r3, r3, r4 +; P8BE-NEXT: add r4, r4, r5 ; P8BE-NEXT: lxvw4x v2, 0, r0 ; P8BE-NEXT: srawi r11, r10, 6 ; P8BE-NEXT: srwi r10, r10, 31 @@ -682,25 +682,25 @@ ; P8BE-NEXT: mtvsrwz v3, r8 ; P8BE-NEXT: mulli r12, r8, 95 ; P8BE-NEXT: add r10, r11, r10 -; P8BE-NEXT: srwi r11, r3, 31 +; P8BE-NEXT: srwi r11, r4, 31 ; P8BE-NEXT: mtvsrwz v4, r9 -; P8BE-NEXT: srawi r3, r3, 6 +; P8BE-NEXT: srawi r4, r4, 6 ; P8BE-NEXT: mulli r8, r9, 95 ; P8BE-NEXT: mtvsrwz v5, r10 -; P8BE-NEXT: add r3, r3, r11 +; P8BE-NEXT: add r4, r4, r11 ; P8BE-NEXT: mulli r9, r10, 95 ; P8BE-NEXT: vperm v3, v4, v3, v2 -; P8BE-NEXT: mulli r10, r3, 95 -; P8BE-NEXT: sub r5, r5, r12 +; P8BE-NEXT: mulli r10, r4, 95 +; P8BE-NEXT: sub r3, r3, r12 ; P8BE-NEXT: sub r6, r6, r8 -; P8BE-NEXT: 
mtvsrwz v4, r5 +; P8BE-NEXT: mtvsrwz v4, r3 ; P8BE-NEXT: mtvsrwz v0, r6 -; P8BE-NEXT: sub r5, r7, r9 -; P8BE-NEXT: sub r4, r4, r10 -; P8BE-NEXT: mtvsrwz v1, r5 -; P8BE-NEXT: mtvsrwz v6, r4 +; P8BE-NEXT: sub r3, r7, r9 +; P8BE-NEXT: sub r5, r5, r10 +; P8BE-NEXT: mtvsrwz v1, r3 +; P8BE-NEXT: mtvsrwz v6, r5 ; P8BE-NEXT: vperm v4, v0, v4, v2 -; P8BE-NEXT: mtvsrwz v0, r3 +; P8BE-NEXT: mtvsrwz v0, r4 ; P8BE-NEXT: vperm v1, v6, v1, v2 ; P8BE-NEXT: vperm v2, v0, v5, v2 ; P8BE-NEXT: vmrghw v4, v1, v4 Index: llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll +++ llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll @@ -625,21 +625,21 @@ ; ; P8BE-LABEL: combine_urem_udiv: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 22765 -; P8BE-NEXT: ori r3, r3, 8969 -; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r6, r4, 48, 48 -; P8BE-NEXT: clrlwi r8, r5, 16 +; P8BE-NEXT: mfvsrd r5, v2 +; P8BE-NEXT: lis r4, 22765 +; P8BE-NEXT: ori r4, r4, 8969 +; P8BE-NEXT: clrldi r3, r5, 48 +; P8BE-NEXT: rldicl r6, r5, 48, 48 +; P8BE-NEXT: clrlwi r8, r3, 16 ; P8BE-NEXT: clrlwi r9, r6, 16 -; P8BE-NEXT: rldicl r7, r4, 32, 48 -; P8BE-NEXT: rldicl r4, r4, 16, 48 -; P8BE-NEXT: mulhwu r10, r8, r3 -; P8BE-NEXT: mulhwu r12, r9, r3 +; P8BE-NEXT: rldicl r7, r5, 32, 48 +; P8BE-NEXT: rldicl r5, r5, 16, 48 +; P8BE-NEXT: mulhwu r10, r8, r4 +; P8BE-NEXT: mulhwu r12, r9, r4 ; P8BE-NEXT: clrlwi r11, r7, 16 -; P8BE-NEXT: clrlwi r4, r4, 16 -; P8BE-NEXT: mulhwu r0, r11, r3 -; P8BE-NEXT: mulhwu r3, r4, r3 +; P8BE-NEXT: clrlwi r5, r5, 16 +; P8BE-NEXT: mulhwu r0, r11, r4 +; P8BE-NEXT: mulhwu r4, r5, r4 ; P8BE-NEXT: sub r8, r8, r10 ; P8BE-NEXT: sub r9, r9, r12 ; P8BE-NEXT: srwi r8, r8, 1 @@ -647,7 +647,7 @@ ; P8BE-NEXT: sub r11, r11, r0 ; P8BE-NEXT: add r8, r8, r10 ; P8BE-NEXT: add r9, r9, r12 -; P8BE-NEXT: sub r12, r4, r3 +; P8BE-NEXT: sub r12, r5, r4 ; P8BE-NEXT: addis r10, r2, .LCPI2_0@toc@ha ; P8BE-NEXT: srwi r11, r11, 1 ; P8BE-NEXT: srwi r8, r8, 6 @@ -656,27 +656,27 @@ ; P8BE-NEXT: addi r10, r10, .LCPI2_0@toc@l ; P8BE-NEXT: add r11, r11, r0 ; P8BE-NEXT: mulli r0, r8, 95 -; P8BE-NEXT: add r3, r12, r3 +; P8BE-NEXT: add r4, r12, r4 ; P8BE-NEXT: mtvsrwz v3, r8 ; P8BE-NEXT: lxvw4x v2, 0, r10 ; P8BE-NEXT: srwi r10, r11, 6 ; P8BE-NEXT: mulli r8, r9, 95 -; P8BE-NEXT: srwi r3, r3, 6 +; P8BE-NEXT: srwi r4, r4, 6 ; P8BE-NEXT: mtvsrwz v4, r9 ; P8BE-NEXT: mulli r9, r10, 95 ; P8BE-NEXT: mtvsrwz v5, r10 -; P8BE-NEXT: mulli r10, r3, 95 +; P8BE-NEXT: mulli r10, r4, 95 ; P8BE-NEXT: vperm v3, v4, v3, v2 -; P8BE-NEXT: sub r5, r5, r0 +; P8BE-NEXT: sub r3, r3, r0 ; P8BE-NEXT: sub r6, r6, r8 -; P8BE-NEXT: mtvsrwz v4, r5 +; P8BE-NEXT: mtvsrwz v4, r3 ; P8BE-NEXT: mtvsrwz v0, r6 -; P8BE-NEXT: sub r5, r7, r9 -; P8BE-NEXT: sub r4, r4, r10 -; P8BE-NEXT: mtvsrwz v1, r5 -; P8BE-NEXT: mtvsrwz v6, r4 +; P8BE-NEXT: sub r3, r7, r9 +; P8BE-NEXT: sub r5, r5, r10 +; P8BE-NEXT: mtvsrwz v1, r3 +; P8BE-NEXT: mtvsrwz v6, r5 ; P8BE-NEXT: vperm v4, v0, v4, v2 -; P8BE-NEXT: mtvsrwz v0, r3 +; P8BE-NEXT: mtvsrwz v0, r4 ; P8BE-NEXT: vperm v1, v6, v1, v2 ; P8BE-NEXT: vperm v2, v0, v5, v2 ; P8BE-NEXT: vmrghw v4, v1, v4 Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll +++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll @@ -1608,24 +1608,24 @@ ; LMULMAX1-RV64-NEXT: vor.vv v26, v29, v26 ; LMULMAX1-RV64-NEXT: vor.vv v26, v26, 
v28 ; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV64-NEXT: lui a4, 3855 -; LMULMAX1-RV64-NEXT: addiw a4, a4, 241 -; LMULMAX1-RV64-NEXT: slli a4, a4, 12 -; LMULMAX1-RV64-NEXT: addi a4, a4, -241 -; LMULMAX1-RV64-NEXT: slli a4, a4, 12 -; LMULMAX1-RV64-NEXT: addi a4, a4, 241 -; LMULMAX1-RV64-NEXT: slli a4, a4, 12 -; LMULMAX1-RV64-NEXT: addi a4, a4, -241 +; LMULMAX1-RV64-NEXT: lui a2, 3855 +; LMULMAX1-RV64-NEXT: addiw a2, a2, 241 +; LMULMAX1-RV64-NEXT: slli a2, a2, 12 +; LMULMAX1-RV64-NEXT: addi a2, a2, -241 +; LMULMAX1-RV64-NEXT: slli a2, a2, 12 +; LMULMAX1-RV64-NEXT: addi a2, a2, 241 +; LMULMAX1-RV64-NEXT: slli a2, a2, 12 +; LMULMAX1-RV64-NEXT: addi a4, a2, -241 ; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a4 ; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4 -; LMULMAX1-RV64-NEXT: lui a5, 1044721 -; LMULMAX1-RV64-NEXT: addiw a5, a5, -241 -; LMULMAX1-RV64-NEXT: slli a5, a5, 12 -; LMULMAX1-RV64-NEXT: addi a5, a5, 241 -; LMULMAX1-RV64-NEXT: slli a5, a5, 12 -; LMULMAX1-RV64-NEXT: addi a5, a5, -241 -; LMULMAX1-RV64-NEXT: slli a5, a5, 12 -; LMULMAX1-RV64-NEXT: addi a5, a5, 240 +; LMULMAX1-RV64-NEXT: lui a2, 1044721 +; LMULMAX1-RV64-NEXT: addiw a2, a2, -241 +; LMULMAX1-RV64-NEXT: slli a2, a2, 12 +; LMULMAX1-RV64-NEXT: addi a2, a2, 241 +; LMULMAX1-RV64-NEXT: slli a2, a2, 12 +; LMULMAX1-RV64-NEXT: addi a2, a2, -241 +; LMULMAX1-RV64-NEXT: slli a2, a2, 12 +; LMULMAX1-RV64-NEXT: addi a5, a2, 240 ; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a5 ; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 4 ; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll +++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll @@ -2133,16 +2133,16 @@ ; LMULMAX1-RV32-NEXT: sw a1, 44(sp) ; LMULMAX1-RV32-NEXT: addi a7, zero, 32 ; LMULMAX1-RV32-NEXT: vsrl.vx v26, v26, a7 -; LMULMAX1-RV32-NEXT: vmv.x.s a4, v26 -; LMULMAX1-RV32-NEXT: srli a5, a4, 8 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV32-NEXT: srli a5, a1, 8 ; LMULMAX1-RV32-NEXT: and a5, a5, a2 -; LMULMAX1-RV32-NEXT: srli a1, a4, 24 -; LMULMAX1-RV32-NEXT: or a1, a5, a1 -; LMULMAX1-RV32-NEXT: slli a5, a4, 8 +; LMULMAX1-RV32-NEXT: srli a4, a1, 24 +; LMULMAX1-RV32-NEXT: or a4, a5, a4 +; LMULMAX1-RV32-NEXT: slli a5, a1, 8 ; LMULMAX1-RV32-NEXT: and a5, a5, a3 -; LMULMAX1-RV32-NEXT: slli a4, a4, 24 -; LMULMAX1-RV32-NEXT: or a4, a4, a5 -; LMULMAX1-RV32-NEXT: or a1, a4, a1 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a5 +; LMULMAX1-RV32-NEXT: or a1, a1, a4 ; LMULMAX1-RV32-NEXT: sw a1, 32(sp) ; LMULMAX1-RV32-NEXT: vsrl.vx v26, v27, a7 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll +++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll @@ -2114,80 +2114,80 @@ ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV32-NEXT: addi a2, a1, -1 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: and a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a2, a1, 1 -; LMULMAX2-RV32-NEXT: lui a3, 349525 -; LMULMAX2-RV32-NEXT: addi a6, a3, 1365 -; LMULMAX2-RV32-NEXT: and a2, a2, a6 -; LMULMAX2-RV32-NEXT: sub a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a2, a1, a2 +; LMULMAX2-RV32-NEXT: srli a3, a2, 1 +; LMULMAX2-RV32-NEXT: lui a1, 349525 +; LMULMAX2-RV32-NEXT: addi a6, a1, 1365 +; LMULMAX2-RV32-NEXT: and a3, a3, a6 +; LMULMAX2-RV32-NEXT: sub a3, a2, a3 ; 
LMULMAX2-RV32-NEXT: lui a2, 209715 ; LMULMAX2-RV32-NEXT: addi a2, a2, 819 +; LMULMAX2-RV32-NEXT: and a4, a3, a2 +; LMULMAX2-RV32-NEXT: srli a3, a3, 2 +; LMULMAX2-RV32-NEXT: and a3, a3, a2 +; LMULMAX2-RV32-NEXT: add a3, a4, a3 +; LMULMAX2-RV32-NEXT: srli a4, a3, 4 +; LMULMAX2-RV32-NEXT: add a4, a3, a4 +; LMULMAX2-RV32-NEXT: lui a3, 61681 +; LMULMAX2-RV32-NEXT: addi a3, a3, -241 +; LMULMAX2-RV32-NEXT: and a4, a4, a3 +; LMULMAX2-RV32-NEXT: lui a5, 4112 +; LMULMAX2-RV32-NEXT: addi a5, a5, 257 +; LMULMAX2-RV32-NEXT: mul a4, a4, a5 +; LMULMAX2-RV32-NEXT: srli a4, a4, 24 +; LMULMAX2-RV32-NEXT: sw a4, 16(sp) +; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 3 +; LMULMAX2-RV32-NEXT: vmv.x.s a4, v26 +; LMULMAX2-RV32-NEXT: addi a1, a4, -1 +; LMULMAX2-RV32-NEXT: not a4, a4 +; LMULMAX2-RV32-NEXT: and a1, a4, a1 +; LMULMAX2-RV32-NEXT: srli a4, a1, 1 +; LMULMAX2-RV32-NEXT: and a4, a4, a6 +; LMULMAX2-RV32-NEXT: sub a1, a1, a4 ; LMULMAX2-RV32-NEXT: and a4, a1, a2 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a2 ; LMULMAX2-RV32-NEXT: add a1, a4, a1 ; LMULMAX2-RV32-NEXT: srli a4, a1, 4 ; LMULMAX2-RV32-NEXT: add a1, a1, a4 -; LMULMAX2-RV32-NEXT: lui a4, 61681 -; LMULMAX2-RV32-NEXT: addi a4, a4, -241 -; LMULMAX2-RV32-NEXT: and a1, a1, a4 -; LMULMAX2-RV32-NEXT: lui a5, 4112 -; LMULMAX2-RV32-NEXT: addi a5, a5, 257 -; LMULMAX2-RV32-NEXT: mul a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: sw a1, 16(sp) -; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 3 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: addi a3, a1, -1 -; LMULMAX2-RV32-NEXT: not a1, a1 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: srli a3, a1, 1 -; LMULMAX2-RV32-NEXT: and a3, a3, a6 -; LMULMAX2-RV32-NEXT: sub a1, a1, a3 -; LMULMAX2-RV32-NEXT: and a3, a1, a2 -; LMULMAX2-RV32-NEXT: srli a1, a1, 2 -; LMULMAX2-RV32-NEXT: and a1, a1, a2 -; LMULMAX2-RV32-NEXT: add a1, a3, a1 -; LMULMAX2-RV32-NEXT: srli a3, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a3 -; LMULMAX2-RV32-NEXT: and a1, a1, a4 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: sw a1, 28(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: addi a3, a1, -1 +; LMULMAX2-RV32-NEXT: addi a4, a1, -1 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: srli a3, a1, 1 -; LMULMAX2-RV32-NEXT: and a3, a3, a6 -; LMULMAX2-RV32-NEXT: sub a1, a1, a3 -; LMULMAX2-RV32-NEXT: and a3, a1, a2 +; LMULMAX2-RV32-NEXT: and a1, a1, a4 +; LMULMAX2-RV32-NEXT: srli a4, a1, 1 +; LMULMAX2-RV32-NEXT: and a4, a4, a6 +; LMULMAX2-RV32-NEXT: sub a1, a1, a4 +; LMULMAX2-RV32-NEXT: and a4, a1, a2 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a2 -; LMULMAX2-RV32-NEXT: add a1, a3, a1 -; LMULMAX2-RV32-NEXT: srli a3, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a3 -; LMULMAX2-RV32-NEXT: and a1, a1, a4 +; LMULMAX2-RV32-NEXT: add a1, a4, a1 +; LMULMAX2-RV32-NEXT: srli a4, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a4 +; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: sw a1, 24(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 -; LMULMAX2-RV32-NEXT: addi a3, a1, -1 +; LMULMAX2-RV32-NEXT: addi a4, a1, -1 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: and a1, a1, a3 
-; LMULMAX2-RV32-NEXT: srli a3, a1, 1 -; LMULMAX2-RV32-NEXT: and a3, a3, a6 -; LMULMAX2-RV32-NEXT: sub a1, a1, a3 -; LMULMAX2-RV32-NEXT: and a3, a1, a2 +; LMULMAX2-RV32-NEXT: and a1, a1, a4 +; LMULMAX2-RV32-NEXT: srli a4, a1, 1 +; LMULMAX2-RV32-NEXT: and a4, a4, a6 +; LMULMAX2-RV32-NEXT: sub a1, a1, a4 +; LMULMAX2-RV32-NEXT: and a4, a1, a2 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a2 -; LMULMAX2-RV32-NEXT: add a1, a3, a1 +; LMULMAX2-RV32-NEXT: add a1, a4, a1 ; LMULMAX2-RV32-NEXT: srli a2, a1, 4 ; LMULMAX2-RV32-NEXT: add a1, a1, a2 -; LMULMAX2-RV32-NEXT: and a1, a1, a4 +; LMULMAX2-RV32-NEXT: and a1, a1, a3 ; LMULMAX2-RV32-NEXT: mul a1, a1, a5 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: sw a1, 20(sp) @@ -2237,79 +2237,79 @@ ; LMULMAX2-RV64-NEXT: and a4, a4, a3 ; LMULMAX2-RV64-NEXT: add a4, a5, a4 ; LMULMAX2-RV64-NEXT: srli a5, a4, 4 -; LMULMAX2-RV64-NEXT: add a4, a4, a5 -; LMULMAX2-RV64-NEXT: lui a5, 3855 -; LMULMAX2-RV64-NEXT: addiw a5, a5, 241 -; LMULMAX2-RV64-NEXT: slli a5, a5, 12 -; LMULMAX2-RV64-NEXT: addi a5, a5, -241 -; LMULMAX2-RV64-NEXT: slli a5, a5, 12 -; LMULMAX2-RV64-NEXT: addi a5, a5, 241 -; LMULMAX2-RV64-NEXT: slli a5, a5, 12 -; LMULMAX2-RV64-NEXT: addi a5, a5, -241 -; LMULMAX2-RV64-NEXT: and a4, a4, a5 -; LMULMAX2-RV64-NEXT: lui a1, 4112 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 257 -; LMULMAX2-RV64-NEXT: slli a1, a1, 16 -; LMULMAX2-RV64-NEXT: addi a1, a1, 257 -; LMULMAX2-RV64-NEXT: slli a1, a1, 16 -; LMULMAX2-RV64-NEXT: addi a1, a1, 257 -; LMULMAX2-RV64-NEXT: mul a4, a4, a1 -; LMULMAX2-RV64-NEXT: srli a4, a4, 56 -; LMULMAX2-RV64-NEXT: sw a4, 28(sp) +; LMULMAX2-RV64-NEXT: add a5, a4, a5 +; LMULMAX2-RV64-NEXT: lui a4, 3855 +; LMULMAX2-RV64-NEXT: addiw a4, a4, 241 +; LMULMAX2-RV64-NEXT: slli a4, a4, 12 +; LMULMAX2-RV64-NEXT: addi a4, a4, -241 +; LMULMAX2-RV64-NEXT: slli a4, a4, 12 +; LMULMAX2-RV64-NEXT: addi a4, a4, 241 +; LMULMAX2-RV64-NEXT: slli a4, a4, 12 +; LMULMAX2-RV64-NEXT: addi a4, a4, -241 +; LMULMAX2-RV64-NEXT: and a1, a5, a4 +; LMULMAX2-RV64-NEXT: lui a5, 4112 +; LMULMAX2-RV64-NEXT: addiw a5, a5, 257 +; LMULMAX2-RV64-NEXT: slli a5, a5, 16 +; LMULMAX2-RV64-NEXT: addi a5, a5, 257 +; LMULMAX2-RV64-NEXT: slli a5, a5, 16 +; LMULMAX2-RV64-NEXT: addi a5, a5, 257 +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 +; LMULMAX2-RV64-NEXT: srli a1, a1, 56 +; LMULMAX2-RV64-NEXT: sw a1, 28(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 2 -; LMULMAX2-RV64-NEXT: vmv.x.s a4, v26 -; LMULMAX2-RV64-NEXT: or a4, a4, a6 -; LMULMAX2-RV64-NEXT: addi a2, a4, -1 -; LMULMAX2-RV64-NEXT: not a4, a4 -; LMULMAX2-RV64-NEXT: and a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: and a4, a4, a7 -; LMULMAX2-RV64-NEXT: sub a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a4, a2, a3 -; LMULMAX2-RV64-NEXT: srli a2, a2, 2 -; LMULMAX2-RV64-NEXT: and a2, a2, a3 -; LMULMAX2-RV64-NEXT: add a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: add a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a2, a2, a5 -; LMULMAX2-RV64-NEXT: mul a2, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: sw a2, 24(sp) +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV64-NEXT: or a1, a1, a6 +; LMULMAX2-RV64-NEXT: addi a2, a1, -1 +; LMULMAX2-RV64-NEXT: not a1, a1 +; LMULMAX2-RV64-NEXT: and a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: sub a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a2, a1, a3 +; LMULMAX2-RV64-NEXT: srli a1, a1, 2 +; LMULMAX2-RV64-NEXT: and a1, a1, a3 +; LMULMAX2-RV64-NEXT: add a1, a2, a1 +; 
LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: add a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 +; LMULMAX2-RV64-NEXT: srli a1, a1, 56 +; LMULMAX2-RV64-NEXT: sw a1, 24(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 1 -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX2-RV64-NEXT: or a2, a2, a6 -; LMULMAX2-RV64-NEXT: addi a4, a2, -1 -; LMULMAX2-RV64-NEXT: not a2, a2 -; LMULMAX2-RV64-NEXT: and a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: and a4, a4, a7 -; LMULMAX2-RV64-NEXT: sub a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a4, a2, a3 -; LMULMAX2-RV64-NEXT: srli a2, a2, 2 -; LMULMAX2-RV64-NEXT: and a2, a2, a3 -; LMULMAX2-RV64-NEXT: add a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: add a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a2, a2, a5 -; LMULMAX2-RV64-NEXT: mul a2, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: sw a2, 20(sp) -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v25 -; LMULMAX2-RV64-NEXT: or a2, a2, a6 -; LMULMAX2-RV64-NEXT: addi a4, a2, -1 -; LMULMAX2-RV64-NEXT: not a2, a2 -; LMULMAX2-RV64-NEXT: and a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: and a4, a4, a7 -; LMULMAX2-RV64-NEXT: sub a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a4, a2, a3 -; LMULMAX2-RV64-NEXT: srli a2, a2, 2 -; LMULMAX2-RV64-NEXT: and a2, a2, a3 -; LMULMAX2-RV64-NEXT: add a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a3, a2, 4 -; LMULMAX2-RV64-NEXT: add a2, a2, a3 -; LMULMAX2-RV64-NEXT: and a2, a2, a5 -; LMULMAX2-RV64-NEXT: mul a1, a2, a1 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV64-NEXT: or a1, a1, a6 +; LMULMAX2-RV64-NEXT: addi a2, a1, -1 +; LMULMAX2-RV64-NEXT: not a1, a1 +; LMULMAX2-RV64-NEXT: and a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: sub a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a2, a1, a3 +; LMULMAX2-RV64-NEXT: srli a1, a1, 2 +; LMULMAX2-RV64-NEXT: and a1, a1, a3 +; LMULMAX2-RV64-NEXT: add a1, a2, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: add a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 +; LMULMAX2-RV64-NEXT: srli a1, a1, 56 +; LMULMAX2-RV64-NEXT: sw a1, 20(sp) +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX2-RV64-NEXT: or a1, a1, a6 +; LMULMAX2-RV64-NEXT: addi a2, a1, -1 +; LMULMAX2-RV64-NEXT: not a1, a1 +; LMULMAX2-RV64-NEXT: and a1, a1, a2 +; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: sub a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a2, a1, a3 +; LMULMAX2-RV64-NEXT: srli a1, a1, 2 +; LMULMAX2-RV64-NEXT: and a1, a1, a3 +; LMULMAX2-RV64-NEXT: add a1, a2, a1 +; LMULMAX2-RV64-NEXT: srli a2, a1, 4 +; LMULMAX2-RV64-NEXT: add a1, a1, a2 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: sw a1, 16(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu @@ -2328,80 +2328,80 @@ ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: addi a2, a1, -1 ; LMULMAX1-RV32-NEXT: not a1, a1 -; LMULMAX1-RV32-NEXT: and a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 1 -; LMULMAX1-RV32-NEXT: lui a3, 349525 -; LMULMAX1-RV32-NEXT: addi a6, a3, 1365 -; LMULMAX1-RV32-NEXT: and a2, a2, a6 -; LMULMAX1-RV32-NEXT: sub a1, a1, a2 +; LMULMAX1-RV32-NEXT: and a2, a1, a2 +; LMULMAX1-RV32-NEXT: srli a3, a2, 1 +; LMULMAX1-RV32-NEXT: lui a1, 349525 +; LMULMAX1-RV32-NEXT: addi a6, a1, 1365 +; LMULMAX1-RV32-NEXT: and a3, a3, a6 +; LMULMAX1-RV32-NEXT: 
sub a3, a2, a3 ; LMULMAX1-RV32-NEXT: lui a2, 209715 ; LMULMAX1-RV32-NEXT: addi a2, a2, 819 +; LMULMAX1-RV32-NEXT: and a4, a3, a2 +; LMULMAX1-RV32-NEXT: srli a3, a3, 2 +; LMULMAX1-RV32-NEXT: and a3, a3, a2 +; LMULMAX1-RV32-NEXT: add a3, a4, a3 +; LMULMAX1-RV32-NEXT: srli a4, a3, 4 +; LMULMAX1-RV32-NEXT: add a4, a3, a4 +; LMULMAX1-RV32-NEXT: lui a3, 61681 +; LMULMAX1-RV32-NEXT: addi a3, a3, -241 +; LMULMAX1-RV32-NEXT: and a4, a4, a3 +; LMULMAX1-RV32-NEXT: lui a5, 4112 +; LMULMAX1-RV32-NEXT: addi a5, a5, 257 +; LMULMAX1-RV32-NEXT: mul a4, a4, a5 +; LMULMAX1-RV32-NEXT: srli a4, a4, 24 +; LMULMAX1-RV32-NEXT: sw a4, 16(sp) +; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 +; LMULMAX1-RV32-NEXT: vmv.x.s a4, v26 +; LMULMAX1-RV32-NEXT: addi a1, a4, -1 +; LMULMAX1-RV32-NEXT: not a4, a4 +; LMULMAX1-RV32-NEXT: and a1, a4, a1 +; LMULMAX1-RV32-NEXT: srli a4, a1, 1 +; LMULMAX1-RV32-NEXT: and a4, a4, a6 +; LMULMAX1-RV32-NEXT: sub a1, a1, a4 ; LMULMAX1-RV32-NEXT: and a4, a1, a2 ; LMULMAX1-RV32-NEXT: srli a1, a1, 2 ; LMULMAX1-RV32-NEXT: and a1, a1, a2 ; LMULMAX1-RV32-NEXT: add a1, a4, a1 ; LMULMAX1-RV32-NEXT: srli a4, a1, 4 ; LMULMAX1-RV32-NEXT: add a1, a1, a4 -; LMULMAX1-RV32-NEXT: lui a4, 61681 -; LMULMAX1-RV32-NEXT: addi a4, a4, -241 -; LMULMAX1-RV32-NEXT: and a1, a1, a4 -; LMULMAX1-RV32-NEXT: lui a5, 4112 -; LMULMAX1-RV32-NEXT: addi a5, a5, 257 -; LMULMAX1-RV32-NEXT: mul a1, a1, a5 -; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: sw a1, 16(sp) -; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 -; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX1-RV32-NEXT: addi a3, a1, -1 -; LMULMAX1-RV32-NEXT: not a1, a1 ; LMULMAX1-RV32-NEXT: and a1, a1, a3 -; LMULMAX1-RV32-NEXT: srli a3, a1, 1 -; LMULMAX1-RV32-NEXT: and a3, a3, a6 -; LMULMAX1-RV32-NEXT: sub a1, a1, a3 -; LMULMAX1-RV32-NEXT: and a3, a1, a2 -; LMULMAX1-RV32-NEXT: srli a1, a1, 2 -; LMULMAX1-RV32-NEXT: and a1, a1, a2 -; LMULMAX1-RV32-NEXT: add a1, a3, a1 -; LMULMAX1-RV32-NEXT: srli a3, a1, 4 -; LMULMAX1-RV32-NEXT: add a1, a1, a3 -; LMULMAX1-RV32-NEXT: and a1, a1, a4 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: sw a1, 28(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX1-RV32-NEXT: addi a3, a1, -1 +; LMULMAX1-RV32-NEXT: addi a4, a1, -1 ; LMULMAX1-RV32-NEXT: not a1, a1 -; LMULMAX1-RV32-NEXT: and a1, a1, a3 -; LMULMAX1-RV32-NEXT: srli a3, a1, 1 -; LMULMAX1-RV32-NEXT: and a3, a3, a6 -; LMULMAX1-RV32-NEXT: sub a1, a1, a3 -; LMULMAX1-RV32-NEXT: and a3, a1, a2 +; LMULMAX1-RV32-NEXT: and a1, a1, a4 +; LMULMAX1-RV32-NEXT: srli a4, a1, 1 +; LMULMAX1-RV32-NEXT: and a4, a4, a6 +; LMULMAX1-RV32-NEXT: sub a1, a1, a4 +; LMULMAX1-RV32-NEXT: and a4, a1, a2 ; LMULMAX1-RV32-NEXT: srli a1, a1, 2 ; LMULMAX1-RV32-NEXT: and a1, a1, a2 -; LMULMAX1-RV32-NEXT: add a1, a3, a1 -; LMULMAX1-RV32-NEXT: srli a3, a1, 4 -; LMULMAX1-RV32-NEXT: add a1, a1, a3 -; LMULMAX1-RV32-NEXT: and a1, a1, a4 +; LMULMAX1-RV32-NEXT: add a1, a4, a1 +; LMULMAX1-RV32-NEXT: srli a4, a1, 4 +; LMULMAX1-RV32-NEXT: add a1, a1, a4 +; LMULMAX1-RV32-NEXT: and a1, a1, a3 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: sw a1, 24(sp) ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 -; LMULMAX1-RV32-NEXT: addi a3, a1, -1 +; LMULMAX1-RV32-NEXT: addi a4, a1, -1 ; LMULMAX1-RV32-NEXT: not a1, a1 -; LMULMAX1-RV32-NEXT: 
and a1, a1, a3 -; LMULMAX1-RV32-NEXT: srli a3, a1, 1 -; LMULMAX1-RV32-NEXT: and a3, a3, a6 -; LMULMAX1-RV32-NEXT: sub a1, a1, a3 -; LMULMAX1-RV32-NEXT: and a3, a1, a2 +; LMULMAX1-RV32-NEXT: and a1, a1, a4 +; LMULMAX1-RV32-NEXT: srli a4, a1, 1 +; LMULMAX1-RV32-NEXT: and a4, a4, a6 +; LMULMAX1-RV32-NEXT: sub a1, a1, a4 +; LMULMAX1-RV32-NEXT: and a4, a1, a2 ; LMULMAX1-RV32-NEXT: srli a1, a1, 2 ; LMULMAX1-RV32-NEXT: and a1, a1, a2 -; LMULMAX1-RV32-NEXT: add a1, a3, a1 +; LMULMAX1-RV32-NEXT: add a1, a4, a1 ; LMULMAX1-RV32-NEXT: srli a2, a1, 4 ; LMULMAX1-RV32-NEXT: add a1, a1, a2 -; LMULMAX1-RV32-NEXT: and a1, a1, a4 +; LMULMAX1-RV32-NEXT: and a1, a1, a3 ; LMULMAX1-RV32-NEXT: mul a1, a1, a5 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: sw a1, 20(sp) @@ -2451,79 +2451,79 @@ ; LMULMAX1-RV64-NEXT: and a4, a4, a3 ; LMULMAX1-RV64-NEXT: add a4, a5, a4 ; LMULMAX1-RV64-NEXT: srli a5, a4, 4 -; LMULMAX1-RV64-NEXT: add a4, a4, a5 -; LMULMAX1-RV64-NEXT: lui a5, 3855 -; LMULMAX1-RV64-NEXT: addiw a5, a5, 241 -; LMULMAX1-RV64-NEXT: slli a5, a5, 12 -; LMULMAX1-RV64-NEXT: addi a5, a5, -241 -; LMULMAX1-RV64-NEXT: slli a5, a5, 12 -; LMULMAX1-RV64-NEXT: addi a5, a5, 241 -; LMULMAX1-RV64-NEXT: slli a5, a5, 12 -; LMULMAX1-RV64-NEXT: addi a5, a5, -241 -; LMULMAX1-RV64-NEXT: and a4, a4, a5 -; LMULMAX1-RV64-NEXT: lui a1, 4112 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 257 -; LMULMAX1-RV64-NEXT: slli a1, a1, 16 -; LMULMAX1-RV64-NEXT: addi a1, a1, 257 -; LMULMAX1-RV64-NEXT: slli a1, a1, 16 -; LMULMAX1-RV64-NEXT: addi a1, a1, 257 -; LMULMAX1-RV64-NEXT: mul a4, a4, a1 -; LMULMAX1-RV64-NEXT: srli a4, a4, 56 -; LMULMAX1-RV64-NEXT: sw a4, 28(sp) +; LMULMAX1-RV64-NEXT: add a5, a4, a5 +; LMULMAX1-RV64-NEXT: lui a4, 3855 +; LMULMAX1-RV64-NEXT: addiw a4, a4, 241 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi a4, a4, -241 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi a4, a4, 241 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi a4, a4, -241 +; LMULMAX1-RV64-NEXT: and a1, a5, a4 +; LMULMAX1-RV64-NEXT: lui a5, 4112 +; LMULMAX1-RV64-NEXT: addiw a5, a5, 257 +; LMULMAX1-RV64-NEXT: slli a5, a5, 16 +; LMULMAX1-RV64-NEXT: addi a5, a5, 257 +; LMULMAX1-RV64-NEXT: slli a5, a5, 16 +; LMULMAX1-RV64-NEXT: addi a5, a5, 257 +; LMULMAX1-RV64-NEXT: mul a1, a1, a5 +; LMULMAX1-RV64-NEXT: srli a1, a1, 56 +; LMULMAX1-RV64-NEXT: sw a1, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 -; LMULMAX1-RV64-NEXT: vmv.x.s a4, v26 -; LMULMAX1-RV64-NEXT: or a4, a4, a6 -; LMULMAX1-RV64-NEXT: addi a2, a4, -1 -; LMULMAX1-RV64-NEXT: not a4, a4 -; LMULMAX1-RV64-NEXT: and a2, a4, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: and a4, a4, a7 -; LMULMAX1-RV64-NEXT: sub a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a4, a2, a3 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: add a2, a4, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 24(sp) +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: or a1, a1, a6 +; LMULMAX1-RV64-NEXT: addi a2, a1, -1 +; LMULMAX1-RV64-NEXT: not a1, a1 +; LMULMAX1-RV64-NEXT: and a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 1 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: sub a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a2, a1, a3 +; LMULMAX1-RV64-NEXT: srli a1, a1, 2 +; LMULMAX1-RV64-NEXT: and a1, a1, a3 +; LMULMAX1-RV64-NEXT: 
add a1, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a1, 4 +; LMULMAX1-RV64-NEXT: add a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a1, a1, a4 +; LMULMAX1-RV64-NEXT: mul a1, a1, a5 +; LMULMAX1-RV64-NEXT: srli a1, a1, 56 +; LMULMAX1-RV64-NEXT: sw a1, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 1 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: or a2, a2, a6 -; LMULMAX1-RV64-NEXT: addi a4, a2, -1 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: and a4, a4, a7 -; LMULMAX1-RV64-NEXT: sub a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a4, a2, a3 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: add a2, a4, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 20(sp) -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 -; LMULMAX1-RV64-NEXT: or a2, a2, a6 -; LMULMAX1-RV64-NEXT: addi a4, a2, -1 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: and a4, a4, a7 -; LMULMAX1-RV64-NEXT: sub a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a4, a2, a3 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: add a2, a4, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a1, a2, a1 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: or a1, a1, a6 +; LMULMAX1-RV64-NEXT: addi a2, a1, -1 +; LMULMAX1-RV64-NEXT: not a1, a1 +; LMULMAX1-RV64-NEXT: and a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 1 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: sub a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a2, a1, a3 +; LMULMAX1-RV64-NEXT: srli a1, a1, 2 +; LMULMAX1-RV64-NEXT: and a1, a1, a3 +; LMULMAX1-RV64-NEXT: add a1, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a1, 4 +; LMULMAX1-RV64-NEXT: add a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a1, a1, a4 +; LMULMAX1-RV64-NEXT: mul a1, a1, a5 +; LMULMAX1-RV64-NEXT: srli a1, a1, 56 +; LMULMAX1-RV64-NEXT: sw a1, 20(sp) +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV64-NEXT: or a1, a1, a6 +; LMULMAX1-RV64-NEXT: addi a2, a1, -1 +; LMULMAX1-RV64-NEXT: not a1, a1 +; LMULMAX1-RV64-NEXT: and a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 1 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: sub a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a2, a1, a3 +; LMULMAX1-RV64-NEXT: srli a1, a1, 2 +; LMULMAX1-RV64-NEXT: and a1, a1, a3 +; LMULMAX1-RV64-NEXT: add a1, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a1, 4 +; LMULMAX1-RV64-NEXT: add a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a1, a1, a4 +; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: sw a1, 16(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu @@ -7904,75 +7904,75 @@ ; LMULMAX2-RV64-NEXT: and a3, a3, a2 ; LMULMAX2-RV64-NEXT: add a3, a4, a3 ; LMULMAX2-RV64-NEXT: srli a4, a3, 4 -; LMULMAX2-RV64-NEXT: add a3, a3, a4 -; LMULMAX2-RV64-NEXT: lui a4, 3855 -; LMULMAX2-RV64-NEXT: addiw a4, a4, 241 -; LMULMAX2-RV64-NEXT: slli a4, a4, 12 -; LMULMAX2-RV64-NEXT: addi a4, a4, -241 -; LMULMAX2-RV64-NEXT: slli a4, a4, 12 -; LMULMAX2-RV64-NEXT: addi a4, a4, 241 -; LMULMAX2-RV64-NEXT: slli a4, a4, 12 -; LMULMAX2-RV64-NEXT: addi a4, a4, -241 -; LMULMAX2-RV64-NEXT: and a3, a3, a4 +; LMULMAX2-RV64-NEXT: add a4, a3, 
a4 +; LMULMAX2-RV64-NEXT: lui a3, 3855 +; LMULMAX2-RV64-NEXT: addiw a3, a3, 241 +; LMULMAX2-RV64-NEXT: slli a3, a3, 12 +; LMULMAX2-RV64-NEXT: addi a3, a3, -241 +; LMULMAX2-RV64-NEXT: slli a3, a3, 12 +; LMULMAX2-RV64-NEXT: addi a3, a3, 241 +; LMULMAX2-RV64-NEXT: slli a3, a3, 12 +; LMULMAX2-RV64-NEXT: addi a3, a3, -241 +; LMULMAX2-RV64-NEXT: and a4, a4, a3 ; LMULMAX2-RV64-NEXT: lui a5, 4112 ; LMULMAX2-RV64-NEXT: addiw a5, a5, 257 ; LMULMAX2-RV64-NEXT: slli a5, a5, 16 ; LMULMAX2-RV64-NEXT: addi a5, a5, 257 ; LMULMAX2-RV64-NEXT: slli a5, a5, 16 ; LMULMAX2-RV64-NEXT: addi a5, a5, 257 -; LMULMAX2-RV64-NEXT: mul a3, a3, a5 -; LMULMAX2-RV64-NEXT: srli a3, a3, 56 -; LMULMAX2-RV64-NEXT: sd a3, 56(sp) +; LMULMAX2-RV64-NEXT: mul a4, a4, a5 +; LMULMAX2-RV64-NEXT: srli a4, a4, 56 +; LMULMAX2-RV64-NEXT: sd a4, 56(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 -; LMULMAX2-RV64-NEXT: vmv.x.s a3, v28 -; LMULMAX2-RV64-NEXT: addi a1, a3, -1 -; LMULMAX2-RV64-NEXT: not a3, a3 -; LMULMAX2-RV64-NEXT: and a1, a3, a1 -; LMULMAX2-RV64-NEXT: srli a3, a1, 1 -; LMULMAX2-RV64-NEXT: and a3, a3, a6 -; LMULMAX2-RV64-NEXT: sub a1, a1, a3 -; LMULMAX2-RV64-NEXT: and a3, a1, a2 +; LMULMAX2-RV64-NEXT: vmv.x.s a4, v28 +; LMULMAX2-RV64-NEXT: addi a1, a4, -1 +; LMULMAX2-RV64-NEXT: not a4, a4 +; LMULMAX2-RV64-NEXT: and a1, a4, a1 +; LMULMAX2-RV64-NEXT: srli a4, a1, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a1, a1, a4 +; LMULMAX2-RV64-NEXT: and a4, a1, a2 ; LMULMAX2-RV64-NEXT: srli a1, a1, 2 ; LMULMAX2-RV64-NEXT: and a1, a1, a2 -; LMULMAX2-RV64-NEXT: add a1, a3, a1 -; LMULMAX2-RV64-NEXT: srli a3, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a3 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: add a1, a4, a1 +; LMULMAX2-RV64-NEXT: srli a4, a1, 4 +; LMULMAX2-RV64-NEXT: add a1, a1, a4 +; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: sd a1, 48(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: addi a3, a1, -1 +; LMULMAX2-RV64-NEXT: addi a4, a1, -1 ; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: srli a3, a1, 1 -; LMULMAX2-RV64-NEXT: and a3, a3, a6 -; LMULMAX2-RV64-NEXT: sub a1, a1, a3 -; LMULMAX2-RV64-NEXT: and a3, a1, a2 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: srli a4, a1, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a1, a1, a4 +; LMULMAX2-RV64-NEXT: and a4, a1, a2 ; LMULMAX2-RV64-NEXT: srli a1, a1, 2 ; LMULMAX2-RV64-NEXT: and a1, a1, a2 -; LMULMAX2-RV64-NEXT: add a1, a3, a1 -; LMULMAX2-RV64-NEXT: srli a3, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a3 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: add a1, a4, a1 +; LMULMAX2-RV64-NEXT: srli a4, a1, 4 +; LMULMAX2-RV64-NEXT: add a1, a1, a4 +; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: sd a1, 40(sp) ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: addi a3, a1, -1 +; LMULMAX2-RV64-NEXT: addi a4, a1, -1 ; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: srli a3, a1, 1 -; LMULMAX2-RV64-NEXT: and a3, a3, a6 -; LMULMAX2-RV64-NEXT: sub a1, a1, a3 -; LMULMAX2-RV64-NEXT: and a3, a1, a2 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: srli a4, a1, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a1, a1, a4 +; LMULMAX2-RV64-NEXT: and a4, a1, a2 ; 
LMULMAX2-RV64-NEXT: srli a1, a1, 2 ; LMULMAX2-RV64-NEXT: and a1, a1, a2 -; LMULMAX2-RV64-NEXT: add a1, a3, a1 +; LMULMAX2-RV64-NEXT: add a1, a4, a1 ; LMULMAX2-RV64-NEXT: srli a2, a1, 4 ; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: and a1, a1, a3 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: sd a1, 32(sp) @@ -8218,79 +8218,79 @@ ; LMULMAX1-RV64-NEXT: and a4, a4, a3 ; LMULMAX1-RV64-NEXT: add a4, a5, a4 ; LMULMAX1-RV64-NEXT: srli a5, a4, 4 -; LMULMAX1-RV64-NEXT: add a4, a4, a5 -; LMULMAX1-RV64-NEXT: lui a5, 3855 -; LMULMAX1-RV64-NEXT: addiw a5, a5, 241 -; LMULMAX1-RV64-NEXT: slli a5, a5, 12 -; LMULMAX1-RV64-NEXT: addi a5, a5, -241 -; LMULMAX1-RV64-NEXT: slli a5, a5, 12 -; LMULMAX1-RV64-NEXT: addi a5, a5, 241 -; LMULMAX1-RV64-NEXT: slli a5, a5, 12 -; LMULMAX1-RV64-NEXT: addi a5, a5, -241 -; LMULMAX1-RV64-NEXT: and a4, a4, a5 -; LMULMAX1-RV64-NEXT: lui a1, 4112 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 257 -; LMULMAX1-RV64-NEXT: slli a1, a1, 16 -; LMULMAX1-RV64-NEXT: addi a1, a1, 257 -; LMULMAX1-RV64-NEXT: slli a1, a1, 16 -; LMULMAX1-RV64-NEXT: addi a1, a1, 257 -; LMULMAX1-RV64-NEXT: mul a4, a4, a1 -; LMULMAX1-RV64-NEXT: srli a4, a4, 56 +; LMULMAX1-RV64-NEXT: add a5, a4, a5 +; LMULMAX1-RV64-NEXT: lui a4, 3855 +; LMULMAX1-RV64-NEXT: addiw a4, a4, 241 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi a4, a4, -241 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi a4, a4, 241 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi a4, a4, -241 +; LMULMAX1-RV64-NEXT: and a1, a5, a4 +; LMULMAX1-RV64-NEXT: lui a5, 4112 +; LMULMAX1-RV64-NEXT: addiw a5, a5, 257 +; LMULMAX1-RV64-NEXT: slli a5, a5, 16 +; LMULMAX1-RV64-NEXT: addi a5, a5, 257 +; LMULMAX1-RV64-NEXT: slli a5, a5, 16 +; LMULMAX1-RV64-NEXT: addi a5, a5, 257 +; LMULMAX1-RV64-NEXT: mul a1, a1, a5 +; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-RV64-NEXT: vmv.v.x v27, a4 -; LMULMAX1-RV64-NEXT: vmv.x.s a4, v26 -; LMULMAX1-RV64-NEXT: addi a2, a4, -1 -; LMULMAX1-RV64-NEXT: not a4, a4 -; LMULMAX1-RV64-NEXT: and a2, a4, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: and a4, a4, a7 -; LMULMAX1-RV64-NEXT: sub a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a4, a2, a3 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: add a2, a4, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: vmv.v.x v27, a1 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: addi a2, a1, -1 +; LMULMAX1-RV64-NEXT: not a1, a1 +; LMULMAX1-RV64-NEXT: and a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 1 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: sub a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a2, a1, a3 +; LMULMAX1-RV64-NEXT: srli a1, a1, 2 +; LMULMAX1-RV64-NEXT: and a1, a1, a3 +; LMULMAX1-RV64-NEXT: add a1, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a1, 4 +; LMULMAX1-RV64-NEXT: add a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a1, a1, a4 +; LMULMAX1-RV64-NEXT: mul a1, a1, a5 +; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: vsetvli zero, zero, e64, m1, tu, mu -; LMULMAX1-RV64-NEXT: vmv.s.x v27, a2 +; LMULMAX1-RV64-NEXT: vmv.s.x v27, a1 ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 1 -; 
LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: addi a4, a2, -1 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: and a4, a4, a7 -; LMULMAX1-RV64-NEXT: sub a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a4, a2, a3 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: add a2, a4, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: addi a2, a1, -1 +; LMULMAX1-RV64-NEXT: not a1, a1 +; LMULMAX1-RV64-NEXT: and a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 1 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: sub a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a2, a1, a3 +; LMULMAX1-RV64-NEXT: srli a1, a1, 2 +; LMULMAX1-RV64-NEXT: and a1, a1, a3 +; LMULMAX1-RV64-NEXT: add a1, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a1, 4 +; LMULMAX1-RV64-NEXT: add a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a1, a1, a4 +; LMULMAX1-RV64-NEXT: mul a1, a1, a5 +; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-RV64-NEXT: vmv.v.x v26, a2 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 -; LMULMAX1-RV64-NEXT: addi a4, a2, -1 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: and a4, a4, a7 -; LMULMAX1-RV64-NEXT: sub a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a4, a2, a3 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: add a2, a4, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a1, a2, a1 +; LMULMAX1-RV64-NEXT: vmv.v.x v26, a1 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV64-NEXT: addi a2, a1, -1 +; LMULMAX1-RV64-NEXT: not a1, a1 +; LMULMAX1-RV64-NEXT: and a1, a1, a2 +; LMULMAX1-RV64-NEXT: srli a2, a1, 1 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: sub a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a2, a1, a3 +; LMULMAX1-RV64-NEXT: srli a1, a1, 2 +; LMULMAX1-RV64-NEXT: and a1, a1, a3 +; LMULMAX1-RV64-NEXT: add a1, a2, a1 +; LMULMAX1-RV64-NEXT: srli a2, a1, 4 +; LMULMAX1-RV64-NEXT: add a1, a1, a2 +; LMULMAX1-RV64-NEXT: and a1, a1, a4 +; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: vsetvli zero, zero, e64, m1, tu, mu ; LMULMAX1-RV64-NEXT: vmv.s.x v26, a1 Index: llvm/test/CodeGen/RISCV/stack-store-check.ll =================================================================== --- llvm/test/CodeGen/RISCV/stack-store-check.ll +++ llvm/test/CodeGen/RISCV/stack-store-check.ll @@ -32,12 +32,12 @@ ; CHECK-NEXT: lw s6, %lo(U)(a0) ; CHECK-NEXT: lw s7, %lo(U+4)(a0) ; CHECK-NEXT: lw s8, %lo(U+8)(a0) -; CHECK-NEXT: lw s0, %lo(U+12)(a0) +; CHECK-NEXT: lw s2, %lo(U+12)(a0) ; CHECK-NEXT: sw zero, 612(sp) ; CHECK-NEXT: sw zero, 608(sp) ; CHECK-NEXT: sw zero, 604(sp) ; CHECK-NEXT: sw zero, 600(sp) -; CHECK-NEXT: sw s0, 596(sp) +; CHECK-NEXT: sw s2, 596(sp) ; CHECK-NEXT: sw s8, 592(sp) ; CHECK-NEXT: sw s7, 588(sp) ; CHECK-NEXT: addi a0, sp, 616 @@ -45,21 +45,21 @@ ; CHECK-NEXT: addi a2, sp, 584 ; CHECK-NEXT: sw s6, 584(sp) ; CHECK-NEXT: call __subtf3@plt -; CHECK-NEXT: lw s3, 616(sp) -; CHECK-NEXT: lw s4, 620(sp) -; CHECK-NEXT: lw s9, 624(sp) +; CHECK-NEXT: lw s4, 616(sp) +; CHECK-NEXT: lw s5, 
620(sp) +; CHECK-NEXT: lw s3, 624(sp) ; CHECK-NEXT: lw s11, 628(sp) -; CHECK-NEXT: sw s0, 548(sp) +; CHECK-NEXT: sw s2, 548(sp) ; CHECK-NEXT: sw s8, 544(sp) ; CHECK-NEXT: sw s7, 540(sp) ; CHECK-NEXT: sw s6, 536(sp) ; CHECK-NEXT: sw s11, 564(sp) -; CHECK-NEXT: sw s9, 560(sp) -; CHECK-NEXT: sw s4, 556(sp) +; CHECK-NEXT: sw s3, 560(sp) +; CHECK-NEXT: sw s5, 556(sp) ; CHECK-NEXT: addi a0, sp, 568 ; CHECK-NEXT: addi a1, sp, 552 ; CHECK-NEXT: addi a2, sp, 536 -; CHECK-NEXT: sw s3, 552(sp) +; CHECK-NEXT: sw s4, 552(sp) ; CHECK-NEXT: call __subtf3@plt ; CHECK-NEXT: lw a0, 568(sp) ; CHECK-NEXT: sw a0, 40(sp) # 4-byte Folded Spill @@ -68,12 +68,12 @@ ; CHECK-NEXT: lw a0, 576(sp) ; CHECK-NEXT: sw a0, 24(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw a0, 580(sp) -; CHECK-NEXT: sw a0, 16(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw a0, 48(sp) # 4-byte Folded Spill ; CHECK-NEXT: sw zero, 500(sp) ; CHECK-NEXT: sw zero, 496(sp) ; CHECK-NEXT: sw zero, 492(sp) ; CHECK-NEXT: sw zero, 488(sp) -; CHECK-NEXT: sw s0, 516(sp) +; CHECK-NEXT: sw s2, 516(sp) ; CHECK-NEXT: sw s8, 512(sp) ; CHECK-NEXT: sw s7, 508(sp) ; CHECK-NEXT: addi a0, sp, 520 @@ -81,31 +81,32 @@ ; CHECK-NEXT: addi a2, sp, 488 ; CHECK-NEXT: sw s6, 504(sp) ; CHECK-NEXT: call __addtf3@plt -; CHECK-NEXT: lw s2, 520(sp) +; CHECK-NEXT: lw s9, 520(sp) ; CHECK-NEXT: lw s10, 524(sp) -; CHECK-NEXT: lw s5, 528(sp) +; CHECK-NEXT: lw s0, 528(sp) +; CHECK-NEXT: sw s0, 20(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw s1, 532(sp) -; CHECK-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw s1, 16(sp) # 4-byte Folded Spill ; CHECK-NEXT: lui a0, %hi(Y1) ; CHECK-NEXT: lw a1, %lo(Y1)(a0) -; CHECK-NEXT: sw a1, 48(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw a1, 52(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw a2, %lo(Y1+4)(a0) -; CHECK-NEXT: sw a2, 52(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw a2, 12(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw a3, %lo(Y1+8)(a0) -; CHECK-NEXT: sw a3, 4(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw a3, 8(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw a0, %lo(Y1+12)(a0) -; CHECK-NEXT: sw a0, 0(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw a0, 4(sp) # 4-byte Folded Spill ; CHECK-NEXT: sw a0, 308(sp) ; CHECK-NEXT: sw a3, 304(sp) ; CHECK-NEXT: sw a2, 300(sp) ; CHECK-NEXT: sw a1, 296(sp) ; CHECK-NEXT: sw s11, 324(sp) -; CHECK-NEXT: sw s9, 320(sp) -; CHECK-NEXT: sw s4, 316(sp) +; CHECK-NEXT: sw s3, 320(sp) +; CHECK-NEXT: sw s5, 316(sp) ; CHECK-NEXT: addi a0, sp, 328 ; CHECK-NEXT: addi a1, sp, 312 ; CHECK-NEXT: addi a2, sp, 296 -; CHECK-NEXT: sw s3, 312(sp) +; CHECK-NEXT: sw s4, 312(sp) ; CHECK-NEXT: call __multf3@plt ; CHECK-NEXT: lw a0, 328(sp) ; CHECK-NEXT: sw a0, 44(sp) # 4-byte Folded Spill @@ -113,19 +114,18 @@ ; CHECK-NEXT: sw a0, 36(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw a0, 336(sp) ; CHECK-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; CHECK-NEXT: lw a0, 340(sp) -; CHECK-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; CHECK-NEXT: sw s0, 468(sp) +; CHECK-NEXT: lw s4, 340(sp) +; CHECK-NEXT: sw s2, 468(sp) ; CHECK-NEXT: sw s8, 464(sp) ; CHECK-NEXT: sw s7, 460(sp) ; CHECK-NEXT: sw s6, 456(sp) ; CHECK-NEXT: sw s1, 452(sp) -; CHECK-NEXT: sw s5, 448(sp) +; CHECK-NEXT: sw s0, 448(sp) ; CHECK-NEXT: sw s10, 444(sp) ; CHECK-NEXT: addi a0, sp, 472 ; CHECK-NEXT: addi a1, sp, 456 ; CHECK-NEXT: addi a2, sp, 440 -; CHECK-NEXT: sw s2, 440(sp) +; CHECK-NEXT: sw s9, 440(sp) ; CHECK-NEXT: call __addtf3@plt ; CHECK-NEXT: lw a3, 472(sp) ; CHECK-NEXT: lw a0, 476(sp) @@ -152,43 +152,44 @@ ; CHECK-NEXT: sw a2, %lo(X+8)(a4) ; CHECK-NEXT: sw a3, %lo(X+4)(a4) ; CHECK-NEXT: 
sw a0, %lo(X)(a4) -; CHECK-NEXT: lw s8, 0(sp) # 4-byte Folded Reload +; CHECK-NEXT: lw s8, 4(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw s8, 212(sp) -; CHECK-NEXT: lw s7, 4(sp) # 4-byte Folded Reload +; CHECK-NEXT: lw s7, 8(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw s7, 208(sp) +; CHECK-NEXT: lw s11, 12(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s11, 204(sp) ; CHECK-NEXT: lw a0, 52(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a0, 204(sp) -; CHECK-NEXT: lw a0, 48(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw a0, 200(sp) -; CHECK-NEXT: lw s6, 16(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s6, 228(sp) -; CHECK-NEXT: lw s4, 24(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s4, 224(sp) -; CHECK-NEXT: lw s0, 32(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s0, 220(sp) +; CHECK-NEXT: lw a0, 48(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw a0, 228(sp) +; CHECK-NEXT: lw s3, 24(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s3, 224(sp) +; CHECK-NEXT: lw s2, 32(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s2, 220(sp) ; CHECK-NEXT: addi a0, sp, 232 ; CHECK-NEXT: addi a1, sp, 216 ; CHECK-NEXT: addi a2, sp, 200 ; CHECK-NEXT: lw s1, 40(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw s1, 216(sp) ; CHECK-NEXT: call __multf3@plt -; CHECK-NEXT: lw a0, 232(sp) -; CHECK-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; CHECK-NEXT: lw s3, 236(sp) -; CHECK-NEXT: lw s9, 240(sp) -; CHECK-NEXT: lw s11, 244(sp) +; CHECK-NEXT: lw s5, 232(sp) +; CHECK-NEXT: lw a0, 236(sp) +; CHECK-NEXT: sw a0, 0(sp) # 4-byte Folded Spill +; CHECK-NEXT: lw s6, 240(sp) +; CHECK-NEXT: lw s0, 244(sp) ; CHECK-NEXT: sw zero, 356(sp) ; CHECK-NEXT: sw zero, 352(sp) ; CHECK-NEXT: sw zero, 348(sp) ; CHECK-NEXT: sw zero, 344(sp) -; CHECK-NEXT: lw a0, 8(sp) # 4-byte Folded Reload +; CHECK-NEXT: lw a0, 16(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw a0, 372(sp) -; CHECK-NEXT: sw s5, 368(sp) +; CHECK-NEXT: lw a0, 20(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw a0, 368(sp) ; CHECK-NEXT: sw s10, 364(sp) ; CHECK-NEXT: addi a0, sp, 376 ; CHECK-NEXT: addi a1, sp, 360 ; CHECK-NEXT: addi a2, sp, 344 -; CHECK-NEXT: sw s2, 360(sp) +; CHECK-NEXT: sw s9, 360(sp) ; CHECK-NEXT: call __multf3@plt ; CHECK-NEXT: lw a0, 376(sp) ; CHECK-NEXT: lw a1, 388(sp) @@ -199,12 +200,12 @@ ; CHECK-NEXT: sw a2, %lo(S+8)(a4) ; CHECK-NEXT: sw a3, %lo(S+4)(a4) ; CHECK-NEXT: sw a0, %lo(S)(a4) -; CHECK-NEXT: sw s6, 260(sp) -; CHECK-NEXT: sw s4, 256(sp) -; CHECK-NEXT: sw s0, 252(sp) +; CHECK-NEXT: lw a0, 48(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw a0, 260(sp) +; CHECK-NEXT: sw s3, 256(sp) +; CHECK-NEXT: sw s2, 252(sp) ; CHECK-NEXT: sw s1, 248(sp) -; CHECK-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a0, 276(sp) +; CHECK-NEXT: sw s4, 276(sp) ; CHECK-NEXT: lw a0, 28(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw a0, 272(sp) ; CHECK-NEXT: lw a0, 36(sp) # 4-byte Folded Reload @@ -228,14 +229,14 @@ ; CHECK-NEXT: sw zero, 160(sp) ; CHECK-NEXT: sw zero, 156(sp) ; CHECK-NEXT: sw zero, 152(sp) -; CHECK-NEXT: sw s11, 180(sp) -; CHECK-NEXT: sw s9, 176(sp) -; CHECK-NEXT: sw s3, 172(sp) +; CHECK-NEXT: sw s0, 180(sp) +; CHECK-NEXT: sw s6, 176(sp) +; CHECK-NEXT: lw a0, 0(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw a0, 172(sp) ; CHECK-NEXT: addi a0, sp, 184 ; CHECK-NEXT: addi a1, sp, 168 ; CHECK-NEXT: addi a2, sp, 152 -; CHECK-NEXT: lw a3, 12(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a3, 168(sp) +; CHECK-NEXT: sw s5, 168(sp) ; CHECK-NEXT: call __addtf3@plt ; CHECK-NEXT: lw a0, 184(sp) ; CHECK-NEXT: lw a1, 196(sp) @@ -252,12 +253,11 @@ ; CHECK-NEXT: sw zero, 104(sp) ; CHECK-NEXT: sw s8, 
132(sp) ; CHECK-NEXT: sw s7, 128(sp) -; CHECK-NEXT: lw a0, 52(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a0, 124(sp) +; CHECK-NEXT: sw s11, 124(sp) ; CHECK-NEXT: addi a0, sp, 136 ; CHECK-NEXT: addi a1, sp, 120 ; CHECK-NEXT: addi a2, sp, 104 -; CHECK-NEXT: lw a3, 48(sp) # 4-byte Folded Reload +; CHECK-NEXT: lw a3, 52(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw a3, 120(sp) ; CHECK-NEXT: call __multf3@plt ; CHECK-NEXT: lw a3, 136(sp) Index: llvm/test/CodeGen/Thumb2/mve-simple-arith.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-simple-arith.ll +++ llvm/test/CodeGen/Thumb2/mve-simple-arith.ll @@ -114,13 +114,13 @@ ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmov q5, q0 -; CHECK-NEXT: vmov r0, r1, d9 -; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: vmov q5, q1 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov r0, r1, d11 +; CHECK-NEXT: vmov r2, r3, d9 ; CHECK-NEXT: bl __aeabi_dadd -; CHECK-NEXT: vmov lr, r12, d8 -; CHECK-NEXT: vmov r2, r3, d10 +; CHECK-NEXT: vmov lr, r12, d10 +; CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: vmov d9, r0, r1 ; CHECK-NEXT: mov r0, lr ; CHECK-NEXT: mov r1, r12 @@ -247,13 +247,13 @@ ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmov q5, q0 -; CHECK-NEXT: vmov r0, r1, d9 -; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: vmov q5, q1 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov r0, r1, d11 +; CHECK-NEXT: vmov r2, r3, d9 ; CHECK-NEXT: bl __aeabi_dsub -; CHECK-NEXT: vmov lr, r12, d8 -; CHECK-NEXT: vmov r2, r3, d10 +; CHECK-NEXT: vmov lr, r12, d10 +; CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: vmov d9, r0, r1 ; CHECK-NEXT: mov r0, lr ; CHECK-NEXT: mov r1, r12 @@ -382,13 +382,13 @@ ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmov q5, q0 -; CHECK-NEXT: vmov r0, r1, d9 -; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: vmov q5, q1 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov r0, r1, d11 +; CHECK-NEXT: vmov r2, r3, d9 ; CHECK-NEXT: bl __aeabi_dmul -; CHECK-NEXT: vmov lr, r12, d8 -; CHECK-NEXT: vmov r2, r3, d10 +; CHECK-NEXT: vmov lr, r12, d10 +; CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: vmov d9, r0, r1 ; CHECK-NEXT: mov r0, lr ; CHECK-NEXT: mov r1, r12 Index: llvm/test/CodeGen/Thumb2/mve-vld4.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -73,27 +73,27 @@ define void @vld4_v8i32(<32 x i32> *%src, <8 x i32> *%dst) { ; CHECK-LABEL: vld4_v8i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]! 
; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i32 q6, q2, q3 +; CHECK-NEXT: vadd.i32 q4, q2, q3 +; CHECK-NEXT: vadd.i32 q5, q0, q1 +; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 +; CHECK-NEXT: vadd.i32 q5, q2, q3 ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vadd.i32 q0, q0, q6 -; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4 -; CHECK-NEXT: vadd.i32 q5, q3, q4 -; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vadd.i32 q1, q1, q5 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vadd.i32 q0, q0, q5 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x i32>, <32 x i32>* %src, align 4 @@ -130,12 +130,12 @@ ; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r3] ; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [sp, #96] @ 16-byte Reload ; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vldrw.u32 q5, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vadd.i32 q6, q5, q6 -; CHECK-NEXT: vstrw.32 q6, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q5, q6, q5 +; CHECK-NEXT: vstrw.32 q5, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r3] ; CHECK-NEXT: vstrw.32 q4, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vmov q0, q1 @@ -149,27 +149,26 @@ ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r2] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2] ; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov q5, q1 +; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r0] +; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r0] +; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r0] +; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r0] +; CHECK-NEXT: vstrw.32 q4, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q0, q0, q5 +; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vadd.i32 q1, q3, q1 ; CHECK-NEXT: vldmia sp, {d6, d7, d8, d9, d10, d11, d12, d13} @ 64-byte Reload -; CHECK-NEXT: vadd.i32 q1, q2, q1 -; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6 -; CHECK-NEXT: vadd.i32 q2, q3, q4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6 ; CHECK-NEXT: vadd.i32 q1, q5, q6 +; CHECK-NEXT: vadd.i32 q2, q3, q4 +; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: vadd.i32 q1, q2, q1 ; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: 
vldrw.u32 q0, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: add sp, #112 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} @@ -349,27 +348,27 @@ define void @vld4_v16i16(<64 x i16> *%src, <16 x i16> *%dst) { ; CHECK-LABEL: vld4_v16i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0]! ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i16 q6, q2, q3 +; CHECK-NEXT: vadd.i16 q4, q2, q3 +; CHECK-NEXT: vadd.i16 q5, q0, q1 +; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vadd.i16 q4, q5, q4 +; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 +; CHECK-NEXT: vadd.i16 q5, q2, q3 ; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vld40.16 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vadd.i16 q0, q0, q6 -; CHECK-NEXT: vld41.16 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld42.16 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld43.16 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4 -; CHECK-NEXT: vadd.i16 q5, q3, q4 -; CHECK-NEXT: vadd.i16 q1, q1, q2 -; CHECK-NEXT: vadd.i16 q1, q1, q5 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vadd.i16 q0, q0, q5 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %l1 = load <64 x i16>, <64 x i16>* %src, align 2 @@ -871,27 +870,27 @@ define void @vld4_v8f32(<32 x float> *%src, <8 x float> *%dst) { ; CHECK-LABEL: vld4_v8f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]! 
; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.f32 q6, q2, q3 +; CHECK-NEXT: vadd.f32 q4, q2, q3 +; CHECK-NEXT: vadd.f32 q5, q0, q1 +; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vadd.f32 q4, q5, q4 +; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 +; CHECK-NEXT: vadd.f32 q5, q2, q3 ; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vadd.f32 q0, q0, q6 -; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4 -; CHECK-NEXT: vadd.f32 q5, q3, q4 -; CHECK-NEXT: vadd.f32 q1, q1, q2 -; CHECK-NEXT: vadd.f32 q1, q1, q5 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vadd.f32 q0, q0, q5 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x float>, <32 x float>* %src, align 4 @@ -928,12 +927,12 @@ ; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r3] ; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [sp, #96] @ 16-byte Reload ; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vldrw.u32 q5, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vadd.f32 q6, q5, q6 -; CHECK-NEXT: vstrw.32 q6, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vadd.f32 q5, q6, q5 +; CHECK-NEXT: vstrw.32 q5, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r3] ; CHECK-NEXT: vstrw.32 q4, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vmov q0, q1 @@ -947,27 +946,26 @@ ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r2] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2] ; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov q5, q1 +; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r0] +; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r0] +; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r0] +; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r0] +; CHECK-NEXT: vstrw.32 q4, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vadd.f32 q0, q0, q5 +; CHECK-NEXT: vadd.f32 q0, q0, q2 +; CHECK-NEXT: vadd.f32 q1, q3, q1 ; CHECK-NEXT: vldmia sp, {d6, d7, d8, d9, d10, d11, d12, d13} @ 64-byte Reload -; CHECK-NEXT: vadd.f32 q1, q2, q1 -; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6 -; CHECK-NEXT: vadd.f32 q2, q3, q4 ; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6 ; CHECK-NEXT: vadd.f32 q1, q5, q6 +; CHECK-NEXT: vadd.f32 q2, q3, q4 +; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: vadd.f32 q1, q2, q1 ; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: 
vldrw.u32 q0, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: add sp, #112 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} Index: llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll =================================================================== --- llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll +++ llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll @@ -68,39 +68,39 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: and r0, r3, #1 -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: rsbs r1, r0, #0 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: movs r2, #9 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: bl __aeabi_ldivmod -; CHECK-NEXT: and r0, r4, #1 -; CHECK-NEXT: mov r6, r2 +; CHECK-NEXT: and r0, r5, #1 +; CHECK-NEXT: mov r7, r2 ; CHECK-NEXT: rsbs r1, r0, #0 -; CHECK-NEXT: mov r7, r3 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r4, r3 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: movs r2, #9 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: bl __aeabi_ldivmod ; CHECK-NEXT: ldr r1, [sp, #44] ; CHECK-NEXT: vmov.32 d8[0], r2 ; CHECK-NEXT: ldr r0, [sp, #40] -; CHECK-NEXT: mov r4, r3 +; CHECK-NEXT: mov r5, r3 ; CHECK-NEXT: and r1, r1, #1 ; CHECK-NEXT: mvn r2, #8 ; CHECK-NEXT: rsbs r1, r1, #0 ; CHECK-NEXT: mov.w r3, #-1 -; CHECK-NEXT: vmov.32 d9[0], r6 +; CHECK-NEXT: vmov.32 d9[0], r7 ; CHECK-NEXT: bl __aeabi_ldivmod ; CHECK-NEXT: vmov.32 d16[0], r2 ; CHECK-NEXT: adr r0, .LCPI3_0 -; CHECK-NEXT: vmov.32 d9[1], r7 +; CHECK-NEXT: vmov.32 d9[1], r4 ; CHECK-NEXT: vld1.64 {d18, d19}, [r0:128] ; CHECK-NEXT: adr r0, .LCPI3_1 ; CHECK-NEXT: vmov.32 d16[1], r3 -; CHECK-NEXT: vmov.32 d8[1], r4 +; CHECK-NEXT: vmov.32 d8[1], r5 ; CHECK-NEXT: vand q8, q8, q9 ; CHECK-NEXT: vld1.64 {d20, d21}, [r0:128] ; CHECK-NEXT: adr r0, .LCPI3_2 Index: llvm/test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll =================================================================== --- llvm/test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll +++ llvm/test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll @@ -4,32 +4,32 @@ define fastcc void @fht(float* %fz, i16 signext %n) { ; CHECK-LABEL: fht: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: xorps %xmm2, %xmm2 -; CHECK-NEXT: subss %xmm1, %xmm2 -; CHECK-NEXT: movaps %xmm1, %xmm3 -; CHECK-NEXT: mulss %xmm0, %xmm3 -; CHECK-NEXT: addss %xmm1, %xmm3 -; CHECK-NEXT: movaps %xmm1, %xmm4 -; CHECK-NEXT: subss %xmm3, %xmm4 -; CHECK-NEXT: addss %xmm1, %xmm3 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: subss %xmm3, %xmm1 +; CHECK-NEXT: movaps %xmm3, %xmm4 +; CHECK-NEXT: mulss %xmm0, %xmm4 +; CHECK-NEXT: addss %xmm3, %xmm4 +; CHECK-NEXT: movaps %xmm3, %xmm2 +; CHECK-NEXT: subss %xmm4, %xmm2 +; CHECK-NEXT: addss %xmm3, %xmm4 ; CHECK-NEXT: xorps %xmm5, %xmm5 -; CHECK-NEXT: subss %xmm2, %xmm5 -; CHECK-NEXT: addss %xmm0, %xmm2 -; CHECK-NEXT: mulss %xmm0, %xmm3 +; CHECK-NEXT: subss %xmm1, %xmm5 +; CHECK-NEXT: addss %xmm0, %xmm1 +; CHECK-NEXT: mulss %xmm0, %xmm4 ; CHECK-NEXT: mulss %xmm0, %xmm5 -; CHECK-NEXT: addss %xmm3, %xmm5 +; CHECK-NEXT: addss %xmm4, %xmm5 ; CHECK-NEXT: addss %xmm0, %xmm5 ; CHECK-NEXT: movss %xmm5, 0 -; CHECK-NEXT: movss %xmm1, (%ecx) -; CHECK-NEXT: addss %xmm0, %xmm1 -; CHECK-NEXT: movss %xmm1, 0 +; CHECK-NEXT: movss 
%xmm3, (%ecx) +; CHECK-NEXT: addss %xmm0, %xmm3 +; CHECK-NEXT: movss %xmm3, 0 +; CHECK-NEXT: mulss %xmm0, %xmm1 ; CHECK-NEXT: mulss %xmm0, %xmm2 -; CHECK-NEXT: mulss %xmm0, %xmm4 -; CHECK-NEXT: addss %xmm2, %xmm4 -; CHECK-NEXT: addss %xmm0, %xmm4 -; CHECK-NEXT: movss %xmm4, (%ecx) +; CHECK-NEXT: addss %xmm1, %xmm2 +; CHECK-NEXT: addss %xmm0, %xmm2 +; CHECK-NEXT: movss %xmm2, (%ecx) ; CHECK-NEXT: retl entry: br i1 true, label %bb171.preheader, label %bb431 Index: llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll =================================================================== --- llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll +++ llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll @@ -29,20 +29,20 @@ ; CHECK-NEXT: movw $0, 40(%edi) ; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: leal (,%ecx,4), %eax -; CHECK-NEXT: leal (,%ebx,4), %ecx +; CHECK-NEXT: leal (,%ecx,4), %ecx +; CHECK-NEXT: leal (,%ebx,4), %edx ; CHECK-NEXT: subl $12, %esp -; CHECK-NEXT: movzwl %bp, %edx -; CHECK-NEXT: cwtl +; CHECK-NEXT: movzwl %bp, %eax ; CHECK-NEXT: movswl %cx, %ecx +; CHECK-NEXT: movswl %dx, %edx ; CHECK-NEXT: pushl $87 ; CHECK-NEXT: pushl {{[0-9]+}}(%esp) -; CHECK-NEXT: pushl %eax -; CHECK-NEXT: pushl $0 -; CHECK-NEXT: pushl {{[0-9]+}}(%esp) ; CHECK-NEXT: pushl %ecx ; CHECK-NEXT: pushl $0 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) ; CHECK-NEXT: pushl %edx +; CHECK-NEXT: pushl $0 +; CHECK-NEXT: pushl %eax ; CHECK-NEXT: pushl %edi ; CHECK-NEXT: calll _SQLDrivers_Internal ; CHECK-NEXT: addl $48, %esp Index: llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll =================================================================== --- llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll +++ llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll @@ -337,20 +337,20 @@ ; X32-BMI2-NEXT: pushl %ebx ; X32-BMI2-NEXT: pushl %edi ; X32-BMI2-NEXT: pushl %esi -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-BMI2-NEXT: movl %eax, %ebx -; X32-BMI2-NEXT: addl $32, %ebx -; X32-BMI2-NEXT: adcl $0, %edi -; X32-BMI2-NEXT: movl %ebx, (%ecx) -; X32-BMI2-NEXT: movl %edi, 4(%ecx) +; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-BMI2-NEXT: movl %ebx, %edi +; X32-BMI2-NEXT: addl $32, %edi +; X32-BMI2-NEXT: adcl $0, %esi +; X32-BMI2-NEXT: movl %edi, (%ecx) +; X32-BMI2-NEXT: movl %esi, 4(%ecx) ; X32-BMI2-NEXT: movb $32, %cl -; X32-BMI2-NEXT: subb %al, %cl -; X32-BMI2-NEXT: shldl %cl, %esi, %edx -; X32-BMI2-NEXT: shlxl %ecx, %esi, %eax +; X32-BMI2-NEXT: subb %bl, %cl +; X32-BMI2-NEXT: shldl %cl, %eax, %edx +; X32-BMI2-NEXT: shlxl %ecx, %eax, %eax ; X32-BMI2-NEXT: testb $32, %cl ; X32-BMI2-NEXT: je .LBB5_2 ; X32-BMI2-NEXT: # %bb.1: Index: llvm/test/CodeGen/X86/abs.ll =================================================================== --- llvm/test/CodeGen/X86/abs.ll +++ llvm/test/CodeGen/X86/abs.ll @@ -278,25 +278,25 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: negl %edx +; X86-NEXT: cmovll %ebx, %edx ; X86-NEXT: movl %edi, %ebx ; X86-NEXT: negl %ebx ; X86-NEXT: cmovll %edi, %ebx ; X86-NEXT: movl %esi, 
%edi ; X86-NEXT: negl %edi ; X86-NEXT: cmovll %esi, %edi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %ecx, %esi ; X86-NEXT: negl %esi -; X86-NEXT: cmovll %edx, %esi -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: negl %edx -; X86-NEXT: cmovll %ecx, %edx -; X86-NEXT: movl %edx, 12(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: cmovll %ecx, %esi +; X86-NEXT: movl %esi, 12(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %ebx, 4(%eax) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -339,31 +339,31 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: negl %ecx ; X86-NEXT: cmovll %edx, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: negl %ecx -; X86-NEXT: cmovll %ebp, %ecx +; X86-NEXT: cmovll %esi, %ecx ; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: negl %ebp -; X86-NEXT: cmovll %ebx, %ebp -; X86-NEXT: movl %edi, %ebx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: negl %esi +; X86-NEXT: cmovll %ebx, %esi +; X86-NEXT: movl %ebp, %ebx ; X86-NEXT: negl %ebx -; X86-NEXT: cmovll %edi, %ebx -; X86-NEXT: movl %esi, %edi +; X86-NEXT: cmovll %ebp, %ebx +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: negl %ebp +; X86-NEXT: cmovll %edi, %ebp +; X86-NEXT: movl %eax, %edi ; X86-NEXT: negl %edi -; X86-NEXT: cmovll %esi, %edi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: negl %esi -; X86-NEXT: cmovll %eax, %esi +; X86-NEXT: cmovll %eax, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: negl %eax @@ -375,10 +375,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %ecx, 28(%edx) ; X86-NEXT: movl %eax, 24(%edx) -; X86-NEXT: movl %esi, 20(%edx) -; X86-NEXT: movl %edi, 16(%edx) +; X86-NEXT: movl %edi, 20(%edx) +; X86-NEXT: movl %ebp, 16(%edx) ; X86-NEXT: movl %ebx, 12(%edx) -; X86-NEXT: movl %ebp, 8(%edx) +; X86-NEXT: movl %esi, 8(%edx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NEXT: movl %eax, 4(%edx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -415,31 +415,31 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: negw %cx ; X86-NEXT: cmovlw %dx, %cx ; X86-NEXT: movw %cx, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill -; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: negw %cx -; X86-NEXT: cmovlw %bp, %cx +; X86-NEXT: cmovlw %si, %cx ; X86-NEXT: movw %cx, (%esp) # 2-byte Spill -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: negw %bp -; X86-NEXT: cmovlw %bx, %bp -; X86-NEXT: movl %edi, %ebx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: negw %si +; X86-NEXT: cmovlw %bx, %si +; X86-NEXT: movl %ebp, %ebx ; X86-NEXT: negw %bx -; X86-NEXT: cmovlw %di, %bx -; X86-NEXT: movl %esi, %edi 
+; X86-NEXT: cmovlw %bp, %bx +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: negw %bp +; X86-NEXT: cmovlw %di, %bp +; X86-NEXT: movl %eax, %edi ; X86-NEXT: negw %di -; X86-NEXT: cmovlw %si, %di -; X86-NEXT: movl %eax, %esi -; X86-NEXT: negw %si -; X86-NEXT: cmovlw %ax, %si +; X86-NEXT: cmovlw %ax, %di ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: negw %ax @@ -451,10 +451,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movw %cx, 14(%edx) ; X86-NEXT: movw %ax, 12(%edx) -; X86-NEXT: movw %si, 10(%edx) -; X86-NEXT: movw %di, 8(%edx) +; X86-NEXT: movw %di, 10(%edx) +; X86-NEXT: movw %bp, 8(%edx) ; X86-NEXT: movw %bx, 6(%edx) -; X86-NEXT: movw %bp, 4(%edx) +; X86-NEXT: movw %si, 4(%edx) ; X86-NEXT: movzwl (%esp), %eax # 2-byte Folded Reload ; X86-NEXT: movw %ax, 2(%edx) ; X86-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload Index: llvm/test/CodeGen/X86/avx512-calling-conv.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-calling-conv.ll +++ llvm/test/CodeGen/X86/avx512-calling-conv.ll @@ -1564,11 +1564,11 @@ ; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: kmovw %k1, %ebx ; KNL_X32-NEXT: kshiftrw $1, %k0, %k1 -; KNL_X32-NEXT: kmovw %k1, %esi +; KNL_X32-NEXT: kmovw %k1, %ebp ; KNL_X32-NEXT: kshiftrw $2, %k0, %k1 -; KNL_X32-NEXT: kmovw %k1, %edi +; KNL_X32-NEXT: kmovw %k1, %esi ; KNL_X32-NEXT: kshiftrw $3, %k0, %k1 -; KNL_X32-NEXT: kmovw %k1, %ebp +; KNL_X32-NEXT: kmovw %k1, %edi ; KNL_X32-NEXT: kshiftrw $4, %k0, %k1 ; KNL_X32-NEXT: kmovw %k1, %edx ; KNL_X32-NEXT: kshiftrw $5, %k0, %k1 @@ -1578,66 +1578,66 @@ ; KNL_X32-NEXT: movb %bl, 2(%eax) ; KNL_X32-NEXT: kmovw %k0, %ebx ; KNL_X32-NEXT: andl $1, %ebx -; KNL_X32-NEXT: andl $1, %esi -; KNL_X32-NEXT: leal (%ebx,%esi,2), %esi -; KNL_X32-NEXT: kmovw %k1, %ebx +; KNL_X32-NEXT: andl $1, %ebp +; KNL_X32-NEXT: leal (%ebx,%ebp,2), %ebx +; KNL_X32-NEXT: kmovw %k1, %ebp ; KNL_X32-NEXT: kshiftrw $7, %k0, %k1 +; KNL_X32-NEXT: andl $1, %esi +; KNL_X32-NEXT: leal (%ebx,%esi,4), %ebx +; KNL_X32-NEXT: kmovw %k1, %esi +; KNL_X32-NEXT: kshiftrw $8, %k0, %k1 ; KNL_X32-NEXT: andl $1, %edi -; KNL_X32-NEXT: leal (%esi,%edi,4), %esi +; KNL_X32-NEXT: leal (%ebx,%edi,8), %ebx ; KNL_X32-NEXT: kmovw %k1, %edi -; KNL_X32-NEXT: kshiftrw $8, %k0, %k1 -; KNL_X32-NEXT: andl $1, %ebp -; KNL_X32-NEXT: leal (%esi,%ebp,8), %esi -; KNL_X32-NEXT: kmovw %k1, %ebp ; KNL_X32-NEXT: kshiftrw $9, %k0, %k1 ; KNL_X32-NEXT: andl $1, %edx ; KNL_X32-NEXT: shll $4, %edx -; KNL_X32-NEXT: orl %esi, %edx -; KNL_X32-NEXT: kmovw %k1, %esi +; KNL_X32-NEXT: orl %ebx, %edx +; KNL_X32-NEXT: kmovw %k1, %ebx ; KNL_X32-NEXT: kshiftrw $10, %k0, %k1 ; KNL_X32-NEXT: andl $1, %ecx ; KNL_X32-NEXT: shll $5, %ecx ; KNL_X32-NEXT: orl %edx, %ecx ; KNL_X32-NEXT: kmovw %k1, %edx ; KNL_X32-NEXT: kshiftrw $11, %k0, %k1 -; KNL_X32-NEXT: andl $1, %ebx -; KNL_X32-NEXT: shll $6, %ebx -; KNL_X32-NEXT: andl $1, %edi -; KNL_X32-NEXT: shll $7, %edi -; KNL_X32-NEXT: orl %ebx, %edi -; KNL_X32-NEXT: kmovw %k1, %ebx -; KNL_X32-NEXT: kshiftrw $12, %k0, %k1 ; KNL_X32-NEXT: andl $1, %ebp -; KNL_X32-NEXT: shll $8, %ebp -; KNL_X32-NEXT: orl %edi, %ebp -; KNL_X32-NEXT: kmovw %k1, %edi -; KNL_X32-NEXT: kshiftrw $13, %k0, %k1 +; KNL_X32-NEXT: shll $6, %ebp ; KNL_X32-NEXT: andl $1, %esi -; KNL_X32-NEXT: shll $9, %esi +; KNL_X32-NEXT: shll $7, %esi ; KNL_X32-NEXT: orl %ebp, %esi ; KNL_X32-NEXT: kmovw %k1, %ebp +; KNL_X32-NEXT: kshiftrw $12, %k0, %k1 +; KNL_X32-NEXT: andl $1, %edi +; KNL_X32-NEXT: shll $8, %edi +; 
KNL_X32-NEXT: orl %esi, %edi +; KNL_X32-NEXT: kmovw %k1, %esi +; KNL_X32-NEXT: kshiftrw $13, %k0, %k1 +; KNL_X32-NEXT: andl $1, %ebx +; KNL_X32-NEXT: shll $9, %ebx +; KNL_X32-NEXT: orl %edi, %ebx +; KNL_X32-NEXT: kmovw %k1, %edi ; KNL_X32-NEXT: kshiftrw $14, %k0, %k1 ; KNL_X32-NEXT: andl $1, %edx ; KNL_X32-NEXT: shll $10, %edx -; KNL_X32-NEXT: orl %esi, %edx -; KNL_X32-NEXT: kmovw %k1, %esi +; KNL_X32-NEXT: orl %ebx, %edx +; KNL_X32-NEXT: kmovw %k1, %ebx ; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 ; KNL_X32-NEXT: orl %ecx, %edx ; KNL_X32-NEXT: kmovw %k0, %ecx -; KNL_X32-NEXT: andl $1, %ebx -; KNL_X32-NEXT: shll $11, %ebx -; KNL_X32-NEXT: andl $1, %edi -; KNL_X32-NEXT: shll $12, %edi -; KNL_X32-NEXT: orl %ebx, %edi ; KNL_X32-NEXT: andl $1, %ebp -; KNL_X32-NEXT: shll $13, %ebp -; KNL_X32-NEXT: orl %edi, %ebp +; KNL_X32-NEXT: shll $11, %ebp ; KNL_X32-NEXT: andl $1, %esi -; KNL_X32-NEXT: shll $14, %esi +; KNL_X32-NEXT: shll $12, %esi ; KNL_X32-NEXT: orl %ebp, %esi +; KNL_X32-NEXT: andl $1, %edi +; KNL_X32-NEXT: shll $13, %edi +; KNL_X32-NEXT: orl %esi, %edi +; KNL_X32-NEXT: andl $1, %ebx +; KNL_X32-NEXT: shll $14, %ebx +; KNL_X32-NEXT: orl %edi, %ebx ; KNL_X32-NEXT: andl $1, %ecx ; KNL_X32-NEXT: shll $15, %ecx -; KNL_X32-NEXT: orl %esi, %ecx +; KNL_X32-NEXT: orl %ebx, %ecx ; KNL_X32-NEXT: orl %edx, %ecx ; KNL_X32-NEXT: movw %cx, (%eax) ; KNL_X32-NEXT: addl $20, %esp Index: llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll +++ llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -947,35 +947,35 @@ ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: subl %ecx, %ebx -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl %esi, %ebp -; X32-NEXT: subl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: imull %ebp, %ebx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: subl %edi, %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: subl {{[0-9]+}}(%esp), %eax +; X32-NEXT: imull %eax, %ebx +; X32-NEXT: movl %edx, %eax +; X32-NEXT: subl %edi, %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: imull %ebp, %ecx +; X32-NEXT: imull %eax, %ecx ; X32-NEXT: addl %ecx, %ebx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %ebp -; X32-NEXT: subl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: movl %edi, %esi +; X32-NEXT: subl {{[0-9]+}}(%esp), %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: subl {{[0-9]+}}(%esp), %eax -; X32-NEXT: imull %ebp, %eax +; X32-NEXT: imull %esi, %eax ; X32-NEXT: addl %eax, %ebx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl (%esp), %ebp # 4-byte Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl (%esp), %esi # 4-byte Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: addl {{[0-9]+}}(%esp), %edi -; X32-NEXT: addl {{[0-9]+}}(%esp), %esi -; X32-NEXT: imull %eax, %esi +; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: imull %eax, %ebp ; X32-NEXT: addl {{[0-9]+}}(%esp), %edx -; X32-NEXT: imull %ebp, %edx -; X32-NEXT: addl %esi, %edx +; X32-NEXT: imull %esi, %edx +; X32-NEXT: addl %ebp, %edx ; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: imull %edi, %ecx ; X32-NEXT: addl %edx, %ecx 
Index: llvm/test/CodeGen/X86/avx512-select.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-select.ll +++ llvm/test/CodeGen/X86/avx512-select.ll @@ -556,22 +556,22 @@ ; X86-AVX512F-LABEL: vselect_v1i1: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: pushl %esi -; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX512F-NEXT: movzbl (%edx), %esi +; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512F-NEXT: movzbl (%eax), %esi ; X86-AVX512F-NEXT: kmovw %esi, %k0 +; X86-AVX512F-NEXT: movzbl (%edx), %edx +; X86-AVX512F-NEXT: kmovw %edx, %k1 ; X86-AVX512F-NEXT: movzbl (%ecx), %ecx -; X86-AVX512F-NEXT: kmovw %ecx, %k1 -; X86-AVX512F-NEXT: movzbl (%eax), %eax -; X86-AVX512F-NEXT: kmovw %eax, %k2 +; X86-AVX512F-NEXT: kmovw %ecx, %k2 ; X86-AVX512F-NEXT: kandnw %k1, %k2, %k1 ; X86-AVX512F-NEXT: kandw %k2, %k0, %k0 ; X86-AVX512F-NEXT: korw %k1, %k0, %k0 ; X86-AVX512F-NEXT: kshiftlw $15, %k0, %k0 ; X86-AVX512F-NEXT: kshiftrw $15, %k0, %k0 -; X86-AVX512F-NEXT: kmovw %k0, %eax -; X86-AVX512F-NEXT: movb %al, (%edx) +; X86-AVX512F-NEXT: kmovw %k0, %ecx +; X86-AVX512F-NEXT: movb %cl, (%eax) ; X86-AVX512F-NEXT: popl %esi ; X86-AVX512F-NEXT: retl ; @@ -595,22 +595,22 @@ ; X86-AVX512BW-LABEL: vselect_v1i1: ; X86-AVX512BW: # %bb.0: ; X86-AVX512BW-NEXT: pushl %esi -; X86-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX512BW-NEXT: movzbl (%edx), %esi +; X86-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512BW-NEXT: movzbl (%eax), %esi ; X86-AVX512BW-NEXT: kmovd %esi, %k0 +; X86-AVX512BW-NEXT: movzbl (%edx), %edx +; X86-AVX512BW-NEXT: kmovd %edx, %k1 ; X86-AVX512BW-NEXT: movzbl (%ecx), %ecx -; X86-AVX512BW-NEXT: kmovd %ecx, %k1 -; X86-AVX512BW-NEXT: movzbl (%eax), %eax -; X86-AVX512BW-NEXT: kmovd %eax, %k2 +; X86-AVX512BW-NEXT: kmovd %ecx, %k2 ; X86-AVX512BW-NEXT: kandnw %k1, %k2, %k1 ; X86-AVX512BW-NEXT: kandw %k2, %k0, %k0 ; X86-AVX512BW-NEXT: korw %k1, %k0, %k0 ; X86-AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; X86-AVX512BW-NEXT: kshiftrw $15, %k0, %k0 -; X86-AVX512BW-NEXT: kmovd %k0, %eax -; X86-AVX512BW-NEXT: movb %al, (%edx) +; X86-AVX512BW-NEXT: kmovd %k0, %ecx +; X86-AVX512BW-NEXT: movb %cl, (%eax) ; X86-AVX512BW-NEXT: popl %esi ; X86-AVX512BW-NEXT: retl ; Index: llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll =================================================================== --- llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -1860,32 +1860,32 @@ ; X86-NEXT: .cfi_offset %edi, -8 ; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X86-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x64,0xc0] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] +; X86-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %ecx, %esi # encoding: 
[0x01,0xce] -; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] +; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] +; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] ; X86-NEXT: vpcmpleb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x02] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: addl %esi, %ecx # encoding: [0x01,0xf1] -; X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] +; X86-NEXT: addl %esi, %edx # encoding: [0x01,0xf2] +; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] ; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] +; X86-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %ecx, %esi # encoding: [0x01,0xce] -; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] +; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] +; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] ; X86-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x05] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] ; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] ; X86-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] ; X86-NEXT: addl %esi, %edi # encoding: [0x01,0xf7] -; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] +; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] ; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] @@ -1956,57 +1956,57 @@ ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x14] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x18] +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x18] ; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xd1] ; X86-NEXT: kmovd %esi, %k0 # encoding: [0xc5,0xfb,0x92,0xc6] -; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] ; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] ; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] -; X86-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] +; X86-NEXT: kmovd %k3, %edx # encoding: [0xc5,0xfb,0x93,0xd3] ; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] -; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2] +; X86-NEXT: kmovd %k2, %edi # encoding: [0xc5,0xfb,0x93,0xfa] ; X86-NEXT: vpcmpgtb %zmm0, %zmm1, %k2 # encoding: [0x62,0xf1,0x75,0x48,0x64,0xd0] ; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] ; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] -; X86-NEXT: kmovd %k3, %edi # encoding: [0xc5,0xfb,0x93,0xfb] +; X86-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] ; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] ; X86-NEXT: kmovd 
%k2, %ebx # encoding: [0xc5,0xfb,0x93,0xda] -; X86-NEXT: addl %edx, %ebx # encoding: [0x01,0xd3] -; X86-NEXT: adcl %eax, %edi # encoding: [0x11,0xc7] +; X86-NEXT: addl %edi, %ebx # encoding: [0x01,0xfb] +; X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] ; X86-NEXT: vpcmpleb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xd1,0x02] ; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] ; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] -; X86-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] +; X86-NEXT: kmovd %k3, %edx # encoding: [0xc5,0xfb,0x93,0xd3] ; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] -; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2] -; X86-NEXT: addl %ebx, %edx # encoding: [0x01,0xda] -; X86-NEXT: adcl %edi, %eax # encoding: [0x11,0xf8] +; X86-NEXT: kmovd %k2, %edi # encoding: [0xc5,0xfb,0x93,0xfa] +; X86-NEXT: addl %ebx, %edi # encoding: [0x01,0xdf] +; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] ; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xd1,0x04] ; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] ; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] -; X86-NEXT: kmovd %k3, %edi # encoding: [0xc5,0xfb,0x93,0xfb] +; X86-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] ; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] ; X86-NEXT: kmovd %k2, %ebx # encoding: [0xc5,0xfb,0x93,0xda] -; X86-NEXT: addl %edx, %ebx # encoding: [0x01,0xd3] -; X86-NEXT: adcl %eax, %edi # encoding: [0x11,0xc7] +; X86-NEXT: addl %edi, %ebx # encoding: [0x01,0xfb] +; X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] ; X86-NEXT: vpcmpnltb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xd1,0x05] ; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] ; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] -; X86-NEXT: kmovd %k3, %ebp # encoding: [0xc5,0xfb,0x93,0xeb] +; X86-NEXT: kmovd %k3, %edi # encoding: [0xc5,0xfb,0x93,0xfb] ; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: addl %ebx, %ecx # encoding: [0x01,0xd9] -; X86-NEXT: adcl %edi, %ebp # encoding: [0x11,0xfd] +; X86-NEXT: kmovd %k2, %ebp # encoding: [0xc5,0xfb,0x93,0xea] +; X86-NEXT: addl %ebx, %ebp # encoding: [0x01,0xdd] +; X86-NEXT: adcl %eax, %edi # encoding: [0x11,0xc7] ; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xd1] ; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] ; X86-NEXT: kandd %k1, %k3, %k1 # encoding: [0xc4,0xe1,0xe5,0x41,0xc9] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: kandd %k0, %k2, %k0 # encoding: [0xc4,0xe1,0xed,0x41,0xc0] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] -; X86-NEXT: adcl %ebp, %edx # encoding: [0x11,0xea] +; X86-NEXT: addl %ebp, %eax # encoding: [0x01,0xe8] +; X86-NEXT: adcl %edi, %edx # encoding: [0x11,0xfa] ; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x18] +; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] ; X86-NEXT: popl %esi # encoding: [0x5e] ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi # encoding: [0x5f] @@ -2072,32 +2072,32 @@ ; X86-NEXT: .cfi_offset %edi, -8 ; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 # 
encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X86-NEXT: vpcmpltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x01] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] +; X86-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %ecx, %esi # encoding: [0x01,0xce] -; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] +; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] +; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] ; X86-NEXT: vpcmpleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x02] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: addl %esi, %ecx # encoding: [0x01,0xf1] -; X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] +; X86-NEXT: addl %esi, %edx # encoding: [0x01,0xf2] +; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] ; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] +; X86-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %ecx, %esi # encoding: [0x01,0xce] -; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] +; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] +; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] ; X86-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x05] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] ; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] ; X86-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] ; X86-NEXT: addl %esi, %edi # encoding: [0x01,0xf7] -; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] +; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] ; X86-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x06] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] @@ -2168,57 +2168,57 @@ ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x14] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x18] +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x18] ; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xd1] ; X86-NEXT: kmovd %esi, %k0 # encoding: [0xc5,0xfb,0x92,0xc6] -; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] ; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] ; X86-NEXT: kandd %k1, %k3, %k3 # encoding: 
[0xc4,0xe1,0xe5,0x41,0xd9] -; X86-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] +; X86-NEXT: kmovd %k3, %edx # encoding: [0xc5,0xfb,0x93,0xd3] ; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] -; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2] +; X86-NEXT: kmovd %k2, %edi # encoding: [0xc5,0xfb,0x93,0xfa] ; X86-NEXT: vpcmpltub %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xd1,0x01] ; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] ; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] -; X86-NEXT: kmovd %k3, %edi # encoding: [0xc5,0xfb,0x93,0xfb] +; X86-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] ; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] ; X86-NEXT: kmovd %k2, %ebx # encoding: [0xc5,0xfb,0x93,0xda] -; X86-NEXT: addl %edx, %ebx # encoding: [0x01,0xd3] -; X86-NEXT: adcl %eax, %edi # encoding: [0x11,0xc7] +; X86-NEXT: addl %edi, %ebx # encoding: [0x01,0xfb] +; X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] ; X86-NEXT: vpcmpleub %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xd1,0x02] ; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] ; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] -; X86-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] +; X86-NEXT: kmovd %k3, %edx # encoding: [0xc5,0xfb,0x93,0xd3] ; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] -; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2] -; X86-NEXT: addl %ebx, %edx # encoding: [0x01,0xda] -; X86-NEXT: adcl %edi, %eax # encoding: [0x11,0xf8] +; X86-NEXT: kmovd %k2, %edi # encoding: [0xc5,0xfb,0x93,0xfa] +; X86-NEXT: addl %ebx, %edi # encoding: [0x01,0xdf] +; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] ; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xd1,0x04] ; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] ; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] -; X86-NEXT: kmovd %k3, %edi # encoding: [0xc5,0xfb,0x93,0xfb] +; X86-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] ; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] ; X86-NEXT: kmovd %k2, %ebx # encoding: [0xc5,0xfb,0x93,0xda] -; X86-NEXT: addl %edx, %ebx # encoding: [0x01,0xd3] -; X86-NEXT: adcl %eax, %edi # encoding: [0x11,0xc7] +; X86-NEXT: addl %edi, %ebx # encoding: [0x01,0xfb] +; X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] ; X86-NEXT: vpcmpnltub %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xd1,0x05] ; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] ; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] -; X86-NEXT: kmovd %k3, %ebp # encoding: [0xc5,0xfb,0x93,0xeb] +; X86-NEXT: kmovd %k3, %edi # encoding: [0xc5,0xfb,0x93,0xfb] ; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: addl %ebx, %ecx # encoding: [0x01,0xd9] -; X86-NEXT: adcl %edi, %ebp # encoding: [0x11,0xfd] +; X86-NEXT: kmovd %k2, %ebp # encoding: [0xc5,0xfb,0x93,0xea] +; X86-NEXT: addl %ebx, %ebp # encoding: [0x01,0xdd] +; X86-NEXT: adcl %eax, %edi # encoding: [0x11,0xc7] ; X86-NEXT: vpcmpnleub %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xd1,0x06] ; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] ; X86-NEXT: kandd %k1, %k3, %k1 # encoding: 
[0xc4,0xe1,0xe5,0x41,0xc9] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: kandd %k0, %k2, %k0 # encoding: [0xc4,0xe1,0xed,0x41,0xc0] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] -; X86-NEXT: adcl %ebp, %edx # encoding: [0x11,0xea] +; X86-NEXT: addl %ebp, %eax # encoding: [0x01,0xe8] +; X86-NEXT: adcl %edi, %edx # encoding: [0x11,0xfa] ; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x18] +; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] ; X86-NEXT: popl %esi # encoding: [0x5e] ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi # encoding: [0x5f] Index: llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll =================================================================== --- llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -4810,9 +4810,9 @@ ; X86-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x64,0xc0] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: vpcmpleb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x02] ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] +; X86-NEXT: vpcmpleb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X86-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x04] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] ; X86-NEXT: vpcmpnltb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x05] @@ -4825,11 +4825,11 @@ ; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] ; X86-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08] ; X86-NEXT: # xmm0 = xmm0[0,1,2],xmm1[3] -; X86-NEXT: vmovd %ecx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc9] +; X86-NEXT: vmovd %edx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xca] ; X86-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] ; X86-NEXT: vpunpckldq %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9] ; X86-NEXT: # xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-NEXT: vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] ; X86-NEXT: vpunpcklqdq %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xca] ; X86-NEXT: # xmm1 = xmm1[0],xmm2[0] ; X86-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] @@ -5004,9 +5004,9 @@ ; X86-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: vpcmpltub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x01] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: vpcmpleub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x02] ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] +; X86-NEXT: vpcmpleub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X86-NEXT: 
vpcmpneqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x04] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] ; X86-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x05] @@ -5019,11 +5019,11 @@ ; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] ; X86-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08] ; X86-NEXT: # xmm0 = xmm0[0,1,2],xmm1[3] -; X86-NEXT: vmovd %ecx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc9] +; X86-NEXT: vmovd %edx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xca] ; X86-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] ; X86-NEXT: vpunpckldq %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9] ; X86-NEXT: # xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-NEXT: vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] ; X86-NEXT: vpunpcklqdq %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xca] ; X86-NEXT: # xmm1 = xmm1[0],xmm2[0] ; X86-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] Index: llvm/test/CodeGen/X86/bitreverse.ll =================================================================== --- llvm/test/CodeGen/X86/bitreverse.ll +++ llvm/test/CodeGen/X86/bitreverse.ll @@ -651,8 +651,7 @@ ; X86-NEXT: andl $1431655765, %ebx # imm = 0x55555555 ; X86-NEXT: andl $-1431655766, %edi # imm = 0xAAAAAAAA ; X86-NEXT: shrl %edi -; X86-NEXT: leal (%edi,%ebx,2), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal (%edi,%ebx,2), %ebx ; X86-NEXT: bswapl %esi ; X86-NEXT: movl %esi, %edi ; X86-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F @@ -669,7 +668,8 @@ ; X86-NEXT: andl $1431655765, %edi # imm = 0x55555555 ; X86-NEXT: andl $-1431655766, %esi # imm = 0xAAAAAAAA ; X86-NEXT: shrl %esi -; X86-NEXT: leal (%esi,%edi,2), %ebx +; X86-NEXT: leal (%esi,%edi,2), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: bswapl %edx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F @@ -932,13 +932,13 @@ ; X86-NEXT: shrl %eax ; X86-NEXT: leal (%eax,%ecx,2), %edx ; X86-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-NEXT: shrdl $16, %ebx, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shrdl $16, %eax, %esi -; X86-NEXT: shrdl $16, %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shrdl $16, %ecx, %ebx +; X86-NEXT: shrdl $16, %eax, %ebx ; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shrdl $16, %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shrdl $16, %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -996,9 +996,9 @@ ; X86-NEXT: movl %ecx, 16(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, 
4(%eax) ; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: shrl $16, %edx @@ -1170,7 +1170,7 @@ ; X64-NEXT: andq %rbx, %rdx ; X64-NEXT: andq %r15, %rax ; X64-NEXT: shrq %rax -; X64-NEXT: leaq (%rax,%rdx,2), %rax +; X64-NEXT: leaq (%rax,%rdx,2), %rdx ; X64-NEXT: bswapq %rsi ; X64-NEXT: andq %rsi, %r13 ; X64-NEXT: andq %rdi, %rsi @@ -1180,11 +1180,11 @@ ; X64-NEXT: andq %rsi, %r11 ; X64-NEXT: andq %r14, %rsi ; X64-NEXT: shrq $2, %rsi -; X64-NEXT: leaq (%rsi,%r11,4), %rdx -; X64-NEXT: andq %rdx, %rbx -; X64-NEXT: andq %r15, %rdx -; X64-NEXT: shrq %rdx -; X64-NEXT: leaq (%rdx,%rbx,2), %rdx +; X64-NEXT: leaq (%rsi,%r11,4), %rax +; X64-NEXT: andq %rax, %rbx +; X64-NEXT: andq %r15, %rax +; X64-NEXT: shrq %rax +; X64-NEXT: leaq (%rax,%rbx,2), %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NEXT: shrdq $48, %rdi, %rsi @@ -1193,9 +1193,9 @@ ; X64-NEXT: shrdq $48, %r9, %r10 ; X64-NEXT: shrdq $48, %r8, %r9 ; X64-NEXT: shrdq $48, %rcx, %r8 -; X64-NEXT: shrdq $48, %rax, %rcx -; X64-NEXT: shrdq $48, %rdx, %rax -; X64-NEXT: movq %rax, 56(%r12) +; X64-NEXT: shrdq $48, %rdx, %rcx +; X64-NEXT: shrdq $48, %rax, %rdx +; X64-NEXT: movq %rdx, 56(%r12) ; X64-NEXT: movq %rcx, 48(%r12) ; X64-NEXT: movq %r8, 40(%r12) ; X64-NEXT: movq %r9, 32(%r12) @@ -1203,8 +1203,8 @@ ; X64-NEXT: movq %rbp, 16(%r12) ; X64-NEXT: movq %rdi, 8(%r12) ; X64-NEXT: movq %rsi, (%r12) -; X64-NEXT: shrq $48, %rdx -; X64-NEXT: movw %dx, 64(%r12) +; X64-NEXT: shrq $48, %rax +; X64-NEXT: movw %ax, 64(%r12) ; X64-NEXT: movq %r12, %rax ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 Index: llvm/test/CodeGen/X86/bool-vector.ll =================================================================== --- llvm/test/CodeGen/X86/bool-vector.ll +++ llvm/test/CodeGen/X86/bool-vector.ll @@ -10,18 +10,18 @@ ; X86-LABEL: PR15215_bad: ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al -; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movb {{[0-9]+}}(%esp), %dl +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movb {{[0-9]+}}(%esp), %ah ; X86-NEXT: addb %ah, %ah -; X86-NEXT: andb $1, %dl -; X86-NEXT: orb %ah, %dl -; X86-NEXT: shlb $2, %dl -; X86-NEXT: addb %cl, %cl +; X86-NEXT: andb $1, %cl +; X86-NEXT: orb %ah, %cl +; X86-NEXT: shlb $2, %cl +; X86-NEXT: addb %dl, %dl ; X86-NEXT: andb $1, %al -; X86-NEXT: orb %cl, %al -; X86-NEXT: andb $3, %al ; X86-NEXT: orb %dl, %al +; X86-NEXT: andb $3, %al +; X86-NEXT: orb %cl, %al ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: andl $15, %eax ; X86-NEXT: retl Index: llvm/test/CodeGen/X86/bswap.ll =================================================================== --- llvm/test/CodeGen/X86/bswap.ll +++ llvm/test/CodeGen/X86/bswap.ll @@ -277,14 +277,10 @@ ; CHECK-NEXT: bswapl %ebp ; CHECK-NEXT: shrdl $16, %ebp, %ebx ; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: bswapl %ecx -; CHECK-NEXT: shrdl $16, %ecx, %ebp -; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: bswapl %eax -; CHECK-NEXT: shrdl $16, %eax, %ecx -; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: shrdl $16, %eax, %ebp +; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: bswapl %ecx ; CHECK-NEXT: shrdl $16, %ecx, %eax @@ -293,10 +289,14 @@ ; CHECK-NEXT: bswapl %eax ; CHECK-NEXT: shrdl $16, %eax, %ecx ; CHECK-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: bswapl %ecx +; CHECK-NEXT: shrdl $16, %ecx, %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp ; CHECK-NEXT: bswapl %ebp -; CHECK-NEXT: shrdl $16, %ebp, %eax -; CHECK-NEXT: movl %eax, (%esp) # 4-byte Spill +; CHECK-NEXT: shrdl $16, %ebp, %ecx +; CHECK-NEXT: movl %ecx, (%esp) # 4-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx ; CHECK-NEXT: bswapl %ebx ; CHECK-NEXT: shrdl $16, %ebx, %ebp Index: llvm/test/CodeGen/X86/build-vector-128.ll =================================================================== --- llvm/test/CodeGen/X86/build-vector-128.ll +++ llvm/test/CodeGen/X86/build-vector-128.ll @@ -252,29 +252,29 @@ ; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-32-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-32-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-32-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-32-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-32-NEXT: retl ; ; SSE2-64-LABEL: test_buildvector_v16i8: Index: llvm/test/CodeGen/X86/combine-sbb.ll =================================================================== --- llvm/test/CodeGen/X86/combine-sbb.ll +++ llvm/test/CodeGen/X86/combine-sbb.ll @@ -63,23 +63,23 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl (%edx), %esi -; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: subl (%ecx), %esi -; X86-NEXT: sbbl 4(%ecx), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl (%edi), %ecx +; X86-NEXT: movl 4(%edi), %edx +; X86-NEXT: subl (%esi), %ecx +; X86-NEXT: sbbl 4(%esi), %edx ; X86-NEXT: setb %bl -; X86-NEXT: movl 12(%edx), %ebp -; X86-NEXT: movl 8(%edx), %edx -; X86-NEXT: subl 8(%ecx), %edx -; X86-NEXT: sbbl 12(%ecx), %ebp -; X86-NEXT: movzbl %bl, %ecx -; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl 12(%edi), %ebp +; X86-NEXT: movl 8(%edi), %edi +; X86-NEXT: subl 8(%esi), %edi +; X86-NEXT: sbbl 12(%esi), %ebp +; X86-NEXT: movzbl %bl, %esi +; X86-NEXT: subl %esi, %edi ; X86-NEXT: sbbl $0, %ebp -; X86-NEXT: movl %esi, (%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %edi, 8(%eax) ; X86-NEXT: movl %ebp, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi Index: llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll =================================================================== --- llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -124,10 +124,10 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: pushl %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: calll __divdi3 @@ -136,10 +136,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %ecx, 4(%edx) ; X86-NEXT: movl %eax, (%edx) -; X86-NEXT: imull %eax, %ebx -; X86-NEXT: mull %ebp -; X86-NEXT: addl %ebx, %edx -; X86-NEXT: imull %ebp, %ecx +; X86-NEXT: imull %eax, %ebp +; X86-NEXT: mull %ebx +; X86-NEXT: addl %ebp, %edx +; X86-NEXT: imull %ebx, %ecx ; X86-NEXT: addl %edx, %ecx ; X86-NEXT: subl %eax, %esi ; X86-NEXT: sbbl %ecx, %edi @@ -178,15 +178,13 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-8, %esp -; 
X86-NEXT: subl $48, %esp +; X86-NEXT: subl $40, %esp ; X86-NEXT: movl 44(%ebp), %edi -; X86-NEXT: movl 28(%ebp), %ecx ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl 40(%ebp) ; X86-NEXT: pushl 36(%ebp) ; X86-NEXT: pushl 32(%ebp) -; X86-NEXT: pushl %ecx -; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: pushl 28(%ebp) ; X86-NEXT: pushl 24(%ebp) ; X86-NEXT: pushl 20(%ebp) ; X86-NEXT: pushl 16(%ebp) @@ -194,18 +192,18 @@ ; X86-NEXT: pushl %eax ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edi, %edx ; X86-NEXT: movl %ecx, 12(%edi) ; X86-NEXT: movl %esi, 8(%edi) ; X86-NEXT: movl %eax, 4(%edi) -; X86-NEXT: movl %edx, (%edi) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: imull %ebx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %ebx, (%edx) +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: imull %eax, %ecx ; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %ecx, %edx @@ -213,40 +211,38 @@ ; X86-NEXT: addl %edx, %esi ; X86-NEXT: movl 36(%ebp), %eax ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: imull %ebx, %ecx -; X86-NEXT: mull %edi +; X86-NEXT: imull %edi, %ecx +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl 40(%ebp), %edi -; X86-NEXT: imull %eax, %edi -; X86-NEXT: addl %edx, %edi +; X86-NEXT: movl 40(%ebp), %eax +; X86-NEXT: imull %ebx, %eax +; X86-NEXT: addl %edx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl 28(%ebp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl 28(%ebp), %ecx ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull 32(%ebp) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %esi, %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: setb %cl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull 32(%ebp) -; X86-NEXT: addl %edi, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload @@ -254,7 +250,7 @@ ; X86-NEXT: movl 12(%ebp), %ecx ; X86-NEXT: subl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl 16(%ebp), %esi -; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: sbbl (%esp), %esi # 4-byte Folded Reload ; X86-NEXT: movl 20(%ebp), %edi ; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: movl 24(%ebp), %ebx @@ -386,35 +382,35 @@ ; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; X86-NEXT: movd %edx, %xmm4 ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X86-NEXT: movd %esi, %xmm2 +; X86-NEXT: movd %esi, %xmm7 ; X86-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; X86-NEXT: movd %edi, %xmm5 -; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; X86-NEXT: movd %edi, %xmm2 +; X86-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; X86-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; X86-NEXT: movd %ebx, %xmm4 ; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movd %ecx, %xmm6 +; X86-NEXT: movd %ecx, %xmm5 ; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movd %eax, %xmm5 -; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; X86-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; X86-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; X86-NEXT: movdqa %xmm5, %xmm2 -; X86-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X86-NEXT: movdqa %xmm2, (%ecx) +; X86-NEXT: movd %eax, %xmm2 +; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; X86-NEXT: movdqa %xmm2, %xmm4 +; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; X86-NEXT: movdqa %xmm4, (%ecx) ; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: movdqa %xmm1, %xmm2 -; X86-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; X86-NEXT: pmullw %xmm3, %xmm2 +; X86-NEXT: movdqa %xmm1, %xmm4 +; X86-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X86-NEXT: pmullw %xmm3, %xmm4 ; X86-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; X86-NEXT: pand %xmm3, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-NEXT: pand %xmm3, %xmm4 +; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: pmullw %xmm5, %xmm1 +; X86-NEXT: pmullw %xmm2, %xmm1 ; X86-NEXT: pand %xmm3, %xmm1 -; X86-NEXT: packuswb %xmm2, %xmm1 +; X86-NEXT: packuswb %xmm4, %xmm1 ; X86-NEXT: psubb %xmm1, %xmm0 ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi @@ -585,22 +581,22 @@ ; X86-NEXT: cwtd ; X86-NEXT: idivw %si ; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: movd %eax, %xmm3 +; X86-NEXT: movd %eax, %xmm4 ; X86-NEXT: pextrw $2, %xmm0, %eax ; X86-NEXT: pextrw $2, %xmm1, %esi ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: cwtd ; X86-NEXT: idivw %si ; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: movd %eax, %xmm4 -; X86-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; X86-NEXT: movd %eax, %xmm3 +; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; X86-NEXT: pextrw $1, %xmm0, %eax ; X86-NEXT: pextrw $1, %xmm1, %esi ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: cwtd ; X86-NEXT: idivw %si ; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: movd %eax, %xmm3 +; X86-NEXT: movd %eax, %xmm4 ; X86-NEXT: movd %xmm0, %eax ; X86-NEXT: movd %xmm1, %esi ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -608,8 +604,8 @@ ; X86-NEXT: idivw %si ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: movd %eax, %xmm5 -; X86-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; X86-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; X86-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; X86-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; X86-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; X86-NEXT: movdqa %xmm5, (%ecx) ; X86-NEXT: pmullw %xmm1, %xmm5 @@ -704,20 +700,20 @@ ; X86-NEXT: movd %xmm2, %esi ; X86-NEXT: cltd ; X86-NEXT: idivl %esi -; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X86-NEXT: movd %xmm3, %eax -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X86-NEXT: movd %xmm3, %esi +; X86-NEXT: movd %eax, %xmm3 +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X86-NEXT: movd %xmm2, %eax +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; X86-NEXT: movd %xmm2, %esi ; X86-NEXT: cltd ; X86-NEXT: idivl %esi -; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X86-NEXT: movd %eax, %xmm2 +; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X86-NEXT: movd %xmm0, %eax ; X86-NEXT: movd %xmm1, %esi ; X86-NEXT: cltd ; X86-NEXT: idivl %esi -; X86-NEXT: movd %eax, %xmm2 +; X86-NEXT: movd %eax, %xmm3 ; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] ; X86-NEXT: movd %xmm4, %eax ; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] @@ -725,17 +721,17 @@ ; X86-NEXT: cltd ; X86-NEXT: idivl %esi ; X86-NEXT: movd %eax, %xmm4 -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; X86-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X86-NEXT: movdqa %xmm2, (%ecx) -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; X86-NEXT: pmuludq %xmm1, %xmm2 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; X86-NEXT: movdqa %xmm3, (%ecx) +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-NEXT: pmuludq %xmm1, %xmm3 +; X86-NEXT: pshufd {{.*#+}} 
xmm3 = xmm3[0,2,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-NEXT: pmuludq %xmm3, %xmm1 +; X86-NEXT: pmuludq %xmm2, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-NEXT: psubd %xmm2, %xmm0 +; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X86-NEXT: psubd %xmm3, %xmm0 ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -817,25 +813,25 @@ ; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __divdi3 -; X86-NEXT: movd %edx, %xmm0 -; X86-NEXT: movd %eax, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-NEXT: movd %edx, %xmm1 +; X86-NEXT: movd %eax, %xmm3 +; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X86-NEXT: movdqa %xmm1, (%esi) -; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload -; X86-NEXT: movdqa %xmm3, %xmm0 -; X86-NEXT: psrlq $32, %xmm0 -; X86-NEXT: pmuludq %xmm1, %xmm0 -; X86-NEXT: movdqa %xmm1, %xmm2 +; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; X86-NEXT: movdqa %xmm3, (%esi) +; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: psrlq $32, %xmm1 +; X86-NEXT: pmuludq %xmm3, %xmm1 +; X86-NEXT: movdqa %xmm3, %xmm2 ; X86-NEXT: psrlq $32, %xmm2 -; X86-NEXT: pmuludq %xmm3, %xmm2 -; X86-NEXT: paddq %xmm0, %xmm2 +; X86-NEXT: pmuludq %xmm0, %xmm2 +; X86-NEXT: paddq %xmm1, %xmm2 ; X86-NEXT: psllq $32, %xmm2 -; X86-NEXT: pmuludq %xmm3, %xmm1 -; X86-NEXT: paddq %xmm2, %xmm1 +; X86-NEXT: pmuludq %xmm0, %xmm3 +; X86-NEXT: paddq %xmm2, %xmm3 ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: psubq %xmm1, %xmm0 +; X86-NEXT: psubq %xmm3, %xmm0 ; X86-NEXT: addl $64, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl Index: llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll =================================================================== --- llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -124,10 +124,10 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: pushl %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: calll __udivdi3 @@ -136,10 +136,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %ecx, 4(%edx) ; X86-NEXT: movl %eax, (%edx) -; X86-NEXT: imull %eax, %ebx -; X86-NEXT: mull %ebp -; X86-NEXT: addl %ebx, %edx -; X86-NEXT: imull %ebp, %ecx +; X86-NEXT: imull %eax, %ebp +; X86-NEXT: mull %ebx +; X86-NEXT: addl %ebp, %edx +; X86-NEXT: imull %ebx, %ecx ; X86-NEXT: addl %edx, %ecx ; X86-NEXT: subl %eax, %esi ; X86-NEXT: sbbl %ecx, %edi @@ -178,15 +178,13 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $48, %esp +; X86-NEXT: subl $40, %esp ; X86-NEXT: movl 44(%ebp), %edi -; X86-NEXT: movl 28(%ebp), %ecx ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl 40(%ebp) ; X86-NEXT: pushl 36(%ebp) ; X86-NEXT: pushl 32(%ebp) -; X86-NEXT: pushl %ecx -; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: pushl 28(%ebp) 
; X86-NEXT: pushl 24(%ebp) ; X86-NEXT: pushl 20(%ebp) ; X86-NEXT: pushl 16(%ebp) @@ -194,18 +192,18 @@ ; X86-NEXT: pushl %eax ; X86-NEXT: calll __udivti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edi, %edx ; X86-NEXT: movl %ecx, 12(%edi) ; X86-NEXT: movl %esi, 8(%edi) ; X86-NEXT: movl %eax, 4(%edi) -; X86-NEXT: movl %edx, (%edi) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: imull %ebx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %ebx, (%edx) +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: imull %eax, %ecx ; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %ecx, %edx @@ -213,40 +211,38 @@ ; X86-NEXT: addl %edx, %esi ; X86-NEXT: movl 36(%ebp), %eax ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: imull %ebx, %ecx -; X86-NEXT: mull %edi +; X86-NEXT: imull %edi, %ecx +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl 40(%ebp), %edi -; X86-NEXT: imull %eax, %edi -; X86-NEXT: addl %edx, %edi +; X86-NEXT: movl 40(%ebp), %eax +; X86-NEXT: imull %ebx, %eax +; X86-NEXT: addl %edx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl 28(%ebp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl 28(%ebp), %ecx ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull 32(%ebp) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %esi, %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: setb %cl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull 32(%ebp) -; X86-NEXT: addl %edi, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload @@ -254,7 +250,7 @@ ; X86-NEXT: movl 12(%ebp), %ecx ; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl 16(%ebp), %esi -; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: sbbl (%esp), %esi # 4-byte Folded Reload ; X86-NEXT: movl 20(%ebp), %edi ; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: movl 24(%ebp), %ebx @@ -386,35 +382,35 @@ ; X86-NEXT: punpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; X86-NEXT: movd %edx, %xmm4 ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X86-NEXT: movd %esi, %xmm2 +; X86-NEXT: movd %esi, %xmm7 ; X86-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; X86-NEXT: movd %edi, %xmm5 -; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; X86-NEXT: movd %edi, %xmm2 +; X86-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; X86-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; X86-NEXT: movd %ebx, %xmm4 ; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movd %ecx, %xmm6 +; X86-NEXT: movd %ecx, %xmm5 ; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movd %eax, %xmm5 -; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; X86-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; X86-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; X86-NEXT: movdqa %xmm5, %xmm2 -; X86-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X86-NEXT: movdqa %xmm2, (%ecx) +; X86-NEXT: movd %eax, %xmm2 +; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; X86-NEXT: movdqa %xmm2, %xmm4 +; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; X86-NEXT: movdqa %xmm4, (%ecx) ; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: movdqa %xmm1, %xmm2 -; X86-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; X86-NEXT: pmullw %xmm3, %xmm2 +; X86-NEXT: movdqa %xmm1, %xmm4 +; X86-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X86-NEXT: pmullw %xmm3, %xmm4 ; X86-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; X86-NEXT: pand %xmm3, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-NEXT: pand %xmm3, %xmm4 +; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: pmullw %xmm5, %xmm1 +; X86-NEXT: pmullw %xmm2, %xmm1 ; X86-NEXT: pand %xmm3, %xmm1 -; X86-NEXT: packuswb %xmm2, %xmm1 +; X86-NEXT: packuswb %xmm4, %xmm1 ; X86-NEXT: psubb %xmm1, %xmm0 ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi @@ -585,22 +581,22 @@ ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divw 
%si ; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: movd %eax, %xmm3 +; X86-NEXT: movd %eax, %xmm4 ; X86-NEXT: pextrw $2, %xmm0, %eax ; X86-NEXT: pextrw $2, %xmm1, %esi ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divw %si ; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: movd %eax, %xmm4 -; X86-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; X86-NEXT: movd %eax, %xmm3 +; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; X86-NEXT: pextrw $1, %xmm0, %eax ; X86-NEXT: pextrw $1, %xmm1, %esi ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divw %si ; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: movd %eax, %xmm3 +; X86-NEXT: movd %eax, %xmm4 ; X86-NEXT: movd %xmm0, %eax ; X86-NEXT: movd %xmm1, %esi ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -608,8 +604,8 @@ ; X86-NEXT: divw %si ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: movd %eax, %xmm5 -; X86-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; X86-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; X86-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; X86-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; X86-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; X86-NEXT: movdqa %xmm5, (%ecx) ; X86-NEXT: pmullw %xmm1, %xmm5 @@ -704,20 +700,20 @@ ; X86-NEXT: movd %xmm2, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi -; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X86-NEXT: movd %xmm3, %eax -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X86-NEXT: movd %xmm3, %esi +; X86-NEXT: movd %eax, %xmm3 +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X86-NEXT: movd %xmm2, %eax +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; X86-NEXT: movd %xmm2, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi -; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X86-NEXT: movd %eax, %xmm2 +; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X86-NEXT: movd %xmm0, %eax ; X86-NEXT: movd %xmm1, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi -; X86-NEXT: movd %eax, %xmm2 +; X86-NEXT: movd %eax, %xmm3 ; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] ; X86-NEXT: movd %xmm4, %eax ; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] @@ -725,17 +721,17 @@ ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi ; X86-NEXT: movd %eax, %xmm4 -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; X86-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X86-NEXT: movdqa %xmm2, (%ecx) -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; X86-NEXT: pmuludq %xmm1, %xmm2 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; X86-NEXT: movdqa %xmm3, (%ecx) +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-NEXT: pmuludq %xmm1, %xmm3 +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-NEXT: pmuludq %xmm3, %xmm1 +; X86-NEXT: pmuludq %xmm2, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-NEXT: psubd %xmm2, 
%xmm0 +; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X86-NEXT: psubd %xmm3, %xmm0 ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -817,25 +813,25 @@ ; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-NEXT: calll __udivdi3 -; X86-NEXT: movd %edx, %xmm0 -; X86-NEXT: movd %eax, %xmm1 -; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-NEXT: movd %edx, %xmm1 +; X86-NEXT: movd %eax, %xmm3 +; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X86-NEXT: movdqa %xmm1, (%esi) -; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload -; X86-NEXT: movdqa %xmm3, %xmm0 -; X86-NEXT: psrlq $32, %xmm0 -; X86-NEXT: pmuludq %xmm1, %xmm0 -; X86-NEXT: movdqa %xmm1, %xmm2 +; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; X86-NEXT: movdqa %xmm3, (%esi) +; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: psrlq $32, %xmm1 +; X86-NEXT: pmuludq %xmm3, %xmm1 +; X86-NEXT: movdqa %xmm3, %xmm2 ; X86-NEXT: psrlq $32, %xmm2 -; X86-NEXT: pmuludq %xmm3, %xmm2 -; X86-NEXT: paddq %xmm0, %xmm2 +; X86-NEXT: pmuludq %xmm0, %xmm2 +; X86-NEXT: paddq %xmm1, %xmm2 ; X86-NEXT: psllq $32, %xmm2 -; X86-NEXT: pmuludq %xmm3, %xmm1 -; X86-NEXT: paddq %xmm2, %xmm1 +; X86-NEXT: pmuludq %xmm0, %xmm3 +; X86-NEXT: paddq %xmm2, %xmm3 ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-NEXT: psubq %xmm1, %xmm0 +; X86-NEXT: psubq %xmm3, %xmm0 ; X86-NEXT: addl $64, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl Index: llvm/test/CodeGen/X86/fp128-cast.ll =================================================================== --- llvm/test/CodeGen/X86/fp128-cast.ll +++ llvm/test/CodeGen/X86/fp128-cast.ll @@ -1139,19 +1139,19 @@ ; X32-NEXT: subl $20, %esp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi ; X32-NEXT: subl $12, %esp -; X32-NEXT: leal {{[0-9]+}}(%esp), %edi +; X32-NEXT: leal {{[0-9]+}}(%esp), %edx +; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: pushl %edx ; X32-NEXT: pushl %ecx ; X32-NEXT: pushl %eax +; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: pushl %edx ; X32-NEXT: pushl %ecx ; X32-NEXT: pushl %eax -; X32-NEXT: pushl %edi +; X32-NEXT: pushl %edx ; X32-NEXT: calll __multf3 ; X32-NEXT: addl $44, %esp ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx Index: llvm/test/CodeGen/X86/fptosi-sat-scalar.ll =================================================================== --- llvm/test/CodeGen/X86/fptosi-sat-scalar.ll +++ llvm/test/CodeGen/X86/fptosi-sat-scalar.ll @@ -971,21 +971,21 @@ ; X86-SSE-NEXT: cmovbl %ecx, %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SSE-NEXT: cmovbl %ecx, %edi -; X86-SSE-NEXT: movl $-2147483648, %ebx # imm = 0x80000000 -; X86-SSE-NEXT: cmovael {{[0-9]+}}(%esp), %ebx +; X86-SSE-NEXT: movl $-2147483648, %ebp # imm = 0x80000000 +; X86-SSE-NEXT: cmovael {{[0-9]+}}(%esp), %ebp ; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: movl $2147483647, %ebp # imm = 0x7FFFFFFF -; X86-SSE-NEXT: cmovbel %ebx, %ebp -; X86-SSE-NEXT: movl $-1, %ebx -; X86-SSE-NEXT: cmoval %ebx, %edi -; X86-SSE-NEXT: cmoval %ebx, %edx -; X86-SSE-NEXT: cmoval %ebx, %eax +; 
X86-SSE-NEXT: movl $2147483647, %ebx # imm = 0x7FFFFFFF +; X86-SSE-NEXT: cmovbel %ebp, %ebx +; X86-SSE-NEXT: movl $-1, %ebp +; X86-SSE-NEXT: cmoval %ebp, %edi +; X86-SSE-NEXT: cmoval %ebp, %edx +; X86-SSE-NEXT: cmoval %ebp, %eax ; X86-SSE-NEXT: ucomiss %xmm0, %xmm0 ; X86-SSE-NEXT: cmovpl %ecx, %eax ; X86-SSE-NEXT: cmovpl %ecx, %edx ; X86-SSE-NEXT: cmovpl %ecx, %edi -; X86-SSE-NEXT: cmovpl %ecx, %ebp -; X86-SSE-NEXT: movl %ebp, 12(%esi) +; X86-SSE-NEXT: cmovpl %ecx, %ebx +; X86-SSE-NEXT: movl %ebx, 12(%esi) ; X86-SSE-NEXT: movl %edi, 8(%esi) ; X86-SSE-NEXT: movl %edx, 4(%esi) ; X86-SSE-NEXT: movl %eax, (%esi) @@ -1985,21 +1985,21 @@ ; X86-SSE-NEXT: cmovbl %ecx, %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SSE-NEXT: cmovbl %ecx, %edi -; X86-SSE-NEXT: movl $-2147483648, %ebx # imm = 0x80000000 -; X86-SSE-NEXT: cmovael {{[0-9]+}}(%esp), %ebx +; X86-SSE-NEXT: movl $-2147483648, %ebp # imm = 0x80000000 +; X86-SSE-NEXT: cmovael {{[0-9]+}}(%esp), %ebp ; X86-SSE-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: movl $2147483647, %ebp # imm = 0x7FFFFFFF -; X86-SSE-NEXT: cmovbel %ebx, %ebp -; X86-SSE-NEXT: movl $-1, %ebx -; X86-SSE-NEXT: cmoval %ebx, %edi -; X86-SSE-NEXT: cmoval %ebx, %edx -; X86-SSE-NEXT: cmoval %ebx, %eax +; X86-SSE-NEXT: movl $2147483647, %ebx # imm = 0x7FFFFFFF +; X86-SSE-NEXT: cmovbel %ebp, %ebx +; X86-SSE-NEXT: movl $-1, %ebp +; X86-SSE-NEXT: cmoval %ebp, %edi +; X86-SSE-NEXT: cmoval %ebp, %edx +; X86-SSE-NEXT: cmoval %ebp, %eax ; X86-SSE-NEXT: ucomisd %xmm0, %xmm0 ; X86-SSE-NEXT: cmovpl %ecx, %eax ; X86-SSE-NEXT: cmovpl %ecx, %edx ; X86-SSE-NEXT: cmovpl %ecx, %edi -; X86-SSE-NEXT: cmovpl %ecx, %ebp -; X86-SSE-NEXT: movl %ebp, 12(%esi) +; X86-SSE-NEXT: cmovpl %ecx, %ebx +; X86-SSE-NEXT: movl %ebx, 12(%esi) ; X86-SSE-NEXT: movl %edi, 8(%esi) ; X86-SSE-NEXT: movl %edx, 4(%esi) ; X86-SSE-NEXT: movl %eax, (%esi) @@ -3115,21 +3115,21 @@ ; X86-SSE-NEXT: cmovbl %ecx, %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SSE-NEXT: cmovbl %ecx, %edi -; X86-SSE-NEXT: movl $-2147483648, %ebx # imm = 0x80000000 -; X86-SSE-NEXT: cmovael {{[0-9]+}}(%esp), %ebx +; X86-SSE-NEXT: movl $-2147483648, %ebp # imm = 0x80000000 +; X86-SSE-NEXT: cmovael {{[0-9]+}}(%esp), %ebp ; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: movl $2147483647, %ebp # imm = 0x7FFFFFFF -; X86-SSE-NEXT: cmovbel %ebx, %ebp -; X86-SSE-NEXT: movl $-1, %ebx -; X86-SSE-NEXT: cmoval %ebx, %edi -; X86-SSE-NEXT: cmoval %ebx, %edx -; X86-SSE-NEXT: cmoval %ebx, %eax +; X86-SSE-NEXT: movl $2147483647, %ebx # imm = 0x7FFFFFFF +; X86-SSE-NEXT: cmovbel %ebp, %ebx +; X86-SSE-NEXT: movl $-1, %ebp +; X86-SSE-NEXT: cmoval %ebp, %edi +; X86-SSE-NEXT: cmoval %ebp, %edx +; X86-SSE-NEXT: cmoval %ebp, %eax ; X86-SSE-NEXT: ucomiss %xmm0, %xmm0 ; X86-SSE-NEXT: cmovpl %ecx, %eax ; X86-SSE-NEXT: cmovpl %ecx, %edx ; X86-SSE-NEXT: cmovpl %ecx, %edi -; X86-SSE-NEXT: cmovpl %ecx, %ebp -; X86-SSE-NEXT: movl %ebp, 12(%esi) +; X86-SSE-NEXT: cmovpl %ecx, %ebx +; X86-SSE-NEXT: movl %ebx, 12(%esi) ; X86-SSE-NEXT: movl %edi, 8(%esi) ; X86-SSE-NEXT: movl %edx, 4(%esi) ; X86-SSE-NEXT: movl %eax, (%esi) @@ -4459,24 +4459,24 @@ ; X86-SSE-NEXT: cmovbl %ecx, %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SSE-NEXT: cmovbl %ecx, %edi -; X86-SSE-NEXT: movl $-2147483648, %ebx # imm = 0x80000000 -; X86-SSE-NEXT: cmovael {{[0-9]+}}(%esp), %ebx +; X86-SSE-NEXT: movl $-2147483648, %ebp # imm = 0x80000000 +; X86-SSE-NEXT: cmovael {{[0-9]+}}(%esp), %ebp ; X86-SSE-NEXT: fldt {{\.?LCPI[0-9]+_[0-9]+}} ; X86-SSE-NEXT: fxch 
%st(1) ; X86-SSE-NEXT: fucomi %st(1), %st ; X86-SSE-NEXT: fstp %st(1) -; X86-SSE-NEXT: movl $2147483647, %ebp # imm = 0x7FFFFFFF -; X86-SSE-NEXT: cmovbel %ebx, %ebp -; X86-SSE-NEXT: movl $-1, %ebx -; X86-SSE-NEXT: cmoval %ebx, %edi -; X86-SSE-NEXT: cmoval %ebx, %edx -; X86-SSE-NEXT: cmoval %ebx, %eax +; X86-SSE-NEXT: movl $2147483647, %ebx # imm = 0x7FFFFFFF +; X86-SSE-NEXT: cmovbel %ebp, %ebx +; X86-SSE-NEXT: movl $-1, %ebp +; X86-SSE-NEXT: cmoval %ebp, %edi +; X86-SSE-NEXT: cmoval %ebp, %edx +; X86-SSE-NEXT: cmoval %ebp, %eax ; X86-SSE-NEXT: fucompi %st(0), %st ; X86-SSE-NEXT: cmovpl %ecx, %eax ; X86-SSE-NEXT: cmovpl %ecx, %edx ; X86-SSE-NEXT: cmovpl %ecx, %edi -; X86-SSE-NEXT: cmovpl %ecx, %ebp -; X86-SSE-NEXT: movl %ebp, 12(%esi) +; X86-SSE-NEXT: cmovpl %ecx, %ebx +; X86-SSE-NEXT: movl %ebx, 12(%esi) ; X86-SSE-NEXT: movl %edi, 8(%esi) ; X86-SSE-NEXT: movl %edx, 4(%esi) ; X86-SSE-NEXT: movl %eax, (%esi) Index: llvm/test/CodeGen/X86/funnel-shift-rot.ll =================================================================== --- llvm/test/CodeGen/X86/funnel-shift-rot.ll +++ llvm/test/CodeGen/X86/funnel-shift-rot.ll @@ -281,25 +281,25 @@ ; X32-SSE2-NEXT: pushl %edi ; X32-SSE2-NEXT: pushl %esi ; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-SSE2-NEXT: movl %edx, %edi -; X32-SSE2-NEXT: shrl %cl, %edi -; X32-SSE2-NEXT: movl %esi, %ebx -; X32-SSE2-NEXT: shrdl %cl, %edx, %ebx +; X32-SSE2-NEXT: movl %edx, %esi +; X32-SSE2-NEXT: shrl %cl, %esi +; X32-SSE2-NEXT: movl %ebx, %edi +; X32-SSE2-NEXT: shrdl %cl, %edx, %edi ; X32-SSE2-NEXT: xorl %ebp, %ebp ; X32-SSE2-NEXT: testb $32, %cl -; X32-SSE2-NEXT: cmovnel %edi, %ebx -; X32-SSE2-NEXT: cmovnel %ebp, %edi +; X32-SSE2-NEXT: cmovnel %esi, %edi +; X32-SSE2-NEXT: cmovnel %ebp, %esi ; X32-SSE2-NEXT: negb %cl -; X32-SSE2-NEXT: movl %esi, %eax +; X32-SSE2-NEXT: movl %ebx, %eax ; X32-SSE2-NEXT: shll %cl, %eax -; X32-SSE2-NEXT: shldl %cl, %esi, %edx +; X32-SSE2-NEXT: shldl %cl, %ebx, %edx ; X32-SSE2-NEXT: testb $32, %cl ; X32-SSE2-NEXT: cmovnel %eax, %edx ; X32-SSE2-NEXT: cmovnel %ebp, %eax -; X32-SSE2-NEXT: orl %ebx, %eax -; X32-SSE2-NEXT: orl %edi, %edx +; X32-SSE2-NEXT: orl %edi, %eax +; X32-SSE2-NEXT: orl %esi, %edx ; X32-SSE2-NEXT: popl %esi ; X32-SSE2-NEXT: popl %edi ; X32-SSE2-NEXT: popl %ebx Index: llvm/test/CodeGen/X86/funnel-shift.ll =================================================================== --- llvm/test/CodeGen/X86/funnel-shift.ll +++ llvm/test/CodeGen/X86/funnel-shift.ll @@ -231,26 +231,27 @@ ; X32-SSE2-NEXT: pushl {{[0-9]+}}(%esp) ; X32-SSE2-NEXT: calll __umoddi3 ; X32-SSE2-NEXT: addl $16, %esp -; X32-SSE2-NEXT: addb $27, %al ; X32-SSE2-NEXT: movl %eax, %edx -; X32-SSE2-NEXT: notb %dl -; X32-SSE2-NEXT: movl %edx, %ecx +; X32-SSE2-NEXT: addb $27, %dl +; X32-SSE2-NEXT: movl %edx, %eax +; X32-SSE2-NEXT: notb %al +; X32-SSE2-NEXT: movl %eax, %ecx ; X32-SSE2-NEXT: shldl %cl, %edi, %esi ; X32-SSE2-NEXT: shldl $27, %ebp, %ebx ; X32-SSE2-NEXT: shll $27, %ebp -; X32-SSE2-NEXT: movl %eax, %ecx +; X32-SSE2-NEXT: movl %edx, %ecx ; X32-SSE2-NEXT: shrdl %cl, %ebx, %ebp ; X32-SSE2-NEXT: shrl %cl, %ebx ; X32-SSE2-NEXT: xorl %ecx, %ecx -; X32-SSE2-NEXT: testb $32, %al +; X32-SSE2-NEXT: testb $32, %dl ; X32-SSE2-NEXT: cmovnel %ebx, %ebp ; X32-SSE2-NEXT: cmovnel %ecx, %ebx -; X32-SSE2-NEXT: xorl %eax, %eax -; X32-SSE2-NEXT: movl %edx, %ecx +; X32-SSE2-NEXT: xorl %edx, %edx +; X32-SSE2-NEXT: movl %eax, %ecx ; 
X32-SSE2-NEXT: shll %cl, %edi -; X32-SSE2-NEXT: testb $32, %dl +; X32-SSE2-NEXT: testb $32, %al ; X32-SSE2-NEXT: cmovnel %edi, %esi -; X32-SSE2-NEXT: cmovnel %eax, %edi +; X32-SSE2-NEXT: cmovnel %edx, %edi ; X32-SSE2-NEXT: orl %ebp, %edi ; X32-SSE2-NEXT: orl %ebx, %esi ; X32-SSE2-NEXT: movl %edi, %eax @@ -901,21 +902,21 @@ ; X32-SSE2-NEXT: pushl %edi ; X32-SSE2-NEXT: pushl %esi ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE2-NEXT: leal (%eax,%eax,2), %edx -; X32-SSE2-NEXT: movzwl 8(%ecx,%edx,4), %esi -; X32-SSE2-NEXT: movsbl 10(%ecx,%edx,4), %edi -; X32-SSE2-NEXT: movl %edi, %ebx -; X32-SSE2-NEXT: shll $16, %ebx -; X32-SSE2-NEXT: orl %esi, %ebx -; X32-SSE2-NEXT: movl 4(%ecx,%edx,4), %ecx -; X32-SSE2-NEXT: shrdl $8, %ebx, %ecx -; X32-SSE2-NEXT: xorl %eax, %ecx +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-SSE2-NEXT: leal (%eax,%eax,2), %edi +; X32-SSE2-NEXT: movzwl 8(%esi,%edi,4), %ebx +; X32-SSE2-NEXT: movsbl 10(%esi,%edi,4), %ecx +; X32-SSE2-NEXT: movl %ecx, %edx +; X32-SSE2-NEXT: shll $16, %edx +; X32-SSE2-NEXT: orl %ebx, %edx +; X32-SSE2-NEXT: movl 4(%esi,%edi,4), %esi +; X32-SSE2-NEXT: shrdl $8, %edx, %esi +; X32-SSE2-NEXT: xorl %eax, %esi ; X32-SSE2-NEXT: sarl $31, %eax -; X32-SSE2-NEXT: sarl $31, %edi -; X32-SSE2-NEXT: shldl $24, %ebx, %edi -; X32-SSE2-NEXT: xorl %eax, %edi -; X32-SSE2-NEXT: orl %edi, %ecx +; X32-SSE2-NEXT: sarl $31, %ecx +; X32-SSE2-NEXT: shldl $24, %edx, %ecx +; X32-SSE2-NEXT: xorl %eax, %ecx +; X32-SSE2-NEXT: orl %ecx, %esi ; X32-SSE2-NEXT: jne .LBB44_1 ; X32-SSE2-NEXT: # %bb.2: ; X32-SSE2-NEXT: popl %esi Index: llvm/test/CodeGen/X86/gather-addresses.ll =================================================================== --- llvm/test/CodeGen/X86/gather-addresses.ll +++ llvm/test/CodeGen/X86/gather-addresses.ll @@ -229,19 +229,19 @@ ; LIN32-NEXT: movl {{[0-9]+}}(%esp), %edx ; LIN32-NEXT: movdqa (%edx), %xmm0 ; LIN32-NEXT: pand (%ecx), %xmm0 -; LIN32-NEXT: movd %xmm0, %ecx -; LIN32-NEXT: pextrd $1, %xmm0, %edx -; LIN32-NEXT: pextrd $2, %xmm0, %esi +; LIN32-NEXT: movd %xmm0, %edx +; LIN32-NEXT: pextrd $1, %xmm0, %esi +; LIN32-NEXT: pextrd $2, %xmm0, %ecx ; LIN32-NEXT: pextrd $3, %xmm0, %edi -; LIN32-NEXT: andl %eax, %ecx ; LIN32-NEXT: andl %eax, %edx ; LIN32-NEXT: andl %eax, %esi +; LIN32-NEXT: andl %eax, %ecx ; LIN32-NEXT: andl %eax, %edi -; LIN32-NEXT: movd %edx, %xmm1 -; LIN32-NEXT: movd %ecx, %xmm0 +; LIN32-NEXT: movd %esi, %xmm1 +; LIN32-NEXT: movd %edx, %xmm0 ; LIN32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; LIN32-NEXT: movd %edi, %xmm2 -; LIN32-NEXT: movd %esi, %xmm1 +; LIN32-NEXT: movd %ecx, %xmm1 ; LIN32-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; LIN32-NEXT: popl %esi ; LIN32-NEXT: popl %edi Index: llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll =================================================================== --- llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll +++ llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll @@ -544,28 +544,28 @@ } define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind { -; SSE2-LABEL: vec_4xi32_nonsplat_eq: -; SSE2: # %bb.0: -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,1,16776960,2147483648] -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psrld %xmm2, %xmm4 -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: psrld %xmm2, %xmm5 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = 
xmm5[0],xmm4[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psrld %xmm2, %xmm4 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrld %xmm1, %xmm3 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3] -; SSE2-NEXT: andps %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: ret{{[l|q]}} +; X86-SSE2-LABEL: vec_4xi32_nonsplat_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,16776960,2147483648] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: psrld %xmm3, %xmm4 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: psrld %xmm3, %xmm5 +; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: psrld %xmm3, %xmm4 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; X86-SSE2-NEXT: psrld %xmm1, %xmm2 +; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; X86-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm2[0,3] +; X86-SSE2-NEXT: andps %xmm5, %xmm0 +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; X86-SSE2-NEXT: retl ; ; AVX2-LABEL: vec_4xi32_nonsplat_eq: ; AVX2: # %bb.0: @@ -575,6 +575,29 @@ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} +; +; X64-SSE2-LABEL: vec_4xi32_nonsplat_eq: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,1,16776960,2147483648] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE2-NEXT: psrld %xmm2, %xmm4 +; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X64-SSE2-NEXT: psrld %xmm2, %xmm5 +; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE2-NEXT: psrld %xmm2, %xmm4 +; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; X64-SSE2-NEXT: psrld %xmm1, %xmm3 +; X64-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; X64-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3] +; X64-SSE2-NEXT: andps %xmm5, %xmm0 +; X64-SSE2-NEXT: pxor %xmm1, %xmm1 +; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; X64-SSE2-NEXT: retq %t0 = lshr <4 x i32> , %y %t1 = and <4 x i32> %t0, %x %res = icmp eq <4 x i32> %t1, @@ -630,28 +653,28 @@ ret <4 x i1> %res } define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwind { -; SSE2-LABEL: vec_4xi32_nonsplat_undef1_eq: -; SSE2: # %bb.0: -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1] -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psrld %xmm2, %xmm4 -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: psrld %xmm2, %xmm5 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: 
psrld %xmm2, %xmm4 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrld %xmm1, %xmm3 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3] -; SSE2-NEXT: andps %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: ret{{[l|q]}} +; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef1_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: psrld %xmm3, %xmm4 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: psrld %xmm3, %xmm5 +; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: psrld %xmm3, %xmm4 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; X86-SSE2-NEXT: psrld %xmm1, %xmm2 +; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; X86-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm2[0,3] +; X86-SSE2-NEXT: andps %xmm5, %xmm0 +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; X86-SSE2-NEXT: retl ; ; AVX2-LABEL: vec_4xi32_nonsplat_undef1_eq: ; AVX2: # %bb.0: @@ -661,34 +684,57 @@ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} +; +; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef1_eq: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE2-NEXT: psrld %xmm2, %xmm4 +; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X64-SSE2-NEXT: psrld %xmm2, %xmm5 +; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE2-NEXT: psrld %xmm2, %xmm4 +; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; X64-SSE2-NEXT: psrld %xmm1, %xmm3 +; X64-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; X64-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3] +; X64-SSE2-NEXT: andps %xmm5, %xmm0 +; X64-SSE2-NEXT: pxor %xmm1, %xmm1 +; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; X64-SSE2-NEXT: retq %t0 = lshr <4 x i32> , %y %t1 = and <4 x i32> %t0, %x %res = icmp eq <4 x i32> %t1, ret <4 x i1> %res } define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwind { -; SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq: -; SSE2: # %bb.0: -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = <1,1,u,1> -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psrld %xmm2, %xmm4 -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: psrld %xmm2, %xmm5 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psrld %xmm2, %xmm4 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrld %xmm1, %xmm3 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = 
xmm5[0,3],xmm3[0,3] -; SSE2-NEXT: andps %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: ret{{[l|q]}} +; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <1,1,u,1> +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: psrld %xmm3, %xmm4 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: psrld %xmm3, %xmm5 +; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: psrld %xmm3, %xmm4 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; X86-SSE2-NEXT: psrld %xmm1, %xmm2 +; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; X86-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm2[0,3] +; X86-SSE2-NEXT: andps %xmm5, %xmm0 +; X86-SSE2-NEXT: pxor %xmm1, %xmm1 +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; X86-SSE2-NEXT: retl ; ; AVX2-LABEL: vec_4xi32_nonsplat_undef2_eq: ; AVX2: # %bb.0: @@ -698,6 +744,29 @@ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} +; +; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm3 = <1,1,u,1> +; X64-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE2-NEXT: psrld %xmm2, %xmm4 +; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X64-SSE2-NEXT: psrld %xmm2, %xmm5 +; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE2-NEXT: psrld %xmm2, %xmm4 +; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] +; X64-SSE2-NEXT: psrld %xmm1, %xmm3 +; X64-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; X64-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3] +; X64-SSE2-NEXT: andps %xmm5, %xmm0 +; X64-SSE2-NEXT: pxor %xmm1, %xmm1 +; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; X64-SSE2-NEXT: retq %t0 = lshr <4 x i32> , %y %t1 = and <4 x i32> %t0, %x %res = icmp eq <4 x i32> %t1, Index: llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll =================================================================== --- llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll +++ llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll @@ -346,16 +346,16 @@ ; X86-BMI1-NEXT: pushl %esi ; X86-BMI1-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-BMI1-NEXT: movl $1, %eax -; X86-BMI1-NEXT: xorl %edx, %edx ; X86-BMI1-NEXT: xorl %esi, %esi -; X86-BMI1-NEXT: shldl %cl, %eax, %esi +; X86-BMI1-NEXT: xorl %edx, %edx +; X86-BMI1-NEXT: shldl %cl, %eax, %edx ; X86-BMI1-NEXT: shll %cl, %eax ; X86-BMI1-NEXT: testb $32, %cl -; X86-BMI1-NEXT: cmovnel %eax, %esi -; X86-BMI1-NEXT: cmovnel %edx, %eax -; X86-BMI1-NEXT: andl {{[0-9]+}}(%esp), %esi +; X86-BMI1-NEXT: cmovnel %eax, %edx +; X86-BMI1-NEXT: cmovnel %esi, %eax +; X86-BMI1-NEXT: andl {{[0-9]+}}(%esp), %edx ; X86-BMI1-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-BMI1-NEXT: orl %esi, %eax +; X86-BMI1-NEXT: orl %edx, %eax ; X86-BMI1-NEXT: sete %al ; X86-BMI1-NEXT: popl %esi ; X86-BMI1-NEXT: retl Index: 
llvm/test/CodeGen/X86/horizontal-reduce-smax.ll =================================================================== --- llvm/test/CodeGen/X86/horizontal-reduce-smax.ll +++ llvm/test/CodeGen/X86/horizontal-reduce-smax.ll @@ -279,27 +279,27 @@ ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm2 +; X86-SSE2-NEXT: pandn %xmm1, %xmm0 +; X86-SSE2-NEXT: por %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: psrld $16, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -412,28 +412,28 @@ ; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] ; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm6, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: por %xmm3, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: pandn %xmm1, %xmm4 -; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm6, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: por %xmm4, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm3 +; X86-SSE2-NEXT: por %xmm0, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm5, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pand %xmm2, %xmm3 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: por %xmm3, %xmm2 ; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm2[1,1,1,1] ; X86-SSE2-NEXT: movd %xmm0, %edx @@ -844,19 +844,19 @@ ; X86-SSE2-NEXT: pand %xmm1, %xmm2 ; X86-SSE2-NEXT: pandn %xmm0, %xmm1 ; X86-SSE2-NEXT: por %xmm2, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pandn %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: psrld $16, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: psrlw $8, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 @@ -1050,32 +1050,32 @@ ; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm2, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pandn %xmm3, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm4, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm2 -; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; X86-SSE2-NEXT: por %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm5 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm5, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm0, %xmm5 +; X86-SSE2-NEXT: pandn %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm5, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: pxor %xmm0, %xmm4 +; X86-SSE2-NEXT: pxor %xmm1, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 ; X86-SSE2-NEXT: pcmpeqd %xmm2, 
%xmm4 @@ -1084,9 +1084,9 @@ ; X86-SSE2-NEXT: pand %xmm2, %xmm4 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] ; X86-SSE2-NEXT: por %xmm4, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 ; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; X86-SSE2-NEXT: movd %xmm0, %edx @@ -1628,27 +1628,27 @@ ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm2 +; X86-SSE2-NEXT: pandn %xmm1, %xmm0 +; X86-SSE2-NEXT: por %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: psrld $16, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -1997,27 +1997,27 @@ ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm2 +; X86-SSE2-NEXT: pandn %xmm1, %xmm0 +; X86-SSE2-NEXT: por %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: psrld $16, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: ## 
kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2125,27 +2125,27 @@ ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm2 +; X86-SSE2-NEXT: pandn %xmm1, %xmm0 +; X86-SSE2-NEXT: por %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: psrld $16, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; Index: llvm/test/CodeGen/X86/horizontal-reduce-smin.ll =================================================================== --- llvm/test/CodeGen/X86/horizontal-reduce-smin.ll +++ llvm/test/CodeGen/X86/horizontal-reduce-smin.ll @@ -281,27 +281,27 @@ ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm2 +; X86-SSE2-NEXT: pandn %xmm1, %xmm0 +; X86-SSE2-NEXT: por %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: psrld $16, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -414,28 +414,28 @@ ; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] ; X86-SSE2-NEXT: pcmpeqd %xmm3, 
%xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm6, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: por %xmm3, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: pandn %xmm1, %xmm4 -; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm6, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: por %xmm4, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm3 +; X86-SSE2-NEXT: por %xmm0, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm5, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pand %xmm2, %xmm3 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: por %xmm3, %xmm2 ; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; X86-SSE2-NEXT: movd %xmm0, %edx @@ -848,19 +848,19 @@ ; X86-SSE2-NEXT: pand %xmm1, %xmm2 ; X86-SSE2-NEXT: pandn %xmm0, %xmm1 ; X86-SSE2-NEXT: por %xmm2, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pandn %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: psrld $16, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: psrlw $8, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 @@ -1632,27 +1632,27 @@ ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm2 +; X86-SSE2-NEXT: pandn %xmm1, %xmm0 +; X86-SSE2-NEXT: por %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; 
X86-SSE2-NEXT: psrld $16, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2001,27 +2001,27 @@ ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm2 +; X86-SSE2-NEXT: pandn %xmm1, %xmm0 +; X86-SSE2-NEXT: por %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: psrld $16, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2129,27 +2129,27 @@ ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm2 +; X86-SSE2-NEXT: pandn %xmm1, %xmm0 +; X86-SSE2-NEXT: por %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: psrld $16, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pandn %xmm0, 
%xmm2 ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; Index: llvm/test/CodeGen/X86/horizontal-reduce-umax.ll =================================================================== --- llvm/test/CodeGen/X86/horizontal-reduce-umax.ll +++ llvm/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -146,25 +146,25 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; X86-SSE2-LABEL: test_reduce_v4i32: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE2-NEXT: pxor %xmm2, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 -; X86-SSE2-NEXT: pxor %xmm2, %xmm4 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 -; X86-SSE2-NEXT: pand %xmm3, %xmm0 -; X86-SSE2-NEXT: pandn %xmm1, %xmm3 -; X86-SSE2-NEXT: por %xmm0, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm3, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm3 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm3, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm3 +; X86-SSE2-NEXT: por %xmm1, %xmm3 +; X86-SSE2-NEXT: movd %xmm3, %eax ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: test_reduce_v4i32: @@ -476,28 +476,28 @@ ; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] ; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm6, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: por %xmm3, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: pandn %xmm1, %xmm4 -; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm6, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: por %xmm4, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm3 +; X86-SSE2-NEXT: por %xmm0, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; X86-SSE2-NEXT: pcmpeqd %xmm1, 
%xmm2 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm5, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pand %xmm2, %xmm3 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: por %xmm3, %xmm2 ; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; X86-SSE2-NEXT: movd %xmm0, %edx @@ -670,31 +670,31 @@ ; X86-SSE2-LABEL: test_reduce_v8i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE2-NEXT: pxor %xmm2, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE2-NEXT: pxor %xmm2, %xmm4 -; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: pandn %xmm1, %xmm4 -; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm4 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm4, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm3 +; X86-SSE2-NEXT: por %xmm0, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE2-NEXT: pxor %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm3 +; X86-SSE2-NEXT: pandn %xmm1, %xmm0 +; X86-SSE2-NEXT: por %xmm3, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 -; X86-SSE2-NEXT: pxor %xmm0, %xmm2 +; X86-SSE2-NEXT: pxor %xmm1, %xmm2 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; X86-SSE2-NEXT: pand %xmm3, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm3 -; X86-SSE2-NEXT: por %xmm1, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm3 +; X86-SSE2-NEXT: por %xmm0, %xmm3 ; X86-SSE2-NEXT: movd %xmm3, %eax ; X86-SSE2-NEXT: retl ; @@ -1145,32 +1145,32 @@ ; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm2, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pandn %xmm3, 
%xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm4, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm2 -; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; X86-SSE2-NEXT: por %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm5 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm5, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm0, %xmm5 +; X86-SSE2-NEXT: pandn %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm5, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: pxor %xmm0, %xmm4 +; X86-SSE2-NEXT: pxor %xmm1, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 ; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4 @@ -1179,9 +1179,9 @@ ; X86-SSE2-NEXT: pand %xmm2, %xmm4 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] ; X86-SSE2-NEXT: por %xmm4, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 ; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; X86-SSE2-NEXT: movd %xmm0, %edx @@ -1447,40 +1447,40 @@ ; X86-SSE2-NEXT: pand %xmm5, %xmm0 ; X86-SSE2-NEXT: pandn %xmm2, %xmm5 ; X86-SSE2-NEXT: por %xmm0, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm3, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm0 +; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 ; X86-SSE2-NEXT: pxor %xmm4, %xmm1 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 ; X86-SSE2-NEXT: pand %xmm1, %xmm5 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 ; X86-SSE2-NEXT: por %xmm5, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE2-NEXT: pxor %xmm4, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pxor %xmm4, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm4 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; 
X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm2, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pandn %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pxor %xmm1, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: test_reduce_v16i32: Index: llvm/test/CodeGen/X86/horizontal-reduce-umin.ll =================================================================== --- llvm/test/CodeGen/X86/horizontal-reduce-umin.ll +++ llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -148,25 +148,25 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ; X86-SSE2-LABEL: test_reduce_v4i32: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE2-NEXT: pxor %xmm2, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 -; X86-SSE2-NEXT: pxor %xmm2, %xmm4 -; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: pandn %xmm1, %xmm4 -; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm4 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm4, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: pxor %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE2-NEXT: pxor %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm3, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: pxor %xmm1, %xmm3 +; X86-SSE2-NEXT: pxor %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: test_reduce_v4i32: @@ -418,28 +418,28 @@ ; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] ; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm6, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: por %xmm3, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: pandn %xmm1, %xmm4 -; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm6, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: por %xmm4, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm3 +; X86-SSE2-NEXT: por %xmm0, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; X86-SSE2-NEXT: movdqa 
%xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm5, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pand %xmm2, %xmm3 ; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: por %xmm3, %xmm2 ; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; X86-SSE2-NEXT: movd %xmm0, %edx @@ -447,23 +447,23 @@ ; ; X86-SSE42-LABEL: test_reduce_v4i64: ; X86-SSE42: ## %bb.0: -; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] ; X86-SSE42-NEXT: movdqa %xmm0, %xmm4 -; X86-SSE42-NEXT: pxor %xmm3, %xmm4 +; X86-SSE42-NEXT: pxor %xmm2, %xmm4 ; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm2, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: pxor %xmm3, %xmm0 -; X86-SSE42-NEXT: pxor %xmm2, %xmm3 -; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm3 -; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X86-SSE42-NEXT: movd %xmm2, %eax -; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: pxor %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm3, %xmm2 +; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm2 +; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X86-SSE42-NEXT: movd %xmm3, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm3, %edx ; X86-SSE42-NEXT: retl ; ; X86-AVX1-LABEL: test_reduce_v4i64: @@ -616,31 +616,31 @@ ; X86-SSE2-LABEL: test_reduce_v8i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm3 +; X86-SSE2-NEXT: por %xmm0, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: pxor %xmm2, %xmm4 -; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: pandn %xmm1, %xmm4 -; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pxor %xmm2, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm3 +; X86-SSE2-NEXT: pandn %xmm1, %xmm0 +; X86-SSE2-NEXT: por %xmm3, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = 
xmm0[1,1,1,1] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; X86-SSE2-NEXT: pand %xmm3, %xmm4 -; X86-SSE2-NEXT: pandn %xmm0, %xmm3 -; X86-SSE2-NEXT: por %xmm4, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm3 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm3, %xmm2 +; X86-SSE2-NEXT: pxor %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 ; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: retl ; @@ -1103,32 +1103,32 @@ ; ; X86-SSE42-LABEL: test_reduce_v8i64: ; X86-SSE42: ## %bb.0: -; X86-SSE42-NEXT: movdqa %xmm0, %xmm5 -; X86-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm5 = [0,2147483648,0,2147483648] ; X86-SSE42-NEXT: movdqa %xmm1, %xmm6 -; X86-SSE42-NEXT: pxor %xmm4, %xmm6 +; X86-SSE42-NEXT: pxor %xmm5, %xmm6 ; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 +; X86-SSE42-NEXT: pxor %xmm5, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X86-SSE42-NEXT: movdqa %xmm5, %xmm1 -; X86-SSE42-NEXT: pxor %xmm4, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE42-NEXT: pxor %xmm5, %xmm1 ; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 +; X86-SSE42-NEXT: pxor %xmm5, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; X86-SSE42-NEXT: movapd %xmm2, %xmm1 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm1 +; X86-SSE42-NEXT: xorpd %xmm5, %xmm1 ; X86-SSE42-NEXT: movapd %xmm3, %xmm0 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pxor %xmm1, %xmm4 -; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm4 -; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE42-NEXT: pxor %xmm5, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm5 +; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm5 +; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 ; X86-SSE42-NEXT: movd %xmm1, %eax ; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx @@ -1136,26 +1136,26 @@ ; ; X86-AVX1-LABEL: test_reduce_v8i64: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; X86-AVX1-NEXT: ## xmm3 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm3, %xmm2, %xmm4 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X86-AVX1-NEXT: ## xmm2 = mem[0,0] +; X86-AVX1-NEXT: vxorps %xmm2, %xmm3, %xmm4 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm5, %xmm6 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm5, %xmm6 ; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm0, %xmm6 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm1, %xmm7 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm6 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm7 ; X86-AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, 
%xmm6 ; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm1 -; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm5, %xmm2 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm4 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm5, %xmm3 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 -; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx @@ -1357,47 +1357,47 @@ ; X86-SSE2-LABEL: test_reduce_v16i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 -; X86-SSE2-NEXT: pxor %xmm4, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 ; X86-SSE2-NEXT: pxor %xmm4, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm6 -; X86-SSE2-NEXT: pand %xmm6, %xmm1 -; X86-SSE2-NEXT: pandn %xmm3, %xmm6 -; X86-SSE2-NEXT: por %xmm1, %xmm6 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm4, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE2-NEXT: pxor %xmm4, %xmm5 +; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm5 +; X86-SSE2-NEXT: por %xmm1, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm4, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; X86-SSE2-NEXT: pand %xmm3, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm3 -; X86-SSE2-NEXT: por %xmm0, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 ; X86-SSE2-NEXT: pxor %xmm4, %xmm1 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm3 -; X86-SSE2-NEXT: pandn %xmm6, %xmm1 -; X86-SSE2-NEXT: por %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pandn %xmm5, %xmm0 +; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm4, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; X86-SSE2-NEXT: pand %xmm3, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm3 -; X86-SSE2-NEXT: por %xmm1, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 ; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pxor 
%xmm4, %xmm2 ; X86-SSE2-NEXT: pxor %xmm0, %xmm4 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm1 ; X86-SSE2-NEXT: pandn %xmm0, %xmm4 -; X86-SSE2-NEXT: por %xmm3, %xmm4 +; X86-SSE2-NEXT: por %xmm1, %xmm4 ; X86-SSE2-NEXT: movd %xmm4, %eax ; X86-SSE2-NEXT: retl ; Index: llvm/test/CodeGen/X86/i128-mul.ll =================================================================== --- llvm/test/CodeGen/X86/i128-mul.ll +++ llvm/test/CodeGen/X86/i128-mul.ll @@ -13,11 +13,11 @@ ; X86-NOBMI-NEXT: pushl %ebx ; X86-NOBMI-NEXT: pushl %edi ; X86-NOBMI-NEXT: pushl %esi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl %ecx, %eax +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: movl %esi, %eax ; X86-NOBMI-NEXT: mull %ebx ; X86-NOBMI-NEXT: movl %edx, %edi ; X86-NOBMI-NEXT: movl %ebp, %eax @@ -26,16 +26,16 @@ ; X86-NOBMI-NEXT: movl %eax, %ebp ; X86-NOBMI-NEXT: addl %edi, %ebp ; X86-NOBMI-NEXT: adcl $0, %ebx -; X86-NOBMI-NEXT: movl %ecx, %eax -; X86-NOBMI-NEXT: mull %esi -; X86-NOBMI-NEXT: movl %edx, %ecx +; X86-NOBMI-NEXT: movl %esi, %eax +; X86-NOBMI-NEXT: mull %ecx +; X86-NOBMI-NEXT: movl %edx, %esi ; X86-NOBMI-NEXT: addl %ebp, %eax -; X86-NOBMI-NEXT: adcl %ebx, %ecx +; X86-NOBMI-NEXT: adcl %ebx, %esi ; X86-NOBMI-NEXT: setb %al ; X86-NOBMI-NEXT: movzbl %al, %edi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI-NEXT: mull %esi -; X86-NOBMI-NEXT: addl %ecx, %eax +; X86-NOBMI-NEXT: mull %ecx +; X86-NOBMI-NEXT: addl %esi, %eax ; X86-NOBMI-NEXT: adcl %edi, %edx ; X86-NOBMI-NEXT: popl %esi ; X86-NOBMI-NEXT: popl %edi @@ -49,26 +49,26 @@ ; X86-BMI-NEXT: pushl %ebx ; X86-BMI-NEXT: pushl %edi ; X86-BMI-NEXT: pushl %esi -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-BMI-NEXT: movl %eax, %edx -; X86-BMI-NEXT: mulxl %esi, %ebx, %ebx +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-BMI-NEXT: movl %ecx, %edx -; X86-BMI-NEXT: mulxl %esi, %esi, %ebp -; X86-BMI-NEXT: addl %ebx, %esi -; X86-BMI-NEXT: adcl $0, %ebp +; X86-BMI-NEXT: mulxl %edi, %ebx, %ebx ; X86-BMI-NEXT: movl %eax, %edx -; X86-BMI-NEXT: mulxl %edi, %eax, %ebx -; X86-BMI-NEXT: addl %esi, %eax -; X86-BMI-NEXT: adcl %ebp, %ebx -; X86-BMI-NEXT: setb %al -; X86-BMI-NEXT: movzbl %al, %esi +; X86-BMI-NEXT: mulxl %edi, %edi, %ebp +; X86-BMI-NEXT: addl %ebx, %edi +; X86-BMI-NEXT: adcl $0, %ebp ; X86-BMI-NEXT: movl %ecx, %edx -; X86-BMI-NEXT: mulxl %edi, %eax, %edx +; X86-BMI-NEXT: mulxl %esi, %ecx, %ebx +; X86-BMI-NEXT: addl %edi, %ecx +; X86-BMI-NEXT: adcl %ebp, %ebx +; X86-BMI-NEXT: setb %cl +; X86-BMI-NEXT: movzbl %cl, %ecx +; X86-BMI-NEXT: movl %eax, %edx +; X86-BMI-NEXT: mulxl %esi, %eax, %edx ; X86-BMI-NEXT: addl %ebx, %eax -; X86-BMI-NEXT: adcl %esi, %edx +; X86-BMI-NEXT: adcl %ecx, %edx ; X86-BMI-NEXT: popl %esi ; X86-BMI-NEXT: popl %edi ; X86-BMI-NEXT: popl %ebx @@ -114,7 +114,7 @@ ; X86-NOBMI-NEXT: # %bb.1: # %for.body.preheader ; X86-NOBMI-NEXT: xorl %eax, %eax ; X86-NOBMI-NEXT: xorl %edx, %edx -; X86-NOBMI-NEXT: xorl %ebp, %ebp +; X86-NOBMI-NEXT: xorl %ecx, %ecx ; X86-NOBMI-NEXT: movl $0, (%esp) # 4-byte Folded Spill ; X86-NOBMI-NEXT: .p2align 4, 0x90 ; 
X86-NOBMI-NEXT: .LBB1_2: # %for.body @@ -122,49 +122,51 @@ ; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI-NEXT: movl (%eax,%ebp,8), %esi -; X86-NOBMI-NEXT: movl 4(%eax,%ebp,8), %ecx -; X86-NOBMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOBMI-NEXT: movl (%eax,%ecx,8), %esi +; X86-NOBMI-NEXT: movl 4(%eax,%ecx,8), %ebx +; X86-NOBMI-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %esi, %eax ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NOBMI-NEXT: mull %edi ; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NOBMI-NEXT: movl %ecx, %eax +; X86-NOBMI-NEXT: movl %ebx, %eax ; X86-NOBMI-NEXT: mull %edi -; X86-NOBMI-NEXT: movl %edx, %ecx +; X86-NOBMI-NEXT: movl %edx, %ebp ; X86-NOBMI-NEXT: movl %eax, %ebx ; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NOBMI-NEXT: adcl $0, %ecx +; X86-NOBMI-NEXT: adcl $0, %ebp ; X86-NOBMI-NEXT: movl %esi, %eax ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: mull %edx -; X86-NOBMI-NEXT: movl %edx, %esi -; X86-NOBMI-NEXT: movl %eax, %edi -; X86-NOBMI-NEXT: addl %ebx, %edi -; X86-NOBMI-NEXT: adcl %ecx, %esi +; X86-NOBMI-NEXT: movl %edx, %edi +; X86-NOBMI-NEXT: movl %eax, %esi +; X86-NOBMI-NEXT: addl %ebx, %esi +; X86-NOBMI-NEXT: adcl %ebp, %edi ; X86-NOBMI-NEXT: setb %bl ; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOBMI-NEXT: addl %esi, %eax -; X86-NOBMI-NEXT: movzbl %bl, %esi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NOBMI-NEXT: adcl %esi, %edx +; X86-NOBMI-NEXT: addl %edi, %eax +; X86-NOBMI-NEXT: movzbl %bl, %edi +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NOBMI-NEXT: adcl %edi, %edx +; X86-NOBMI-NEXT: movl %ecx, %ebx ; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NOBMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NOBMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NOBMI-NEXT: adcl $0, %eax ; X86-NOBMI-NEXT: adcl $0, %edx -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl %ecx, (%esi,%ebp,8) -; X86-NOBMI-NEXT: movl %edi, 4(%esi,%ebp,8) -; X86-NOBMI-NEXT: addl $1, %ebp +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NOBMI-NEXT: movl %ecx, (%edi,%ebx,8) +; X86-NOBMI-NEXT: movl %ebx, %ecx +; X86-NOBMI-NEXT: movl %esi, 4(%edi,%ebx,8) +; X86-NOBMI-NEXT: addl $1, %ecx ; X86-NOBMI-NEXT: movl (%esp), %edi # 4-byte Reload ; X86-NOBMI-NEXT: adcl $0, %edi -; X86-NOBMI-NEXT: movl %ebp, %esi +; X86-NOBMI-NEXT: movl %ecx, %esi ; X86-NOBMI-NEXT: xorl {{[0-9]+}}(%esp), %esi ; X86-NOBMI-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NOBMI-NEXT: xorl %ebx, %edi +; X86-NOBMI-NEXT: xorl %ebp, %edi ; X86-NOBMI-NEXT: orl %esi, %edi ; X86-NOBMI-NEXT: jne .LBB1_2 ; X86-NOBMI-NEXT: .LBB1_3: # %for.end @@ -183,64 +185,71 @@ ; X86-BMI-NEXT: pushl %ebx ; X86-BMI-NEXT: pushl %edi ; X86-BMI-NEXT: pushl %esi -; X86-BMI-NEXT: subl $16, %esp +; X86-BMI-NEXT: subl $20, %esp ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI-NEXT: orl %ecx, %eax ; X86-BMI-NEXT: je .LBB1_3 ; X86-BMI-NEXT: # %bb.1: # %for.body.preheader ; 
X86-BMI-NEXT: xorl %ecx, %ecx -; X86-BMI-NEXT: xorl %edx, %edx +; X86-BMI-NEXT: xorl %eax, %eax ; X86-BMI-NEXT: xorl %ebx, %ebx ; X86-BMI-NEXT: xorl %ebp, %ebp ; X86-BMI-NEXT: .p2align 4, 0x90 ; X86-BMI-NEXT: .LBB1_2: # %for.body ; X86-BMI-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-BMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BMI-NEXT: movl (%ecx,%ebx,8), %eax -; X86-BMI-NEXT: movl 4(%ecx,%ebx,8), %esi -; X86-BMI-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-BMI-NEXT: movl %eax, %edx -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BMI-NEXT: mulxl %ecx, %edx, %edi +; X86-BMI-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-BMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI-NEXT: movl (%eax,%ebx,8), %ecx +; X86-BMI-NEXT: movl 4(%eax,%ebx,8), %esi +; X86-BMI-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-BMI-NEXT: movl %ecx, %edx +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI-NEXT: mulxl %eax, %edx, %edi ; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-BMI-NEXT: movl %esi, %edx -; X86-BMI-NEXT: mulxl %ecx, %esi, %ecx -; X86-BMI-NEXT: addl %edi, %esi -; X86-BMI-NEXT: adcl $0, %ecx +; X86-BMI-NEXT: mulxl %eax, %eax, %esi +; X86-BMI-NEXT: addl %edi, %eax +; X86-BMI-NEXT: adcl $0, %esi +; X86-BMI-NEXT: movl %ecx, %edx +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI-NEXT: mulxl %ecx, %edi, %ebp +; X86-BMI-NEXT: addl %eax, %edi +; X86-BMI-NEXT: adcl %esi, %ebp +; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-BMI-NEXT: mulxl %ecx, %ecx, %eax +; X86-BMI-NEXT: setb %dl +; X86-BMI-NEXT: addl %ebp, %ecx +; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI-NEXT: movzbl %dl, %edx +; X86-BMI-NEXT: adcl %edx, %eax ; X86-BMI-NEXT: movl %eax, %edx -; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %edi, %eax -; X86-BMI-NEXT: addl %esi, %edi -; X86-BMI-NEXT: adcl %ecx, %eax -; X86-BMI-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %ecx, %edx -; X86-BMI-NEXT: setb (%esp) # 1-byte Folded Spill -; X86-BMI-NEXT: addl %eax, %ecx -; X86-BMI-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload -; X86-BMI-NEXT: adcl %eax, %edx -; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-BMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-BMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-BMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-BMI-NEXT: adcl (%esp), %edi # 4-byte Folded Reload ; X86-BMI-NEXT: adcl $0, %ecx ; X86-BMI-NEXT: adcl $0, %edx -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI-NEXT: movl %esi, (%eax,%ebx,8) -; X86-BMI-NEXT: movl %edi, 4(%eax,%ebx,8) +; X86-BMI-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI-NEXT: movl %eax, (%edx,%ebx,8) +; X86-BMI-NEXT: movl %edi, 4(%edx,%ebx,8) ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-BMI-NEXT: addl $1, %ebx ; X86-BMI-NEXT: adcl $0, %ebp -; X86-BMI-NEXT: movl %ebx, %eax -; X86-BMI-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-BMI-NEXT: movl %ebx, %edx +; X86-BMI-NEXT: xorl %esi, %edx ; X86-BMI-NEXT: movl %ebp, %esi ; X86-BMI-NEXT: xorl %edi, %esi -; X86-BMI-NEXT: orl %eax, 
%esi +; X86-BMI-NEXT: orl %edx, %esi +; X86-BMI-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-BMI-NEXT: jne .LBB1_2 ; X86-BMI-NEXT: .LBB1_3: # %for.end ; X86-BMI-NEXT: xorl %eax, %eax ; X86-BMI-NEXT: xorl %edx, %edx -; X86-BMI-NEXT: addl $16, %esp +; X86-BMI-NEXT: addl $20, %esp ; X86-BMI-NEXT: popl %esi ; X86-BMI-NEXT: popl %edi ; X86-BMI-NEXT: popl %ebx Index: llvm/test/CodeGen/X86/i128-sdiv.ll =================================================================== --- llvm/test/CodeGen/X86/i128-sdiv.ll +++ llvm/test/CodeGen/X86/i128-sdiv.ll @@ -12,23 +12,23 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shrl $30, %esi +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %esi, %edx +; X86-NEXT: shrl $30, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl %edx, %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X86-NEXT: addl %esi, %edi ; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi +; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: shrdl $2, %ecx, %esi -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $2, %edx +; X86-NEXT: shrdl $2, %ecx, %edx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: sarl $2, %esi ; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -56,30 +56,30 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: shrl $30, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: shrl $30, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl %edx, %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X86-NEXT: addl %esi, %edi ; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: shrdl $2, %ecx, %esi -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: sarl $2, %ecx +; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: adcl $0, %edx +; X86-NEXT: shrdl $2, %edx, %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: sarl $2, %edx ; X86-NEXT: xorl %edi, %edi -; X86-NEXT: negl %esi +; X86-NEXT: negl %ecx ; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %ecx, %ebx -; X86-NEXT: movl $0, %ecx -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: sbbl %edx, %edi -; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: movl $0, %edx +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi Index: llvm/test/CodeGen/X86/i256-add.ll =================================================================== --- llvm/test/CodeGen/X86/i256-add.ll +++ llvm/test/CodeGen/X86/i256-add.ll @@ -10,20 +10,20 @@ ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi ; X32-NEXT: subl $8, %esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 28(%ecx), %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 24(%ecx), %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill 
+; X32-NEXT: movl 20(%ecx), %esi +; X32-NEXT: movl 16(%ecx), %edi +; X32-NEXT: movl 12(%ecx), %ebx +; X32-NEXT: movl 8(%ecx), %ebp +; X32-NEXT: movl (%ecx), %edx +; X32-NEXT: movl 4(%ecx), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 28(%eax), %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl 24(%eax), %ecx -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X32-NEXT: movl 20(%eax), %esi -; X32-NEXT: movl 16(%eax), %edi -; X32-NEXT: movl 12(%eax), %ebx -; X32-NEXT: movl 8(%eax), %ebp -; X32-NEXT: movl (%eax), %ecx -; X32-NEXT: movl 4(%eax), %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: addl %ecx, (%eax) -; X32-NEXT: adcl %edx, 4(%eax) +; X32-NEXT: addl %edx, (%eax) +; X32-NEXT: adcl %ecx, 4(%eax) ; X32-NEXT: adcl %ebp, 8(%eax) ; X32-NEXT: adcl %ebx, 12(%eax) ; X32-NEXT: adcl %edi, 16(%eax) @@ -64,20 +64,20 @@ ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi ; X32-NEXT: subl $8, %esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 28(%ecx), %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 24(%ecx), %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl 20(%ecx), %esi +; X32-NEXT: movl 16(%ecx), %edi +; X32-NEXT: movl 12(%ecx), %ebx +; X32-NEXT: movl 8(%ecx), %ebp +; X32-NEXT: movl (%ecx), %edx +; X32-NEXT: movl 4(%ecx), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 28(%eax), %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl 24(%eax), %ecx -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X32-NEXT: movl 20(%eax), %esi -; X32-NEXT: movl 16(%eax), %edi -; X32-NEXT: movl 12(%eax), %ebx -; X32-NEXT: movl 8(%eax), %ebp -; X32-NEXT: movl (%eax), %ecx -; X32-NEXT: movl 4(%eax), %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: subl %ecx, (%eax) -; X32-NEXT: sbbl %edx, 4(%eax) +; X32-NEXT: subl %edx, (%eax) +; X32-NEXT: sbbl %ecx, 4(%eax) ; X32-NEXT: sbbl %ebp, 8(%eax) ; X32-NEXT: sbbl %ebx, 12(%eax) ; X32-NEXT: sbbl %edi, 16(%eax) Index: llvm/test/CodeGen/X86/i64-to-float.ll =================================================================== --- llvm/test/CodeGen/X86/i64-to-float.ll +++ llvm/test/CodeGen/X86/i64-to-float.ll @@ -265,26 +265,26 @@ ; X86-SSE-NEXT: pcmpgtd %xmm3, %xmm4 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; X86-SSE-NEXT: pcmpeqd %xmm3, %xmm2 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE-NEXT: pand %xmm5, %xmm2 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; X86-SSE-NEXT: por %xmm2, %xmm3 -; X86-SSE-NEXT: pand %xmm3, %xmm0 -; X86-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 -; X86-SSE-NEXT: por %xmm0, %xmm3 -; X86-SSE-NEXT: pxor %xmm3, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; X86-SSE-NEXT: pand %xmm5, %xmm3 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; X86-SSE-NEXT: por %xmm3, %xmm2 +; X86-SSE-NEXT: pand %xmm2, %xmm0 +; X86-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE-NEXT: por %xmm0, %xmm2 +; X86-SSE-NEXT: pxor %xmm2, %xmm1 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm0 = [2147483903,0,2147483903,0] -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: pcmpgtd %xmm1, %xmm2 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; X86-SSE-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE-NEXT: pcmpgtd %xmm1, %xmm3 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] ; X86-SSE-NEXT: pcmpeqd %xmm0, %xmm1 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] ; X86-SSE-NEXT: pand %xmm4, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = 
xmm2[1,1,3,3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] ; X86-SSE-NEXT: por %xmm0, %xmm1 -; X86-SSE-NEXT: pand %xmm1, %xmm3 +; X86-SSE-NEXT: pand %xmm1, %xmm2 ; X86-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE-NEXT: por %xmm3, %xmm1 +; X86-SSE-NEXT: por %xmm2, %xmm1 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; X86-SSE-NEXT: cvtdq2pd %xmm0, %xmm0 ; X86-SSE-NEXT: retl Index: llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll =================================================================== --- llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -34,15 +34,15 @@ define void @i24_and_or(i24* %a) { ; X86-LABEL: i24_and_or: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %edx -; X86-NEXT: movzbl 2(%ecx), %eax -; X86-NEXT: movb %al, 2(%ecx) -; X86-NEXT: shll $16, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: orl $384, %eax # imm = 0x180 -; X86-NEXT: andl $16777088, %eax # imm = 0xFFFF80 -; X86-NEXT: movw %ax, (%ecx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzbl 2(%eax), %ecx +; X86-NEXT: movb %cl, 2(%eax) +; X86-NEXT: shll $16, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: orl $384, %ecx # imm = 0x180 +; X86-NEXT: andl $16777088, %ecx # imm = 0xFFFF80 +; X86-NEXT: movw %cx, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: i24_and_or: @@ -66,21 +66,21 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { ; X86-LABEL: i24_insert_bit: ; X86: # %bb.0: -; X86-NEXT: pushl %esi +; X86-NEXT: pushl %ebx ; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzwl (%ecx), %esi -; X86-NEXT: movzbl 2(%ecx), %eax -; X86-NEXT: movb %al, 2(%ecx) -; X86-NEXT: shll $16, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: shll $13, %edx -; X86-NEXT: andl $16769023, %eax # imm = 0xFFDFFF -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movw %ax, (%ecx) -; X86-NEXT: popl %esi +; X86-NEXT: .cfi_offset %ebx, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzbl 2(%eax), %ebx +; X86-NEXT: movb %bl, 2(%eax) +; X86-NEXT: shll $16, %ebx +; X86-NEXT: orl %edx, %ebx +; X86-NEXT: shll $13, %ecx +; X86-NEXT: andl $16769023, %ebx # imm = 0xFFDFFF +; X86-NEXT: orl %ecx, %ebx +; X86-NEXT: movw %bx, (%eax) +; X86-NEXT: popl %ebx ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; Index: llvm/test/CodeGen/X86/known-signbits-vector.ll =================================================================== --- llvm/test/CodeGen/X86/known-signbits-vector.ll +++ llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -429,8 +429,8 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $16, %esp -; X86-NEXT: vpmovsxdq 8(%ebp), %xmm3 -; X86-NEXT: vpmovsxdq 16(%ebp), %xmm4 +; X86-NEXT: vpmovsxdq 8(%ebp), %xmm4 +; X86-NEXT: vpmovsxdq 16(%ebp), %xmm3 ; X86-NEXT: vpsrad $31, %xmm2, %xmm5 ; X86-NEXT: vpsrad $1, %xmm2, %xmm6 ; X86-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] @@ -441,12 +441,12 @@ ; X86-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3],xmm2[4,5],xmm6[6,7] ; X86-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6 -; X86-NEXT: vblendvpd %xmm6, %xmm5, %xmm3, %xmm3 +; X86-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm4 ; X86-NEXT: vextractf128 $1, %ymm1, %xmm1 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X86-NEXT: vpcmpeqq %xmm1, %xmm0, 
%xmm0 -; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm4, %xmm0 -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 ; X86-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] Index: llvm/test/CodeGen/X86/legalize-shl-vec.ll =================================================================== --- llvm/test/CodeGen/X86/legalize-shl-vec.ll +++ llvm/test/CodeGen/X86/legalize-shl-vec.ll @@ -6,30 +6,30 @@ ; X32-LABEL: test_shl: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: shldl $2, %edx, %ecx -; X32-NEXT: movl %ecx, 60(%eax) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: shldl $2, %ecx, %edx -; X32-NEXT: movl %edx, 56(%eax) +; X32-NEXT: movl %edx, 60(%eax) ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: shldl $2, %edx, %ecx -; X32-NEXT: movl %ecx, 52(%eax) +; X32-NEXT: movl %ecx, 56(%eax) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: shldl $2, %ecx, %edx -; X32-NEXT: movl %edx, 48(%eax) +; X32-NEXT: movl %edx, 52(%eax) ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: shldl $2, %edx, %ecx -; X32-NEXT: movl %ecx, 44(%eax) +; X32-NEXT: movl %ecx, 48(%eax) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: shldl $2, %ecx, %edx -; X32-NEXT: movl %edx, 40(%eax) +; X32-NEXT: movl %edx, 44(%eax) ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: shldl $2, %edx, %ecx -; X32-NEXT: movl %ecx, 36(%eax) -; X32-NEXT: shll $2, %edx -; X32-NEXT: movl %edx, 32(%eax) +; X32-NEXT: movl %ecx, 40(%eax) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: shldl $2, %ecx, %edx +; X32-NEXT: movl %edx, 36(%eax) +; X32-NEXT: shll $2, %ecx +; X32-NEXT: movl %ecx, 32(%eax) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: shll $31, %ecx ; X32-NEXT: movl %ecx, 28(%eax) @@ -84,34 +84,34 @@ ; X32-NEXT: .cfi_offset %edi, -16 ; X32-NEXT: .cfi_offset %ebx, -12 ; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: shldl $28, %eax, %ecx +; X32-NEXT: movl %ebx, %ecx +; X32-NEXT: shldl $28, %edi, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: shldl $28, %esi, %eax +; X32-NEXT: shldl $28, %esi, %edi +; X32-NEXT: shldl $28, %edx, %esi +; X32-NEXT: shldl $28, %eax, %edx +; X32-NEXT: shldl $28, %ebp, %eax ; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: shldl $28, %edi, %esi -; X32-NEXT: shldl $28, %ebx, %edi -; X32-NEXT: shldl $28, %ebp, %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: shldl $28, %eax, %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: shrdl $4, %eax, %ecx -; X32-NEXT: shrl $4, %edx +; X32-NEXT: shrl $4, %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %edx, 60(%eax) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X32-NEXT: movl %edx, 56(%eax) -; X32-NEXT: movl (%esp), %edx # 4-byte Reload -; X32-NEXT: movl %edx, 52(%eax) +; X32-NEXT: movl %ebx, 60(%eax) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, 56(%eax) 
+; X32-NEXT: movl %edi, 52(%eax) ; X32-NEXT: movl %esi, 48(%eax) -; X32-NEXT: movl %edi, 44(%eax) -; X32-NEXT: movl %ebx, 40(%eax) +; X32-NEXT: movl %edx, 44(%eax) +; X32-NEXT: movl (%esp), %edx # 4-byte Reload +; X32-NEXT: movl %edx, 40(%eax) ; X32-NEXT: movl %ebp, 36(%eax) ; X32-NEXT: movl %ecx, 32(%eax) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -178,34 +178,34 @@ ; X32-NEXT: .cfi_offset %edi, -16 ; X32-NEXT: .cfi_offset %ebx, -12 ; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: shldl $26, %eax, %ecx +; X32-NEXT: movl %ebx, %ecx +; X32-NEXT: shldl $26, %edi, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: shldl $26, %esi, %eax +; X32-NEXT: shldl $26, %esi, %edi +; X32-NEXT: shldl $26, %edx, %esi +; X32-NEXT: shldl $26, %eax, %edx +; X32-NEXT: shldl $26, %ebp, %eax ; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: shldl $26, %edi, %esi -; X32-NEXT: shldl $26, %ebx, %edi -; X32-NEXT: shldl $26, %ebp, %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: shldl $26, %eax, %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: shrdl $6, %eax, %ecx -; X32-NEXT: sarl $6, %edx +; X32-NEXT: sarl $6, %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %edx, 60(%eax) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X32-NEXT: movl %edx, 56(%eax) -; X32-NEXT: movl (%esp), %edx # 4-byte Reload -; X32-NEXT: movl %edx, 52(%eax) +; X32-NEXT: movl %ebx, 60(%eax) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, 56(%eax) +; X32-NEXT: movl %edi, 52(%eax) ; X32-NEXT: movl %esi, 48(%eax) -; X32-NEXT: movl %edi, 44(%eax) -; X32-NEXT: movl %ebx, 40(%eax) +; X32-NEXT: movl %edx, 44(%eax) +; X32-NEXT: movl (%esp), %edx # 4-byte Reload +; X32-NEXT: movl %edx, 40(%eax) ; X32-NEXT: movl %ebp, 36(%eax) ; X32-NEXT: movl %ecx, 32(%eax) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx Index: llvm/test/CodeGen/X86/load-combine.ll =================================================================== --- llvm/test/CodeGen/X86/load-combine.ll +++ llvm/test/CodeGen/X86/load-combine.ll @@ -483,19 +483,19 @@ ; CHECK-NEXT: pushl %esi ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: .cfi_offset %esi, -8 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movzbl (%ecx), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzbl (%eax), %edx ; CHECK-NEXT: shll $24, %edx -; CHECK-NEXT: movzbl 1(%ecx), %esi -; CHECK-NEXT: movl $0, (%eax) +; CHECK-NEXT: movzbl 1(%eax), %esi +; CHECK-NEXT: movl $0, (%ecx) ; CHECK-NEXT: shll $16, %esi ; CHECK-NEXT: orl %edx, %esi -; CHECK-NEXT: movzbl 2(%ecx), %edx -; CHECK-NEXT: shll $8, %edx -; CHECK-NEXT: orl %esi, %edx -; CHECK-NEXT: movzbl 3(%ecx), %eax -; CHECK-NEXT: orl %edx, %eax +; CHECK-NEXT: movzbl 2(%eax), %ecx +; CHECK-NEXT: shll $8, %ecx +; CHECK-NEXT: orl %esi, %ecx +; CHECK-NEXT: movzbl 3(%eax), %eax +; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: popl %esi ; CHECK-NEXT: .cfi_def_cfa_offset 4 ; CHECK-NEXT: retl Index: llvm/test/CodeGen/X86/masked_gather_scatter.ll =================================================================== --- 
llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -4114,16 +4114,16 @@ ; KNL_32-NEXT: .cfi_def_cfa_register %ebp ; KNL_32-NEXT: andl $-64, %esp ; KNL_32-NEXT: subl $64, %esp -; KNL_32-NEXT: vmovapd 72(%ebp), %zmm3 +; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm3 +; KNL_32-NEXT: vmovapd 72(%ebp), %zmm1 ; KNL_32-NEXT: movl 8(%ebp), %eax -; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; KNL_32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 ; KNL_32-NEXT: vptestnmd %zmm4, %zmm4, %k1 -; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k2 +; KNL_32-NEXT: vptestnmd %zmm3, %zmm3, %k2 ; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2} ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm3 {%k1} +; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm1 {%k1} ; KNL_32-NEXT: vmovapd %zmm2, %zmm0 -; KNL_32-NEXT: vmovapd %zmm3, %zmm1 ; KNL_32-NEXT: movl %ebp, %esp ; KNL_32-NEXT: popl %ebp ; KNL_32-NEXT: .cfi_def_cfa %esp, 4 Index: llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll =================================================================== --- llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll +++ llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll @@ -1007,20 +1007,20 @@ ; X86-LABEL: length13_eq: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %esi -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: xorl 4(%eax), %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl 8(%ecx), %edx -; X86-NEXT: xorl 8(%eax), %edx -; X86-NEXT: movb 12(%ecx), %cl -; X86-NEXT: xorb 12(%eax), %cl -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl (%edx), %esi +; X86-NEXT: movl 4(%edx), %eax +; X86-NEXT: xorl (%ecx), %esi +; X86-NEXT: xorl 4(%ecx), %eax ; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl 8(%edx), %esi +; X86-NEXT: xorl 8(%ecx), %esi +; X86-NEXT: movb 12(%edx), %dl +; X86-NEXT: xorb 12(%ecx), %dl +; X86-NEXT: movzbl %dl, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %al ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -1043,20 +1043,20 @@ ; X86-LABEL: length14_eq: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %esi -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: xorl 4(%eax), %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl 8(%ecx), %edx -; X86-NEXT: xorl 8(%eax), %edx -; X86-NEXT: movzwl 12(%ecx), %ecx -; X86-NEXT: xorw 12(%eax), %cx -; X86-NEXT: movzwl %cx, %eax -; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl (%edx), %esi +; X86-NEXT: movl 4(%edx), %eax +; X86-NEXT: xorl (%ecx), %esi +; X86-NEXT: xorl 4(%ecx), %eax ; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl 8(%edx), %esi +; X86-NEXT: xorl 8(%ecx), %esi +; X86-NEXT: movzwl 12(%edx), %edx +; X86-NEXT: xorw 12(%ecx), %dx +; X86-NEXT: movzwl %dx, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %al ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -1079,19 +1079,19 @@ ; X86-LABEL: length15_eq: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %esi -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: xorl 4(%eax), %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl 8(%ecx), %edx -; X86-NEXT: xorl 8(%eax), %edx 
-; X86-NEXT: movl 11(%ecx), %ecx -; X86-NEXT: xorl 11(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl (%edx), %esi +; X86-NEXT: movl 4(%edx), %eax +; X86-NEXT: xorl (%ecx), %esi +; X86-NEXT: xorl 4(%ecx), %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl 8(%edx), %esi +; X86-NEXT: xorl 8(%ecx), %esi +; X86-NEXT: movl 11(%edx), %edx +; X86-NEXT: xorl 11(%ecx), %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl %eax, %edx ; X86-NEXT: sete %al ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -1186,19 +1186,19 @@ ; X86-NOSSE-LABEL: length16_eq: ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl (%ecx), %edx -; X86-NOSSE-NEXT: movl 4(%ecx), %esi -; X86-NOSSE-NEXT: xorl (%eax), %edx -; X86-NOSSE-NEXT: xorl 4(%eax), %esi -; X86-NOSSE-NEXT: orl %edx, %esi -; X86-NOSSE-NEXT: movl 8(%ecx), %edx -; X86-NOSSE-NEXT: xorl 8(%eax), %edx -; X86-NOSSE-NEXT: movl 12(%ecx), %ecx -; X86-NOSSE-NEXT: xorl 12(%eax), %ecx -; X86-NOSSE-NEXT: orl %edx, %ecx -; X86-NOSSE-NEXT: orl %esi, %ecx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOSSE-NEXT: movl (%edx), %esi +; X86-NOSSE-NEXT: movl 4(%edx), %eax +; X86-NOSSE-NEXT: xorl (%ecx), %esi +; X86-NOSSE-NEXT: xorl 4(%ecx), %eax +; X86-NOSSE-NEXT: orl %esi, %eax +; X86-NOSSE-NEXT: movl 8(%edx), %esi +; X86-NOSSE-NEXT: xorl 8(%ecx), %esi +; X86-NOSSE-NEXT: movl 12(%edx), %edx +; X86-NOSSE-NEXT: xorl 12(%ecx), %edx +; X86-NOSSE-NEXT: orl %esi, %edx +; X86-NOSSE-NEXT: orl %eax, %edx ; X86-NOSSE-NEXT: setne %al ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: retl @@ -1206,19 +1206,19 @@ ; X86-SSE1-LABEL: length16_eq: ; X86-SSE1: # %bb.0: ; X86-SSE1-NEXT: pushl %esi -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE1-NEXT: movl (%ecx), %edx -; X86-SSE1-NEXT: movl 4(%ecx), %esi -; X86-SSE1-NEXT: xorl (%eax), %edx -; X86-SSE1-NEXT: xorl 4(%eax), %esi -; X86-SSE1-NEXT: orl %edx, %esi -; X86-SSE1-NEXT: movl 8(%ecx), %edx -; X86-SSE1-NEXT: xorl 8(%eax), %edx -; X86-SSE1-NEXT: movl 12(%ecx), %ecx -; X86-SSE1-NEXT: xorl 12(%eax), %ecx -; X86-SSE1-NEXT: orl %edx, %ecx -; X86-SSE1-NEXT: orl %esi, %ecx +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE1-NEXT: movl (%edx), %esi +; X86-SSE1-NEXT: movl 4(%edx), %eax +; X86-SSE1-NEXT: xorl (%ecx), %esi +; X86-SSE1-NEXT: xorl 4(%ecx), %eax +; X86-SSE1-NEXT: orl %esi, %eax +; X86-SSE1-NEXT: movl 8(%edx), %esi +; X86-SSE1-NEXT: xorl 8(%ecx), %esi +; X86-SSE1-NEXT: movl 12(%edx), %edx +; X86-SSE1-NEXT: xorl 12(%ecx), %edx +; X86-SSE1-NEXT: orl %esi, %edx +; X86-SSE1-NEXT: orl %eax, %edx ; X86-SSE1-NEXT: setne %al ; X86-SSE1-NEXT: popl %esi ; X86-SSE1-NEXT: retl @@ -3448,21 +3448,21 @@ ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE2-NEXT: movdqu (%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm2 +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm2 ; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm3 -; 
X86-SSE2-NEXT: movdqu 47(%ecx), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqu 47(%ecx), %xmm2 ; X86-SSE2-NEXT: movdqu 47(%eax), %xmm4 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm4 +; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm4 ; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm1, %xmm4 ; X86-SSE2-NEXT: pand %xmm0, %xmm4 -; X86-SSE2-NEXT: pand %xmm2, %xmm4 ; X86-SSE2-NEXT: pmovmskb %xmm4, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: setne %al @@ -3472,21 +3472,21 @@ ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu (%eax), %xmm2 -; X86-SSE41-NEXT: pxor %xmm0, %xmm2 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE41-NEXT: movdqu (%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm2 +; X86-SSE41-NEXT: movdqu (%eax), %xmm0 ; X86-SSE41-NEXT: pxor %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE41-NEXT: pxor %xmm2, %xmm1 +; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm2 ; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3 -; X86-SSE41-NEXT: pxor %xmm1, %xmm3 -; X86-SSE41-NEXT: movdqu 47(%ecx), %xmm1 +; X86-SSE41-NEXT: pxor %xmm2, %xmm3 +; X86-SSE41-NEXT: movdqu 47(%ecx), %xmm2 ; X86-SSE41-NEXT: movdqu 47(%eax), %xmm4 -; X86-SSE41-NEXT: pxor %xmm1, %xmm4 +; X86-SSE41-NEXT: pxor %xmm2, %xmm4 ; X86-SSE41-NEXT: por %xmm3, %xmm4 +; X86-SSE41-NEXT: por %xmm1, %xmm4 ; X86-SSE41-NEXT: por %xmm0, %xmm4 -; X86-SSE41-NEXT: por %xmm2, %xmm4 ; X86-SSE41-NEXT: ptest %xmm4, %xmm4 ; X86-SSE41-NEXT: setne %al ; X86-SSE41-NEXT: retl @@ -3839,21 +3839,21 @@ ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE2-NEXT: movdqu (%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm2 +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm2 ; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm3 -; X86-SSE2-NEXT: movdqu 48(%ecx), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqu 48(%ecx), %xmm2 ; X86-SSE2-NEXT: movdqu 48(%eax), %xmm4 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm4 +; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm4 ; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm1, %xmm4 ; X86-SSE2-NEXT: pand %xmm0, %xmm4 -; X86-SSE2-NEXT: pand %xmm2, %xmm4 ; X86-SSE2-NEXT: pmovmskb %xmm4, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: setne %al @@ -3863,21 +3863,21 @@ ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu (%eax), %xmm2 -; X86-SSE41-NEXT: pxor %xmm0, %xmm2 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE41-NEXT: movdqu (%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm2 +; X86-SSE41-NEXT: movdqu (%eax), %xmm0 ; X86-SSE41-NEXT: pxor %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE41-NEXT: pxor %xmm2, %xmm1 +; 
X86-SSE41-NEXT: movdqu 32(%ecx), %xmm2 ; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3 -; X86-SSE41-NEXT: pxor %xmm1, %xmm3 -; X86-SSE41-NEXT: movdqu 48(%ecx), %xmm1 +; X86-SSE41-NEXT: pxor %xmm2, %xmm3 +; X86-SSE41-NEXT: movdqu 48(%ecx), %xmm2 ; X86-SSE41-NEXT: movdqu 48(%eax), %xmm4 -; X86-SSE41-NEXT: pxor %xmm1, %xmm4 +; X86-SSE41-NEXT: pxor %xmm2, %xmm4 ; X86-SSE41-NEXT: por %xmm3, %xmm4 +; X86-SSE41-NEXT: por %xmm1, %xmm4 ; X86-SSE41-NEXT: por %xmm0, %xmm4 -; X86-SSE41-NEXT: por %xmm2, %xmm4 ; X86-SSE41-NEXT: ptest %xmm4, %xmm4 ; X86-SSE41-NEXT: setne %al ; X86-SSE41-NEXT: retl Index: llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll =================================================================== --- llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll +++ llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll @@ -302,28 +302,28 @@ ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0 +; X86-SSE2-NEXT: movd %xmm1, %ecx ; X86-SSE2-NEXT: movntil %ecx, (%eax) -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] ; X86-SSE2-NEXT: movd %xmm2, %ecx ; X86-SSE2-NEXT: movntil %ecx, 12(%eax) -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X86-SSE2-NEXT: movd %xmm2, %ecx ; X86-SSE2-NEXT: movntil %ecx, 8(%eax) -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-SSE2-NEXT: movd %xmm0, %ecx -; X86-SSE2-NEXT: movntil %ecx, 4(%eax) +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; X86-SSE2-NEXT: movd %xmm1, %ecx -; X86-SSE2-NEXT: movntil %ecx, 16(%eax) -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; X86-SSE2-NEXT: movntil %ecx, 4(%eax) ; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: movntil %ecx, 16(%eax) +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; X86-SSE2-NEXT: movd %xmm1, %ecx ; X86-SSE2-NEXT: movntil %ecx, 28(%eax) -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: movd %xmm1, %ecx ; X86-SSE2-NEXT: movntil %ecx, 24(%eax) -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-SSE2-NEXT: movd %xmm0, %ecx ; X86-SSE2-NEXT: movntil %ecx, 20(%eax) ; X86-SSE2-NEXT: retl @@ -415,28 +415,28 @@ ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0 +; X86-SSE2-NEXT: movd %xmm1, %ecx ; X86-SSE2-NEXT: movntil %ecx, (%eax) -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] ; X86-SSE2-NEXT: movd %xmm2, %ecx ; X86-SSE2-NEXT: movntil %ecx, 12(%eax) -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X86-SSE2-NEXT: movd %xmm2, %ecx ; X86-SSE2-NEXT: movntil %ecx, 8(%eax) -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-SSE2-NEXT: movd %xmm0, %ecx -; X86-SSE2-NEXT: movntil %ecx, 4(%eax) +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; X86-SSE2-NEXT: movd 
%xmm1, %ecx -; X86-SSE2-NEXT: movntil %ecx, 16(%eax) -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; X86-SSE2-NEXT: movntil %ecx, 4(%eax) ; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: movntil %ecx, 16(%eax) +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; X86-SSE2-NEXT: movd %xmm1, %ecx ; X86-SSE2-NEXT: movntil %ecx, 28(%eax) -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: movd %xmm1, %ecx ; X86-SSE2-NEXT: movntil %ecx, 24(%eax) -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-SSE2-NEXT: movd %xmm0, %ecx ; X86-SSE2-NEXT: movntil %ecx, 20(%eax) ; X86-SSE2-NEXT: retl Index: llvm/test/CodeGen/X86/mmx-arith.ll =================================================================== --- llvm/test/CodeGen/X86/mmx-arith.ll +++ llvm/test/CodeGen/X86/mmx-arith.ll @@ -142,12 +142,12 @@ define void @test1(x86_mmx* %A, x86_mmx* %B) { ; X32-LABEL: test1: ; X32: # %bb.0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X32-NEXT: paddd %xmm0, %xmm1 -; X32-NEXT: movq %xmm1, (%ecx) +; X32-NEXT: movq %xmm1, (%eax) ; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; X32-NEXT: pmuludq %xmm0, %xmm1 @@ -156,16 +156,16 @@ ; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X32-NEXT: movq %xmm1, (%ecx) +; X32-NEXT: movq %xmm1, (%eax) ; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: pand %xmm1, %xmm0 -; X32-NEXT: movq %xmm0, (%ecx) +; X32-NEXT: movq %xmm0, (%eax) ; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X32-NEXT: por %xmm0, %xmm1 -; X32-NEXT: movq %xmm1, (%ecx) +; X32-NEXT: movq %xmm1, (%eax) ; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: pxor %xmm1, %xmm0 -; X32-NEXT: movq %xmm0, (%ecx) +; X32-NEXT: movq %xmm0, (%eax) ; X32-NEXT: emms ; X32-NEXT: retl ; Index: llvm/test/CodeGen/X86/mul-constant-i64.ll =================================================================== --- llvm/test/CodeGen/X86/mul-constant-i64.ll +++ llvm/test/CodeGen/X86/mul-constant-i64.ll @@ -1479,28 +1479,28 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl $9, %edx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl $9, %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: leal (%ebx,%ebx,8), %edi +; X86-NEXT: addl $42, %ecx +; X86-NEXT: adcl %edx, %edi +; X86-NEXT: movl $5, %edx +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: leal (%edi,%edi,8), %ebx -; X86-NEXT: addl $42, %esi +; X86-NEXT: leal (%ebx,%ebx,4), %ebx +; X86-NEXT: addl $2, %esi ; X86-NEXT: adcl %edx, %ebx -; X86-NEXT: movl $5, %edx ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: leal (%edi,%edi,4), %edi -; X86-NEXT: addl $2, %ecx -; X86-NEXT: adcl %edx, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: imull %esi, %edi -; X86-NEXT: addl %edi, %edx -; X86-NEXT: imull %ebx, %ecx 
-; X86-NEXT: addl %ecx, %edx +; X86-NEXT: mull %esi +; X86-NEXT: imull %ecx, %ebx +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: imull %edi, %esi +; X86-NEXT: addl %esi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1511,28 +1511,28 @@ ; X86-NOOPT-NEXT: pushl %ebx ; X86-NOOPT-NEXT: pushl %edi ; X86-NOOPT-NEXT: pushl %esi -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOOPT-NEXT: movl $9, %edx -; X86-NOOPT-NEXT: movl %ecx, %eax +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NOOPT-NEXT: movl $9, %ecx +; X86-NOOPT-NEXT: movl %esi, %eax +; X86-NOOPT-NEXT: mull %ecx +; X86-NOOPT-NEXT: movl %eax, %ecx +; X86-NOOPT-NEXT: leal (%ebx,%ebx,8), %edi +; X86-NOOPT-NEXT: addl $42, %ecx +; X86-NOOPT-NEXT: adcl %edx, %edi +; X86-NOOPT-NEXT: movl $5, %edx +; X86-NOOPT-NEXT: movl %esi, %eax ; X86-NOOPT-NEXT: mull %edx ; X86-NOOPT-NEXT: movl %eax, %esi -; X86-NOOPT-NEXT: leal (%edi,%edi,8), %ebx -; X86-NOOPT-NEXT: addl $42, %esi +; X86-NOOPT-NEXT: leal (%ebx,%ebx,4), %ebx +; X86-NOOPT-NEXT: addl $2, %esi ; X86-NOOPT-NEXT: adcl %edx, %ebx -; X86-NOOPT-NEXT: movl $5, %edx ; X86-NOOPT-NEXT: movl %ecx, %eax -; X86-NOOPT-NEXT: mull %edx -; X86-NOOPT-NEXT: movl %eax, %ecx -; X86-NOOPT-NEXT: leal (%edi,%edi,4), %edi -; X86-NOOPT-NEXT: addl $2, %ecx -; X86-NOOPT-NEXT: adcl %edx, %edi -; X86-NOOPT-NEXT: movl %esi, %eax -; X86-NOOPT-NEXT: mull %ecx -; X86-NOOPT-NEXT: imull %esi, %edi -; X86-NOOPT-NEXT: addl %edi, %edx -; X86-NOOPT-NEXT: imull %ebx, %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx +; X86-NOOPT-NEXT: mull %esi +; X86-NOOPT-NEXT: imull %ecx, %ebx +; X86-NOOPT-NEXT: addl %ebx, %edx +; X86-NOOPT-NEXT: imull %edi, %esi +; X86-NOOPT-NEXT: addl %esi, %edx ; X86-NOOPT-NEXT: popl %esi ; X86-NOOPT-NEXT: popl %edi ; X86-NOOPT-NEXT: popl %ebx Index: llvm/test/CodeGen/X86/mul-constant-result.ll =================================================================== --- llvm/test/CodeGen/X86/mul-constant-result.ll +++ llvm/test/CodeGen/X86/mul-constant-result.ll @@ -542,8 +542,8 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $2, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: xorl $2, %ebx ; X86-NEXT: pushl $1 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $3 @@ -551,290 +551,290 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $3, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: pushl $2 -; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $4 -; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: calll mult@PLT -; X86-NEXT: addl $8, %esp -; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $4, %edi +; X86-NEXT: xorl $3, %edi ; X86-NEXT: orl %ebx, %edi ; X86-NEXT: pushl $2 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $5 +; X86-NEXT: pushl $4 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $5, %ebx +; X86-NEXT: xorl $4, %ebx ; X86-NEXT: orl %edi, %ebx -; X86-NEXT: pushl $3 +; X86-NEXT: pushl $2 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $6 +; X86-NEXT: pushl $5 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $6, 
%edi +; X86-NEXT: xorl $5, %edi ; X86-NEXT: orl %ebx, %edi ; X86-NEXT: pushl $3 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $7 +; X86-NEXT: pushl $6 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $7, %ebx +; X86-NEXT: xorl $6, %ebx ; X86-NEXT: orl %edi, %ebx -; X86-NEXT: pushl $4 +; X86-NEXT: pushl $3 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $8 +; X86-NEXT: pushl $7 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $8, %edi +; X86-NEXT: xorl $7, %edi ; X86-NEXT: orl %ebx, %edi ; X86-NEXT: pushl $4 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $9 +; X86-NEXT: pushl $8 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $9, %ebx +; X86-NEXT: xorl $8, %ebx ; X86-NEXT: orl %edi, %ebx -; X86-NEXT: pushl $5 +; X86-NEXT: pushl $4 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $10 +; X86-NEXT: pushl $9 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $10, %edi +; X86-NEXT: xorl $9, %edi ; X86-NEXT: orl %ebx, %edi ; X86-NEXT: pushl $5 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $11 +; X86-NEXT: pushl $10 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $11, %ebx +; X86-NEXT: xorl $10, %ebx ; X86-NEXT: orl %edi, %ebx -; X86-NEXT: pushl $6 +; X86-NEXT: pushl $5 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $12 +; X86-NEXT: pushl $11 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $12, %edi +; X86-NEXT: xorl $11, %edi ; X86-NEXT: orl %ebx, %edi ; X86-NEXT: pushl $6 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $13 +; X86-NEXT: pushl $12 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $13, %ebx +; X86-NEXT: xorl $12, %ebx ; X86-NEXT: orl %edi, %ebx -; X86-NEXT: pushl $7 +; X86-NEXT: pushl $6 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $14 +; X86-NEXT: pushl $13 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $14, %edi +; X86-NEXT: xorl $13, %edi ; X86-NEXT: orl %ebx, %edi ; X86-NEXT: pushl $7 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $15 +; X86-NEXT: pushl $14 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $15, %ebx +; X86-NEXT: xorl $14, %ebx ; X86-NEXT: orl %edi, %ebx -; X86-NEXT: pushl $8 +; X86-NEXT: pushl $7 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $16 +; X86-NEXT: pushl $15 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $16, %edi +; X86-NEXT: xorl 
$15, %edi ; X86-NEXT: orl %ebx, %edi ; X86-NEXT: pushl $8 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $17 +; X86-NEXT: pushl $16 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $17, %ebx +; X86-NEXT: xorl $16, %ebx ; X86-NEXT: orl %edi, %ebx -; X86-NEXT: pushl $9 +; X86-NEXT: pushl $8 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $18 +; X86-NEXT: pushl $17 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $18, %edi +; X86-NEXT: xorl $17, %edi ; X86-NEXT: orl %ebx, %edi ; X86-NEXT: pushl $9 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $19 +; X86-NEXT: pushl $18 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $19, %ebx +; X86-NEXT: xorl $18, %ebx ; X86-NEXT: orl %edi, %ebx -; X86-NEXT: pushl $10 +; X86-NEXT: pushl $9 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $20 +; X86-NEXT: pushl $19 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $20, %edi +; X86-NEXT: xorl $19, %edi ; X86-NEXT: orl %ebx, %edi ; X86-NEXT: pushl $10 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $21 +; X86-NEXT: pushl $20 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $21, %ebx +; X86-NEXT: xorl $20, %ebx ; X86-NEXT: orl %edi, %ebx -; X86-NEXT: pushl $11 +; X86-NEXT: pushl $10 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $22 +; X86-NEXT: pushl $21 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $22, %edi +; X86-NEXT: xorl $21, %edi ; X86-NEXT: orl %ebx, %edi ; X86-NEXT: pushl $11 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $23 +; X86-NEXT: pushl $22 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $23, %ebx +; X86-NEXT: xorl $22, %ebx ; X86-NEXT: orl %edi, %ebx -; X86-NEXT: pushl $12 +; X86-NEXT: pushl $11 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $24 +; X86-NEXT: pushl $23 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $24, %edi +; X86-NEXT: xorl $23, %edi ; X86-NEXT: orl %ebx, %edi ; X86-NEXT: pushl $12 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $25 +; X86-NEXT: pushl $24 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $25, %ebx +; X86-NEXT: xorl $24, %ebx ; X86-NEXT: orl %edi, %ebx -; X86-NEXT: pushl $13 +; X86-NEXT: pushl $12 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $26 +; X86-NEXT: pushl $25 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $26, %edi +; X86-NEXT: xorl 
$25, %edi ; X86-NEXT: orl %ebx, %edi ; X86-NEXT: pushl $13 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $27 +; X86-NEXT: pushl $26 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $27, %ebx +; X86-NEXT: xorl $26, %ebx ; X86-NEXT: orl %edi, %ebx -; X86-NEXT: pushl $14 +; X86-NEXT: pushl $13 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $28 +; X86-NEXT: pushl $27 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $28, %edi +; X86-NEXT: xorl $27, %edi ; X86-NEXT: orl %ebx, %edi ; X86-NEXT: pushl $14 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $29 +; X86-NEXT: pushl $28 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $29, %ebx +; X86-NEXT: xorl $28, %ebx ; X86-NEXT: orl %edi, %ebx -; X86-NEXT: pushl $15 +; X86-NEXT: pushl $14 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $30 +; X86-NEXT: pushl $29 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $30, %edi +; X86-NEXT: xorl $29, %edi ; X86-NEXT: orl %ebx, %edi ; X86-NEXT: pushl $15 ; X86-NEXT: .cfi_adjust_cfa_offset 4 -; X86-NEXT: pushl $31 +; X86-NEXT: pushl $30 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $31, %ebx +; X86-NEXT: xorl $30, %ebx ; X86-NEXT: orl %edi, %ebx -; X86-NEXT: orl %esi, %ebx +; X86-NEXT: pushl $15 +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: pushl $31 +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: calll mult@PLT +; X86-NEXT: addl $8, %esp +; X86-NEXT: .cfi_adjust_cfa_offset -8 +; X86-NEXT: movl %eax, %edi +; X86-NEXT: xorl $31, %edi +; X86-NEXT: orl %ebx, %edi +; X86-NEXT: orl %esi, %edi ; X86-NEXT: pushl $16 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $32 @@ -844,7 +844,7 @@ ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: xorl $32, %eax ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: orl %ebx, %eax +; X86-NEXT: orl %edi, %eax ; X86-NEXT: setne %cl ; X86-NEXT: negl %ecx ; X86-NEXT: movl %ecx, %eax Index: llvm/test/CodeGen/X86/mul-i1024.ll =================================================================== --- llvm/test/CodeGen/X86/mul-i1024.ll +++ llvm/test/CodeGen/X86/mul-i1024.ll @@ -3143,9 +3143,10 @@ ; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: addl %ecx, %edx -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: imull %ebp, %esi ; X32-NEXT: addl %edx, %esi -; X32-NEXT: movl %esi, %ebp +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload @@ -3155,51 +3156,49 @@ ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %edi, %eax ; X32-NEXT: imull %edi, %esi ; X32-NEXT: addl %edx, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload 
-; X32-NEXT: movl %ecx, %edi -; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx +; X32-NEXT: addl %ecx, %ebx ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %edi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb %bl +; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %edi ; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl %bl, %ecx +; X32-NEXT: movzbl %cl, %ecx ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: imull %ebp, %edi -; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: imull %eax, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %edi, %edx -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: addl %edx, %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl %edx, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload @@ -3213,14 +3212,15 @@ ; X32-NEXT: addl %edx, %ecx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl %ebp, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %esi, %ecx @@ -3390,18 +3390,17 @@ ; X32-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl 112(%ecx), %edi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: imull %edi, %ebx -; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: imull %edi, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %ebx, %edx +; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl 116(%ecx), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: imull %eax, %esi -; X32-NEXT: addl %edx, %esi -; X32-NEXT: movl %esi, %ebx +; X32-NEXT: imull %eax, %ebx +; X32-NEXT: addl %edx, %ebx ; X32-NEXT: movl 120(%ecx), %eax ; X32-NEXT: movl %eax, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -3443,13 +3442,13 @@ ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: imull %eax, %edi +; X32-NEXT: imull %eax, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %edi, %edx +; X32-NEXT: addl %esi, %edx ; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: addl %edx, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -4499,16 +4498,16 @@ ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %edi, %ecx -; X32-NEXT: imull %eax, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl %edi, %esi +; X32-NEXT: imull %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: addl %ecx, %edx +; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: imull %ebp, %esi -; X32-NEXT: addl %edx, %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: imull %ebp, %ecx +; X32-NEXT: addl %edx, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload @@ -4553,16 +4552,15 @@ ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: imull %esi, %ebx -; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: imull %esi, %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: addl %ebx, %edx -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: addl %edx, 
%ecx -; X32-NEXT: movl %ecx, %ebx +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl %edx, %ebx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload @@ -4608,45 +4606,46 @@ ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, %ebp ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, %edx +; X32-NEXT: movl %ecx, %ebp ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, %edx ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, %edi +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%esp), %eax # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: 
movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -4667,6 +4666,7 @@ ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -5044,7 +5044,7 @@ ; X64-NEXT: setb %r14b ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq %r10, (%rsp) # 8-byte Spill +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r11 @@ -5119,7 +5119,7 @@ ; X64-NEXT: adcq %rax, %r10 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload ; X64-NEXT: movq %r12, %rax -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r11, (%rsp) # 8-byte Spill ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -5239,7 +5239,7 @@ ; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq (%rsp), %rsi # 8-byte Reload ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -5384,7 +5384,7 @@ ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: movq 64(%rsi), %rdi -; X64-NEXT: movq (%rsp), %rbx # 8-byte Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload ; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rcx @@ -5483,20 +5483,19 @@ ; X64-NEXT: addq %r15, %rsi ; X64-NEXT: adcq %r10, %rbx ; X64-NEXT: setb %r9b -; X64-NEXT: movq (%rsp), %r14 # 8-byte Reload -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %r10 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rcx, %rbp +; X64-NEXT: addq %r14, %rbp ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq 
%r14, %rax -; X64-NEXT: movq %r14, %r15 +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: addq %rbp, %rax @@ -5504,6 +5503,7 @@ ; X64-NEXT: adcq %rdi, %rcx ; X64-NEXT: setb %dil ; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r8, %r14 ; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: addq %rcx, %r12 @@ -5517,11 +5517,11 @@ ; X64-NEXT: adcq %rax, %r12 ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: movq (%rsp), %rax # 8-byte Reload ; X64-NEXT: imulq %rax, %r13 -; X64-NEXT: movq %rax, %r10 +; X64-NEXT: movq %rax, %r15 ; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rax, %r8 +; X64-NEXT: movq %rax, %r10 ; X64-NEXT: addq %r13, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NEXT: imulq %rdi, %r11 @@ -5534,19 +5534,18 @@ ; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: addq %rsi, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: imulq %rbp, %rax -; X64-NEXT: addq %rdx, %rax -; X64-NEXT: addq %r8, %r9 -; X64-NEXT: adcq %r11, %rax -; X64-NEXT: movq %rax, %r8 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: imulq %rbp, %r8 +; X64-NEXT: addq %rdx, %r8 +; X64-NEXT: addq %r10, %r9 +; X64-NEXT: adcq %r11, %r8 ; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r10 +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rbx, %rax ; X64-NEXT: movq %rbx, %r11 -; X64-NEXT: mulq %r10 +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rcx, %rbx @@ -5554,8 +5553,8 @@ ; X64-NEXT: movq %rbp, %rax ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rbx, %r14 +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rbx, %r10 ; X64-NEXT: adcq %rsi, %rbp ; X64-NEXT: setb %cl ; X64-NEXT: movq %r11, %rax @@ -5575,29 +5574,28 @@ ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rax, %r10 +; X64-NEXT: movq %rax, %r15 ; X64-NEXT: addq %rcx, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; X64-NEXT: imulq %r8, %rsi ; X64-NEXT: addq %rdx, %rsi ; X64-NEXT: movq 96(%rdi), %rcx ; X64-NEXT: movq 104(%rdi), %rbp -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %r15, %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: movq %rax, %rdi ; X64-NEXT: imulq %rbp, %rdi ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: imulq %rcx, %rax -; X64-NEXT: addq %rdx, %rax -; X64-NEXT: addq %r10, %r9 -; X64-NEXT: adcq %rsi, %rax -; X64-NEXT: movq %rax, %r10 +; X64-NEXT: imulq %rcx, %r14 +; X64-NEXT: addq %rdx, %r14 +; X64-NEXT: addq %r15, %r9 +; X64-NEXT: adcq %rsi, %r14 +; X64-NEXT: movq %r14, %r15 ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r15 +; X64-NEXT: movq %rax, %r14 ; X64-NEXT: movq %rbp, %rax ; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %rbx @@ -5617,13 +5615,13 @@ ; X64-NEXT: movzbl %bl, %ecx ; X64-NEXT: adcq %rcx, %rdx ; X64-NEXT: addq %r9, %rax -; X64-NEXT: adcq %r10, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; X64-NEXT: adcq %r14, %rsi +; X64-NEXT: adcq %r15, %rdx +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded 
Reload +; X64-NEXT: adcq %r10, %rsi ; X64-NEXT: adcq %r11, %rax ; X64-NEXT: adcq %r13, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload ; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r12, %rax @@ -5666,7 +5664,7 @@ ; X64-NEXT: movq %r13, %rax ; X64-NEXT: movq %r14, %rdi ; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq 72(%r9), %rax ; X64-NEXT: movq %rax, %r9 @@ -5680,7 +5678,7 @@ ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: addq %rbp, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill ; X64-NEXT: adcq %rdi, %rsi ; X64-NEXT: setb %cl ; X64-NEXT: movq %r9, %rax @@ -5873,9 +5871,9 @@ ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: movq (%rsp), %rcx # 8-byte Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq (%rsp), %rdi # 8-byte Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload Index: llvm/test/CodeGen/X86/mul-i256.ll =================================================================== --- llvm/test/CodeGen/X86/mul-i256.ll +++ llvm/test/CodeGen/X86/mul-i256.ll @@ -58,30 +58,31 @@ ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl (%edi), %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl (%edi), %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl 4(%edi), %eax -; X32-NEXT: movl %eax, %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %ebx, %ecx ; X32-NEXT: setb %bl -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl %ecx, %ebp ; X32-NEXT: movzbl %bl, %eax @@ -92,24 +93,25 @@ ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 8(%eax), %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl 8(%eax), %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 12(%eax), %ecx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X32-NEXT: movl 12(%eax), %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %ecx +; X32-NEXT: movl %esi, (%esp) # 4-byte Spill ; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, %edi @@ -166,12 +168,13 @@ ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl 16(%ecx), %edi -; X32-NEXT: imull %edi, %ebx +; X32-NEXT: movl %ebx, %esi +; X32-NEXT: imull %edi, %esi ; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp ; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: addl %ebx, %edx +; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl 20(%ecx), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: imull %eax, %ebp @@ -217,41 +220,41 @@ ; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: movl 28(%ebx), %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: imull %esi, %ecx -; X32-NEXT: movl 24(%ebx), %edi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl 28(%edi), %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: imull %eax, %esi +; X32-NEXT: movl 24(%edi), %ecx +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %ecx, %edx -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: addl %edx, %edi -; X32-NEXT: movl 16(%ebx), %ebp -; X32-NEXT: movl 20(%ebx), %ebx +; X32-NEXT: addl %esi, %edx +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl %edx, %ecx +; X32-NEXT: movl 16(%edi), %ebp +; X32-NEXT: movl 20(%edi), %ebx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: imull %ebx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: imull %ebx, %edi ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %ebp -; 
X32-NEXT: addl %ecx, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: imull %ebp, %ecx -; X32-NEXT: addl %edx, %ecx +; X32-NEXT: addl %edi, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: imull %ebp, %esi +; X32-NEXT: addl %edx, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %edi, %ebx +; X32-NEXT: addl %esi, %ebx ; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload @@ -309,52 +312,55 @@ ; X64-NEXT: .cfi_def_cfa_offset 16 ; X64-NEXT: pushq %r14 ; X64-NEXT: .cfi_def_cfa_offset 24 -; X64-NEXT: pushq %rbx +; X64-NEXT: pushq %r12 ; X64-NEXT: .cfi_def_cfa_offset 32 -; X64-NEXT: .cfi_offset %rbx, -32 +; X64-NEXT: pushq %rbx +; X64-NEXT: .cfi_def_cfa_offset 40 +; X64-NEXT: .cfi_offset %rbx, -40 +; X64-NEXT: .cfi_offset %r12, -32 ; X64-NEXT: .cfi_offset %r14, -24 ; X64-NEXT: .cfi_offset %r15, -16 ; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq (%rdi), %r11 +; X64-NEXT: movq (%rdi), %r14 ; X64-NEXT: movq 8(%rdi), %r8 -; X64-NEXT: movq 16(%rdi), %rbx -; X64-NEXT: movq 16(%rsi), %r10 -; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: movq 16(%rdi), %rcx +; X64-NEXT: movq 16(%rsi), %rbx +; X64-NEXT: movq (%rsi), %r12 ; X64-NEXT: movq 8(%rsi), %r15 ; X64-NEXT: movq 24(%rdi), %rdi -; X64-NEXT: imulq %rcx, %rdi -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rax, %r14 +; X64-NEXT: imulq %r12, %rdi +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rax, %r10 ; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: imulq %r15, %rbx -; X64-NEXT: addq %rdx, %rbx -; X64-NEXT: movq %r10, %rdi +; X64-NEXT: imulq %r15, %rcx +; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: movq %rbx, %rdi ; X64-NEXT: imulq %r8, %rdi -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rax, %r10 +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rax, %r11 ; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: movq 24(%rsi), %rdi -; X64-NEXT: imulq %r11, %rdi -; X64-NEXT: addq %rdx, %rdi -; X64-NEXT: addq %r14, %r10 -; X64-NEXT: adcq %rbx, %rdi -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: movq 24(%rsi), %rbx +; X64-NEXT: imulq %r14, %rbx +; X64-NEXT: addq %rdx, %rbx +; X64-NEXT: addq %r10, %r11 +; X64-NEXT: adcq %rcx, %rbx +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r14 +; X64-NEXT: movq %rax, %r10 ; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rsi, %rbx +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rsi, %rdi ; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rbx, %r11 +; X64-NEXT: 
movq %rax, %r14 +; X64-NEXT: addq %rdi, %r14 ; X64-NEXT: adcq %rcx, %rsi ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %ecx @@ -362,13 +368,15 @@ ; X64-NEXT: mulq %r15 ; X64-NEXT: addq %rsi, %rax ; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %r10, %rax -; X64-NEXT: adcq %rdi, %rdx -; X64-NEXT: movq %r14, (%r9) -; X64-NEXT: movq %r11, 8(%r9) +; X64-NEXT: addq %r11, %rax +; X64-NEXT: adcq %rbx, %rdx +; X64-NEXT: movq %r10, (%r9) +; X64-NEXT: movq %r14, 8(%r9) ; X64-NEXT: movq %rax, 16(%r9) ; X64-NEXT: movq %rdx, 24(%r9) ; X64-NEXT: popq %rbx +; X64-NEXT: .cfi_def_cfa_offset 32 +; X64-NEXT: popq %r12 ; X64-NEXT: .cfi_def_cfa_offset 24 ; X64-NEXT: popq %r14 ; X64-NEXT: .cfi_def_cfa_offset 16 Index: llvm/test/CodeGen/X86/mul-i512.ll =================================================================== --- llvm/test/CodeGen/X86/mul-i512.ll +++ llvm/test/CodeGen/X86/mul-i512.ll @@ -9,7 +9,7 @@ ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: subl $180, %esp +; X32-NEXT: subl $184, %esp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl 28(%edx), %ecx @@ -33,6 +33,7 @@ ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %ecx, %ebp +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -46,46 +47,45 @@ ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 16(%ecx), %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 16(%ecx), %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl 20(%ecx), %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 20(%ecx), %ebx +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %esi, %ecx ; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebp, %esi -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %edi, %ebp ; X32-NEXT: setb %cl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebp, %esi -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 8(%eax), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl 8(%edi), %ebp ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -95,25 +95,25 @@ ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl %ecx, %ebx ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 12(%eax), %ecx -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl 12(%edi), %ecx +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx +; X32-NEXT: movl %eax, %esi ; X32-NEXT: adcl %ebp, %edi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %edi, %ecx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %ecx ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload @@ -191,13 +191,13 @@ ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill ; X32-NEXT: movl 4(%ecx), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ebx @@ -328,7 +328,7 @@ ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ebx ; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -362,7 +362,7 @@ ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 24(%eax), %ebx @@ -399,21 +399,21 @@ ; 
X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl %ebp, %ecx @@ -421,7 +421,7 @@ ; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: adcl %esi, %ebp ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -431,8 +431,8 @@ ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: addl %ebx, %edi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload +; X32-NEXT: movl (%esp), %ebx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %eax ; X32-NEXT: adcl $0, %edx @@ -454,7 +454,7 @@ ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload @@ -524,13 +524,13 @@ ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload @@ -539,13 +539,13 @@ ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: adcl %ebx, %ecx -; X32-NEXT: setb (%esp) # 1-byte Folded Spill +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull 
%edi ; X32-NEXT: movl %edi, %ebx ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl %ecx, %edi -; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload @@ -554,7 +554,7 @@ ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax @@ -583,14 +583,14 @@ ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl %edi, %ecx -; X32-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload ; X32-NEXT: adcl %esi, %eax ; X32-NEXT: movl %eax, %esi ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl (%esp), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -624,7 +624,7 @@ ; X32-NEXT: movl 36(%eax), %ecx ; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl %ecx, %esi -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax @@ -652,18 +652,18 @@ ; X32-NEXT: addl %ecx, %ebp ; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %edi, %esi -; X32-NEXT: setb (%esp) # 1-byte Folded Spill +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl %esi, %edi -; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -676,17 +676,17 @@ ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 44(%eax), %esi 
-; X32-NEXT: movl %esi, (%esp) # 4-byte Spill +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx @@ -722,14 +722,14 @@ ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull (%esp) # 4-byte Folded Reload +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: adcl %edi, %ecx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl (%esp), %edi # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload @@ -747,7 +747,7 @@ ; X32-NEXT: imull %eax, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: imull %ebx, %ecx @@ -755,7 +755,7 @@ ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl (%esp), %edi # 4-byte Reload ; X32-NEXT: imull %edi, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp @@ -765,8 +765,8 @@ ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: imull %ebp, %esi ; X32-NEXT: addl %edx, %esi -; X32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -787,12 +787,12 @@ ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %ecx ; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl (%esp), %eax # 4-byte Reload ; X32-NEXT: mull %ebp ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movzbl %bl, %ecx ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -882,13 +882,13 @@ ; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %ebx, %esi ; X32-NEXT: setb %cl ; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ebp ; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -923,9 +923,9 @@ ; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; 
X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax @@ -955,24 +955,24 @@ ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %ecx ; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edi ; X32-NEXT: adcl $0, %ecx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload @@ -980,17 +980,17 @@ ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: setb (%esp) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, %esi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl %ecx, %ebp -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebp, (%esp) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %esi ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -998,21 +998,21 @@ ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl 48(%ecx), %edi -; X32-NEXT: imull %edi, %ebx +; X32-NEXT: movl %ebx, %esi +; X32-NEXT: imull %edi, %esi ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl 
%ebx, %edx +; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl 52(%ecx), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: imull %eax, %esi -; X32-NEXT: addl %edx, %esi -; X32-NEXT: movl %esi, %ebp +; X32-NEXT: imull %eax, %ebx +; X32-NEXT: addl %edx, %ebx ; X32-NEXT: movl 56(%ecx), %eax ; X32-NEXT: movl %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: imull %ebx, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: imull %ebp, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: addl %esi, %edx @@ -1022,16 +1022,16 @@ ; X32-NEXT: addl %edx, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: adcl %ebx, %esi ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload @@ -1050,17 +1050,18 @@ ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: imull %esi, %edi -; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %edx -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: addl %edx, %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: imull %eax, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: imull %ebp, %edi +; X32-NEXT: addl %edx, %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload @@ -1071,12 +1072,11 @@ ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: imull %ebx, %ecx ; X32-NEXT: addl %edx, %ecx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, %ebp ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -1086,60 +1086,56 @@ ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %ebx, %ecx ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl 
%ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl %esi, %edi -; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movzbl %cl, %ecx -; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl (%esp), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl (%esp), %edi # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, (%ecx) @@ -1163,12 +1159,12 @@ ; X32-NEXT: movl %esi, 36(%ecx) ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, 40(%ecx) -; X32-NEXT: movl %edi, 44(%ecx) -; X32-NEXT: movl %edx, 48(%ecx) -; X32-NEXT: movl %ebp, 52(%ecx) +; X32-NEXT: movl %ebx, 44(%ecx) +; X32-NEXT: movl %ebp, 48(%ecx) +; X32-NEXT: movl %edi, 52(%ecx) ; X32-NEXT: movl %eax, 56(%ecx) -; X32-NEXT: movl %ebx, 60(%ecx) -; X32-NEXT: addl $180, %esp +; X32-NEXT: movl %edx, 60(%ecx) +; X32-NEXT: addl $184, %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx @@ -1183,23 +1179,22 @@ ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: pushq %rax -; X64-NEXT: movq %rdx, (%rsp) # 8-byte Spill +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq (%rdi), %r9 -; X64-NEXT: movq 8(%rdi), %r15 -; X64-NEXT: movq 24(%rdi), %r12 +; X64-NEXT: movq 8(%rdi), %r8 +; X64-NEXT: movq 24(%rdi), %r15 ; X64-NEXT: movq 16(%rdi), %rax -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq (%rsi), %rdi ; X64-NEXT: movq 8(%rsi), %r14 +; X64-NEXT: movq %rsi, %r12 ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: movq %r12, %rax -; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %rbx @@ -1213,36 +1208,35 @@ ; X64-NEXT: adcq %rbp, %rcx ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %esi -; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %r14 +; X64-NEXT: movq %r14, %r15 ; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rcx, %r13 -; X64-NEXT: adcq %rsi, %r8 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rcx, %r14 +; X64-NEXT: adcq %rsi, %rdx +; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %r9, %rax ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %rcx, %rbp ; X64-NEXT: adcq $0, %rbx ; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %r9, %r12 -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r14 +; X64-NEXT: movq %r9, %rdi +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: addq %rbp, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %rbx, %rcx ; X64-NEXT: setb %sil -; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r14 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rcx, %rbx @@ -1250,190 +1244,190 @@ ; X64-NEXT: adcq %rax, %r15 ; X64-NEXT: addq %r11, %rbx ; X64-NEXT: adcq %r10, %r15 -; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq %r8, %r14 ; X64-NEXT: adcq $0, %r14 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq 16(%rsi), %r8 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r9 +; X64-NEXT: adcq $0, %r13 +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq 16(%r12), %r9 ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %r11 ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r8 +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r8, %r11 +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %r10, %rbp ; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq 24(%rsi), %rdi -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: movq 24(%r12), %r8 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: addq %rbp, %rax ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: adcq %rcx, %rsi ; X64-NEXT: setb %cl ; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: addq %rsi, %r12 ; X64-NEXT: movzbl %cl, %eax ; X64-NEXT: adcq %rax, %r11 -; X64-NEXT: addq %rbx, %r9 -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq %rbx, %r13 +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r15, %rbp ; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %r12 ; X64-NEXT: adcq $0, %r11 -; X64-NEXT: addq %r13, %r12 -; X64-NEXT: adcq %r14, %r11 -; X64-NEXT: setb %r9b +; X64-NEXT: addq %r14, %r12 +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; X64-NEXT: setb %r15b ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r14 +; X64-NEXT: movq %rax, %rdi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %rcx, %rbp ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: addq %rbp, %rax ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: adcq 
%rsi, %rcx ; X64-NEXT: setb %bl ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movzbl %bl, %ecx -; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %r12, %r14 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rcx, %r14 +; X64-NEXT: movzbl %bl, %eax +; X64-NEXT: adcq %rax, %rdx +; X64-NEXT: addq %r12, %rdi +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r11, %rbp ; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl %r9b, %ecx -; X64-NEXT: adcq %rcx, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %r15b, %eax +; X64-NEXT: adcq %rax, %r14 ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq 32(%rcx), %r10 -; X64-NEXT: imulq %r10, %rdi -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: movq 32(%rcx), %r12 +; X64-NEXT: imulq %r12, %r8 +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: movq 40(%rcx), %r9 -; X64-NEXT: imulq %r9, %r8 -; X64-NEXT: addq %rdx, %r8 +; X64-NEXT: addq %r8, %rdx +; X64-NEXT: movq 40(%rcx), %r8 +; X64-NEXT: imulq %r8, %r9 +; X64-NEXT: addq %rdx, %r9 ; X64-NEXT: movq 48(%rcx), %rax +; X64-NEXT: movq %rcx, %rbp ; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: imulq %r15, %rdi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: imulq %rbx, %rdi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rax, %r12 +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rax, %rcx ; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: movq 56(%rcx), %rbp -; X64-NEXT: imulq %rsi, %rbp +; X64-NEXT: movq 56(%rbp), %rbp +; X64-NEXT: imulq %rbx, %rbp ; X64-NEXT: addq %rdx, %rbp -; X64-NEXT: addq %r11, %r12 -; X64-NEXT: adcq %r8, %rbp -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq %rsi, %rcx -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq %r11, %rcx +; X64-NEXT: adcq %r9, %rbp ; X64-NEXT: movq %rbx, %rax ; X64-NEXT: movq %rbx, %r11 -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rdi, %rbx -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rbx, %r10 -; X64-NEXT: adcq %rsi, %rdi -; X64-NEXT: setb %bl +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r9, %rbx +; X64-NEXT: adcq $0, %rdi ; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: addq %rbx, %r12 +; X64-NEXT: adcq %rdi, %rsi +; X64-NEXT: setb %bl +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rdi, %r13 +; X64-NEXT: addq %rsi, %r13 ; X64-NEXT: movzbl %bl, %eax ; X64-NEXT: adcq %rax, %r15 -; X64-NEXT: addq %r12, %r13 +; X64-NEXT: addq %rcx, %r13 ; X64-NEXT: adcq %rbp, %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; 
X64-NEXT: movq 56(%rdx), %rcx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: imulq %rax, %rcx -; X64-NEXT: movq 48(%rdx), %rbp -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rax, %r12 +; X64-NEXT: movq 48(%rdx), %rbx +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %rcx, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: imulq %r8, %rbp -; X64-NEXT: addq %rdx, %rbp -; X64-NEXT: movq 32(%rsi), %rdi -; X64-NEXT: movq 40(%rsi), %rbx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: imulq %r9, %rbx +; X64-NEXT: addq %rdx, %rbx +; X64-NEXT: movq 32(%r8), %rdi +; X64-NEXT: movq 40(%r8), %r8 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: imulq %rbx, %rsi -; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rsi, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: imulq %rdi, %r14 -; X64-NEXT: addq %rdx, %r14 -; X64-NEXT: addq %r12, %rcx -; X64-NEXT: adcq %rbp, %r14 +; X64-NEXT: imulq %r8, %rcx +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rcx, %rdx +; X64-NEXT: imulq %rdi, %r10 +; X64-NEXT: addq %rdx, %r10 +; X64-NEXT: addq %rsi, %r11 +; X64-NEXT: adcq %rbx, %r10 ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rbp, %r12 -; X64-NEXT: adcq $0, %r11 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %rbx, %rbp +; X64-NEXT: adcq $0, %rcx ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %r12, %rdi -; X64-NEXT: adcq %r11, %rbp -; X64-NEXT: setb %r9b -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: addq %rbp, %rax -; X64-NEXT: movzbl %r9b, %ebp -; X64-NEXT: adcq %rbp, %rdx -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: adcq %r14, %rdx +; X64-NEXT: addq %rbp, %rdi +; X64-NEXT: adcq %rcx, %rbx +; X64-NEXT: setb %cl +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: adcq %rcx, %rdx +; X64-NEXT: addq %r11, %rax +; X64-NEXT: adcq %r10, %rdx ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; X64-NEXT: adcq %r10, %rdi +; X64-NEXT: adcq %r12, %rdi ; X64-NEXT: adcq %r13, %rax ; X64-NEXT: adcq %r15, %rdx ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; X64-NEXT: adcq %r14, %rax ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: movq (%rsp), %rcx # 8-byte Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload ; X64-NEXT: movq %rbp, (%rcx) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload @@ -1446,7 +1440,6 @@ ; X64-NEXT: movq %rdi, 40(%rcx) ; X64-NEXT: movq %rax, 48(%rcx) ; X64-NEXT: movq %rdx, 56(%rcx) -; X64-NEXT: addq $8, %rsp ; X64-NEXT: 
popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 Index: llvm/test/CodeGen/X86/mul128.ll =================================================================== --- llvm/test/CodeGen/X86/mul128.ll +++ llvm/test/CodeGen/X86/mul128.ll @@ -30,54 +30,55 @@ ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: imull %edx, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %edx -; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %edi, %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: imull {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp -; X86-NEXT: addl %esi, %edx +; X86-NEXT: imull %ecx, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: imull %ebp, %edi +; X86-NEXT: imull %esi, %edi ; X86-NEXT: addl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl %ebp, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: imull %ebp, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull %esi, %ecx +; X86-NEXT: addl %edx, %ecx ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %esi +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl %esi, %ebx -; X86-NEXT: setb %cl +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: setb %bl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movzbl %bl, %esi +; X86-NEXT: adcl %esi, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %edi, %edx +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%esp), %esi # 4-byte Reload ; X86-NEXT: movl %esi, (%ecx) Index: llvm/test/CodeGen/X86/neg-abs.ll =================================================================== --- llvm/test/CodeGen/X86/neg-abs.ll +++ llvm/test/CodeGen/X86/neg-abs.ll @@ -113,27 +113,27 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %edx, %ecx +; 
X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: xorl %ecx, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %edx, %esi +; X86-NEXT: xorl %ecx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %edx, %edi +; X86-NEXT: xorl %ecx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %edx, %ebx -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: xorl %ecx, %ebx +; X86-NEXT: movl %ecx, %ebp ; X86-NEXT: subl %ebx, %ebp -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %ecx, %ebx ; X86-NEXT: sbbl %edi, %ebx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %ecx, %edi ; X86-NEXT: sbbl %esi, %edi -; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl %ebp, (%eax) ; X86-NEXT: movl %ebx, 4(%eax) ; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx Index: llvm/test/CodeGen/X86/nontemporal.ll =================================================================== --- llvm/test/CodeGen/X86/nontemporal.ll +++ llvm/test/CodeGen/X86/nontemporal.ll @@ -59,31 +59,31 @@ ; X86-AVX-NEXT: vmovdqa 56(%ebp), %xmm4 ; X86-AVX-NEXT: vmovdqa 40(%ebp), %xmm5 ; X86-AVX-NEXT: vmovdqa 24(%ebp), %xmm6 -; X86-AVX-NEXT: movl 8(%ebp), %edx -; X86-AVX-NEXT: movl 80(%ebp), %esi -; X86-AVX-NEXT: movl (%esi), %eax +; X86-AVX-NEXT: movl 8(%ebp), %esi +; X86-AVX-NEXT: movl 80(%ebp), %edx +; X86-AVX-NEXT: movl (%edx), %eax ; X86-AVX-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovntps %xmm0, (%edx) +; X86-AVX-NEXT: vmovntps %xmm0, (%esi) ; X86-AVX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm0 -; X86-AVX-NEXT: addl (%esi), %eax -; X86-AVX-NEXT: vmovntdq %xmm0, (%edx) +; X86-AVX-NEXT: addl (%edx), %eax +; X86-AVX-NEXT: vmovntdq %xmm0, (%esi) ; X86-AVX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm0 -; X86-AVX-NEXT: addl (%esi), %eax -; X86-AVX-NEXT: vmovntpd %xmm0, (%edx) +; X86-AVX-NEXT: addl (%edx), %eax +; X86-AVX-NEXT: vmovntpd %xmm0, (%esi) ; X86-AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm6, %xmm0 -; X86-AVX-NEXT: addl (%esi), %eax -; X86-AVX-NEXT: vmovntdq %xmm0, (%edx) +; X86-AVX-NEXT: addl (%edx), %eax +; X86-AVX-NEXT: vmovntdq %xmm0, (%esi) ; X86-AVX-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5, %xmm0 -; X86-AVX-NEXT: addl (%esi), %eax -; X86-AVX-NEXT: vmovntdq %xmm0, (%edx) +; X86-AVX-NEXT: addl (%edx), %eax +; X86-AVX-NEXT: vmovntdq %xmm0, (%esi) ; X86-AVX-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4, %xmm0 -; X86-AVX-NEXT: addl (%esi), %eax -; X86-AVX-NEXT: vmovntdq %xmm0, (%edx) -; X86-AVX-NEXT: addl (%esi), %eax -; X86-AVX-NEXT: movntil %ecx, (%edx) -; X86-AVX-NEXT: addl (%esi), %eax -; X86-AVX-NEXT: vmovsd %xmm3, (%edx) -; X86-AVX-NEXT: addl (%esi), %eax +; X86-AVX-NEXT: addl (%edx), %eax +; X86-AVX-NEXT: vmovntdq %xmm0, (%esi) +; X86-AVX-NEXT: addl (%edx), %eax +; X86-AVX-NEXT: movntil %ecx, (%esi) +; X86-AVX-NEXT: addl (%edx), %eax +; X86-AVX-NEXT: vmovsd %xmm3, (%esi) +; X86-AVX-NEXT: addl (%edx), %eax ; X86-AVX-NEXT: leal -4(%ebp), %esp ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: popl %ebp Index: llvm/test/CodeGen/X86/nosse-vector.ll =================================================================== --- llvm/test/CodeGen/X86/nosse-vector.ll +++ llvm/test/CodeGen/X86/nosse-vector.ll @@ -144,20 +144,20 @@ ; X32-NEXT: pushl %esi ; X32-NEXT: andl $-8, %esp ; X32-NEXT: subl $48, %esp -; X32-NEXT: movl 8(%ebp), %eax -; X32-NEXT: movl 24(%eax), %ecx -; X32-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl 28(%eax), %ecx -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X32-NEXT: movl 16(%eax), %esi -; X32-NEXT: movl 20(%eax), %edi -; X32-NEXT: movl 8(%eax), %ebx -; X32-NEXT: movl 12(%eax), %edx -; X32-NEXT: movl (%eax), %ecx -; X32-NEXT: movl 4(%eax), %eax +; X32-NEXT: movl 8(%ebp), %edx +; X32-NEXT: movl 24(%edx), %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 28(%edx), %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl 16(%edx), %esi +; X32-NEXT: movl 20(%edx), %edi +; X32-NEXT: movl 8(%edx), %ebx +; X32-NEXT: movl 12(%edx), %ecx +; X32-NEXT: movl (%edx), %eax +; X32-NEXT: movl 4(%edx), %edx +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) Index: llvm/test/CodeGen/X86/overflow.ll =================================================================== --- llvm/test/CodeGen/X86/overflow.ll +++ llvm/test/CodeGen/X86/overflow.ll @@ -9,11 +9,11 @@ ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %ebp, %eax @@ -22,15 +22,15 @@ ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl %edi, %ebp ; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebp, %eax -; X32-NEXT: adcl %ebx, %ecx +; X32-NEXT: adcl %ebx, %esi ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: mull %esi -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movzbl %bl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx Index: llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll =================================================================== --- llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll +++ llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll @@ -276,8 +276,8 @@ ; CHECK32-NEXT: pushl %edi ; CHECK32-NEXT: pushl %esi ; CHECK32-NEXT: pushl %eax -; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edi ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edi ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ebx @@ -286,8 +286,8 @@ ; CHECK32-NEXT: lock cmpxchg8b (%esi) ; CHECK32-NEXT: setne {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK32-NEXT: movl %edi, %edx -; CHECK32-NEXT: movl %ebp, %ecx +; CHECK32-NEXT: movl %ebp, %edx +; CHECK32-NEXT: movl %edi, %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK32-NEXT: lock cmpxchg8b (%esi) Index: llvm/test/CodeGen/X86/popcnt.ll =================================================================== --- llvm/test/CodeGen/X86/popcnt.ll +++ llvm/test/CodeGen/X86/popcnt.ll @@ -445,42 +445,42 @@ ; X86-SSE2-LABEL: cnt128: ; X86-SSE2: # %bb.0: ; 
X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $1, %xmm1 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: psubb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE2-NEXT: pand %xmm1, %xmm3 -; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] ; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: paddb %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE2-NEXT: psrlw $4, %xmm3 -; X86-SSE2-NEXT: paddb %xmm0, %xmm3 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE2-NEXT: psubb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE2-NEXT: pand %xmm0, %xmm3 -; X86-SSE2-NEXT: pxor %xmm4, %xmm4 -; X86-SSE2-NEXT: psadbw %xmm4, %xmm3 -; X86-SSE2-NEXT: movd %xmm3, %ecx -; X86-SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE2-NEXT: psrlw $2, %xmm2 +; X86-SSE2-NEXT: pand %xmm0, %xmm2 +; X86-SSE2-NEXT: paddb %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: psrlw $4, %xmm4 +; X86-SSE2-NEXT: paddb %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pxor %xmm3, %xmm3 +; X86-SSE2-NEXT: psadbw %xmm3, %xmm4 +; X86-SSE2-NEXT: movd %xmm4, %ecx +; X86-SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 ; X86-SSE2-NEXT: psrlw $1, %xmm5 -; X86-SSE2-NEXT: pand %xmm2, %xmm5 -; X86-SSE2-NEXT: psubb %xmm5, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlw $2, %xmm3 -; X86-SSE2-NEXT: pand %xmm1, %xmm3 -; X86-SSE2-NEXT: paddb %xmm2, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: paddb %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm5 +; X86-SSE2-NEXT: psubb %xmm5, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: psadbw %xmm4, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %edx +; X86-SSE2-NEXT: psrlw $2, %xmm4 +; X86-SSE2-NEXT: pand %xmm0, %xmm4 +; X86-SSE2-NEXT: paddb %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: psrlw $4, %xmm0 +; X86-SSE2-NEXT: paddb %xmm4, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm3, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %edx ; X86-SSE2-NEXT: addl %ecx, %edx ; X86-SSE2-NEXT: movl %edx, (%eax) ; X86-SSE2-NEXT: movl $0, 12(%eax) @@ -491,32 +491,32 @@ ; X86-SSSE3-LABEL: cnt128: ; X86-SSSE3: # %bb.0: ; X86-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X86-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X86-SSSE3-NEXT: movdqa %xmm1, %xmm2 -; X86-SSSE3-NEXT: pand %xmm0, %xmm2 -; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X86-SSSE3-NEXT: movdqa %xmm3, %xmm4 -; X86-SSSE3-NEXT: pshufb %xmm2, %xmm4 -; X86-SSSE3-NEXT: psrlw $4, %xmm1 -; X86-SSSE3-NEXT: pand %xmm0, 
%xmm1 -; X86-SSSE3-NEXT: movdqa %xmm3, %xmm2 -; X86-SSSE3-NEXT: pshufb %xmm1, %xmm2 -; X86-SSSE3-NEXT: paddb %xmm4, %xmm2 -; X86-SSSE3-NEXT: pxor %xmm1, %xmm1 -; X86-SSSE3-NEXT: psadbw %xmm1, %xmm2 -; X86-SSSE3-NEXT: movd %xmm2, %ecx +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; X86-SSSE3-NEXT: movdqa %xmm2, %xmm4 -; X86-SSSE3-NEXT: pand %xmm0, %xmm4 -; X86-SSSE3-NEXT: movdqa %xmm3, %xmm5 -; X86-SSSE3-NEXT: pshufb %xmm4, %xmm5 +; X86-SSSE3-NEXT: movdqa %xmm2, %xmm3 +; X86-SSSE3-NEXT: pand %xmm1, %xmm3 +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSSE3-NEXT: movdqa %xmm0, %xmm4 +; X86-SSSE3-NEXT: pshufb %xmm3, %xmm4 ; X86-SSSE3-NEXT: psrlw $4, %xmm2 -; X86-SSSE3-NEXT: pand %xmm0, %xmm2 +; X86-SSSE3-NEXT: pand %xmm1, %xmm2 +; X86-SSSE3-NEXT: movdqa %xmm0, %xmm3 ; X86-SSSE3-NEXT: pshufb %xmm2, %xmm3 -; X86-SSSE3-NEXT: paddb %xmm5, %xmm3 -; X86-SSSE3-NEXT: psadbw %xmm1, %xmm3 -; X86-SSSE3-NEXT: movd %xmm3, %edx +; X86-SSSE3-NEXT: paddb %xmm4, %xmm3 +; X86-SSSE3-NEXT: pxor %xmm2, %xmm2 +; X86-SSSE3-NEXT: psadbw %xmm2, %xmm3 +; X86-SSSE3-NEXT: movd %xmm3, %ecx +; X86-SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; X86-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; X86-SSSE3-NEXT: pand %xmm1, %xmm4 +; X86-SSSE3-NEXT: movdqa %xmm0, %xmm5 +; X86-SSSE3-NEXT: pshufb %xmm4, %xmm5 +; X86-SSSE3-NEXT: psrlw $4, %xmm3 +; X86-SSSE3-NEXT: pand %xmm1, %xmm3 +; X86-SSSE3-NEXT: pshufb %xmm3, %xmm0 +; X86-SSSE3-NEXT: paddb %xmm5, %xmm0 +; X86-SSSE3-NEXT: psadbw %xmm2, %xmm0 +; X86-SSSE3-NEXT: movd %xmm0, %edx ; X86-SSSE3-NEXT: addl %ecx, %edx ; X86-SSSE3-NEXT: movl %edx, (%eax) ; X86-SSSE3-NEXT: movl $0, 12(%eax) @@ -667,41 +667,41 @@ ; X86-NOSSE-NEXT: pushl %edi ; X86-NOSSE-NEXT: pushl %esi ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %ecx, %edx -; X86-NOSSE-NEXT: shrl %edx -; X86-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %esi, %edx -; X86-NOSSE-NEXT: subl %edx, %ecx -; X86-NOSSE-NEXT: movl $858993459, %edx # imm = 0x33333333 -; X86-NOSSE-NEXT: movl %ecx, %edi -; X86-NOSSE-NEXT: andl %edx, %edi -; X86-NOSSE-NEXT: shrl $2, %ecx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOSSE-NEXT: movl %esi, %ecx +; X86-NOSSE-NEXT: shrl %ecx +; X86-NOSSE-NEXT: movl $1431655765, %edx # imm = 0x55555555 ; X86-NOSSE-NEXT: andl %edx, %ecx -; X86-NOSSE-NEXT: addl %edi, %ecx -; X86-NOSSE-NEXT: movl %ecx, %edi -; X86-NOSSE-NEXT: shrl $4, %edi -; X86-NOSSE-NEXT: addl %ecx, %edi -; X86-NOSSE-NEXT: movl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: subl %ecx, %esi +; X86-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333 +; X86-NOSSE-NEXT: movl %esi, %edi ; X86-NOSSE-NEXT: andl %ecx, %edi -; X86-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edi +; X86-NOSSE-NEXT: shrl $2, %esi +; X86-NOSSE-NEXT: andl %ecx, %esi +; X86-NOSSE-NEXT: addl %edi, %esi +; X86-NOSSE-NEXT: movl %esi, %ebx +; X86-NOSSE-NEXT: shrl $4, %ebx +; X86-NOSSE-NEXT: addl %esi, %ebx +; X86-NOSSE-NEXT: movl $252645135, %edi # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: andl %edi, %ebx +; X86-NOSSE-NEXT: imull $16843009, %ebx, %esi # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %esi ; X86-NOSSE-NEXT: movl %eax, %ebx ; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl %esi, %ebx +; X86-NOSSE-NEXT: andl %edx, %ebx ; X86-NOSSE-NEXT: subl %ebx, %eax -; X86-NOSSE-NEXT: movl %eax, %esi -; 
X86-NOSSE-NEXT: andl %edx, %esi -; X86-NOSSE-NEXT: shrl $2, %eax -; X86-NOSSE-NEXT: andl %edx, %eax -; X86-NOSSE-NEXT: addl %esi, %eax ; X86-NOSSE-NEXT: movl %eax, %edx -; X86-NOSSE-NEXT: shrl $4, %edx -; X86-NOSSE-NEXT: addl %eax, %edx ; X86-NOSSE-NEXT: andl %ecx, %edx -; X86-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $2, %eax +; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: addl %edx, %eax +; X86-NOSSE-NEXT: movl %eax, %ecx +; X86-NOSSE-NEXT: shrl $4, %ecx +; X86-NOSSE-NEXT: addl %eax, %ecx +; X86-NOSSE-NEXT: andl %edi, %ecx +; X86-NOSSE-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %eax -; X86-NOSSE-NEXT: addl %edi, %eax +; X86-NOSSE-NEXT: addl %esi, %eax ; X86-NOSSE-NEXT: xorl %edx, %edx ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %edi @@ -845,35 +845,36 @@ ; X86-NOSSE-NEXT: shrl $2, %esi ; X86-NOSSE-NEXT: andl %ecx, %esi ; X86-NOSSE-NEXT: addl %eax, %esi -; X86-NOSSE-NEXT: movl %esi, %eax -; X86-NOSSE-NEXT: shrl $4, %eax -; X86-NOSSE-NEXT: addl %esi, %eax -; X86-NOSSE-NEXT: movl %edx, %esi -; X86-NOSSE-NEXT: shrl %esi -; X86-NOSSE-NEXT: andl %ebp, %esi -; X86-NOSSE-NEXT: subl %esi, %edx -; X86-NOSSE-NEXT: movl %edx, %esi -; X86-NOSSE-NEXT: andl %ecx, %esi +; X86-NOSSE-NEXT: movl %esi, %ebp +; X86-NOSSE-NEXT: shrl $4, %ebp +; X86-NOSSE-NEXT: addl %esi, %ebp +; X86-NOSSE-NEXT: movl %edx, %eax +; X86-NOSSE-NEXT: shrl %eax +; X86-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %esi, %eax +; X86-NOSSE-NEXT: subl %eax, %edx +; X86-NOSSE-NEXT: movl %edx, %eax +; X86-NOSSE-NEXT: andl %ecx, %eax ; X86-NOSSE-NEXT: shrl $2, %edx ; X86-NOSSE-NEXT: andl %ecx, %edx -; X86-NOSSE-NEXT: addl %esi, %edx -; X86-NOSSE-NEXT: movl %edx, %ecx -; X86-NOSSE-NEXT: shrl $4, %ecx -; X86-NOSSE-NEXT: addl %edx, %ecx +; X86-NOSSE-NEXT: addl %eax, %edx +; X86-NOSSE-NEXT: movl %edx, %eax +; X86-NOSSE-NEXT: shrl $4, %eax +; X86-NOSSE-NEXT: addl %edx, %eax +; X86-NOSSE-NEXT: andl %ebx, %ebp ; X86-NOSSE-NEXT: andl %ebx, %eax -; X86-NOSSE-NEXT: andl %ebx, %ecx -; X86-NOSSE-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %eax -; X86-NOSSE-NEXT: imull $16843009, %ecx, %ecx # imm = 0x1010101 +; X86-NOSSE-NEXT: imull $16843009, %ebp, %ecx # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %ecx -; X86-NOSSE-NEXT: addl %eax, %ecx +; X86-NOSSE-NEXT: imull $16843009, %eax, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edx +; X86-NOSSE-NEXT: addl %ecx, %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: addl %edi, %ecx -; X86-NOSSE-NEXT: xorl %edx, %edx -; X86-NOSSE-NEXT: movl %edx, 12(%eax) -; X86-NOSSE-NEXT: movl %edx, 8(%eax) -; X86-NOSSE-NEXT: movl %edx, 4(%eax) -; X86-NOSSE-NEXT: movl %ecx, (%eax) +; X86-NOSSE-NEXT: addl %edi, %edx +; X86-NOSSE-NEXT: xorl %ecx, %ecx +; X86-NOSSE-NEXT: movl %ecx, 12(%eax) +; X86-NOSSE-NEXT: movl %ecx, 8(%eax) +; X86-NOSSE-NEXT: movl %ecx, 4(%eax) +; X86-NOSSE-NEXT: movl %edx, (%eax) ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %edi ; X86-NOSSE-NEXT: popl %ebx @@ -950,42 +951,42 @@ ; X86-SSE2-LABEL: cnt128_optsize: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $1, %xmm1 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: psubb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = 
[51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE2-NEXT: pand %xmm1, %xmm3 -; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] ; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: paddb %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE2-NEXT: psrlw $4, %xmm3 -; X86-SSE2-NEXT: paddb %xmm0, %xmm3 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE2-NEXT: psubb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE2-NEXT: pand %xmm0, %xmm3 -; X86-SSE2-NEXT: pxor %xmm4, %xmm4 -; X86-SSE2-NEXT: psadbw %xmm4, %xmm3 -; X86-SSE2-NEXT: movd %xmm3, %ecx -; X86-SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE2-NEXT: psrlw $2, %xmm2 +; X86-SSE2-NEXT: pand %xmm0, %xmm2 +; X86-SSE2-NEXT: paddb %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: psrlw $4, %xmm4 +; X86-SSE2-NEXT: paddb %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pxor %xmm3, %xmm3 +; X86-SSE2-NEXT: psadbw %xmm3, %xmm4 +; X86-SSE2-NEXT: movd %xmm4, %ecx +; X86-SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 ; X86-SSE2-NEXT: psrlw $1, %xmm5 -; X86-SSE2-NEXT: pand %xmm2, %xmm5 -; X86-SSE2-NEXT: psubb %xmm5, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlw $2, %xmm3 -; X86-SSE2-NEXT: pand %xmm1, %xmm3 -; X86-SSE2-NEXT: paddb %xmm2, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: paddb %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm5 +; X86-SSE2-NEXT: psubb %xmm5, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: psadbw %xmm4, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %edx +; X86-SSE2-NEXT: psrlw $2, %xmm4 +; X86-SSE2-NEXT: pand %xmm0, %xmm4 +; X86-SSE2-NEXT: paddb %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: psrlw $4, %xmm0 +; X86-SSE2-NEXT: paddb %xmm4, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm3, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %edx ; X86-SSE2-NEXT: addl %ecx, %edx ; X86-SSE2-NEXT: xorl %ecx, %ecx ; X86-SSE2-NEXT: movl %ecx, 12(%eax) @@ -997,32 +998,32 @@ ; X86-SSSE3-LABEL: cnt128_optsize: ; X86-SSSE3: # %bb.0: ; X86-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X86-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X86-SSSE3-NEXT: movdqa %xmm1, %xmm2 -; X86-SSSE3-NEXT: pand %xmm0, %xmm2 -; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X86-SSSE3-NEXT: movdqa %xmm3, %xmm4 -; X86-SSSE3-NEXT: pshufb %xmm2, %xmm4 -; X86-SSSE3-NEXT: psrlw $4, %xmm1 -; X86-SSSE3-NEXT: pand %xmm0, %xmm1 -; X86-SSSE3-NEXT: movdqa %xmm3, %xmm2 -; X86-SSSE3-NEXT: pshufb %xmm1, %xmm2 -; X86-SSSE3-NEXT: paddb %xmm4, %xmm2 -; X86-SSSE3-NEXT: pxor %xmm1, %xmm1 -; X86-SSSE3-NEXT: psadbw %xmm1, %xmm2 -; X86-SSSE3-NEXT: movd %xmm2, %ecx +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-SSSE3-NEXT: movq {{.*#+}} xmm2 
= mem[0],zero -; X86-SSSE3-NEXT: movdqa %xmm2, %xmm4 -; X86-SSSE3-NEXT: pand %xmm0, %xmm4 -; X86-SSSE3-NEXT: movdqa %xmm3, %xmm5 -; X86-SSSE3-NEXT: pshufb %xmm4, %xmm5 +; X86-SSSE3-NEXT: movdqa %xmm2, %xmm3 +; X86-SSSE3-NEXT: pand %xmm1, %xmm3 +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSSE3-NEXT: movdqa %xmm0, %xmm4 +; X86-SSSE3-NEXT: pshufb %xmm3, %xmm4 ; X86-SSSE3-NEXT: psrlw $4, %xmm2 -; X86-SSSE3-NEXT: pand %xmm0, %xmm2 +; X86-SSSE3-NEXT: pand %xmm1, %xmm2 +; X86-SSSE3-NEXT: movdqa %xmm0, %xmm3 ; X86-SSSE3-NEXT: pshufb %xmm2, %xmm3 -; X86-SSSE3-NEXT: paddb %xmm5, %xmm3 -; X86-SSSE3-NEXT: psadbw %xmm1, %xmm3 -; X86-SSSE3-NEXT: movd %xmm3, %edx +; X86-SSSE3-NEXT: paddb %xmm4, %xmm3 +; X86-SSSE3-NEXT: pxor %xmm2, %xmm2 +; X86-SSSE3-NEXT: psadbw %xmm2, %xmm3 +; X86-SSSE3-NEXT: movd %xmm3, %ecx +; X86-SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; X86-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; X86-SSSE3-NEXT: pand %xmm1, %xmm4 +; X86-SSSE3-NEXT: movdqa %xmm0, %xmm5 +; X86-SSSE3-NEXT: pshufb %xmm4, %xmm5 +; X86-SSSE3-NEXT: psrlw $4, %xmm3 +; X86-SSSE3-NEXT: pand %xmm1, %xmm3 +; X86-SSSE3-NEXT: pshufb %xmm3, %xmm0 +; X86-SSSE3-NEXT: paddb %xmm5, %xmm0 +; X86-SSSE3-NEXT: psadbw %xmm2, %xmm0 +; X86-SSSE3-NEXT: movd %xmm0, %edx ; X86-SSSE3-NEXT: addl %ecx, %edx ; X86-SSSE3-NEXT: xorl %ecx, %ecx ; X86-SSSE3-NEXT: movl %ecx, 12(%eax) @@ -1096,41 +1097,41 @@ ; X86-NOSSE-NEXT: pushl %edi ; X86-NOSSE-NEXT: pushl %esi ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %ecx, %edx -; X86-NOSSE-NEXT: shrl %edx -; X86-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %esi, %edx -; X86-NOSSE-NEXT: subl %edx, %ecx -; X86-NOSSE-NEXT: movl $858993459, %edx # imm = 0x33333333 -; X86-NOSSE-NEXT: movl %ecx, %edi -; X86-NOSSE-NEXT: andl %edx, %edi -; X86-NOSSE-NEXT: shrl $2, %ecx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOSSE-NEXT: movl %esi, %ecx +; X86-NOSSE-NEXT: shrl %ecx +; X86-NOSSE-NEXT: movl $1431655765, %edx # imm = 0x55555555 ; X86-NOSSE-NEXT: andl %edx, %ecx -; X86-NOSSE-NEXT: addl %edi, %ecx -; X86-NOSSE-NEXT: movl %ecx, %edi -; X86-NOSSE-NEXT: shrl $4, %edi -; X86-NOSSE-NEXT: addl %ecx, %edi -; X86-NOSSE-NEXT: movl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: subl %ecx, %esi +; X86-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333 +; X86-NOSSE-NEXT: movl %esi, %edi ; X86-NOSSE-NEXT: andl %ecx, %edi -; X86-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edi +; X86-NOSSE-NEXT: shrl $2, %esi +; X86-NOSSE-NEXT: andl %ecx, %esi +; X86-NOSSE-NEXT: addl %edi, %esi +; X86-NOSSE-NEXT: movl %esi, %ebx +; X86-NOSSE-NEXT: shrl $4, %ebx +; X86-NOSSE-NEXT: addl %esi, %ebx +; X86-NOSSE-NEXT: movl $252645135, %edi # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: andl %edi, %ebx +; X86-NOSSE-NEXT: imull $16843009, %ebx, %esi # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %esi ; X86-NOSSE-NEXT: movl %eax, %ebx ; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl %esi, %ebx +; X86-NOSSE-NEXT: andl %edx, %ebx ; X86-NOSSE-NEXT: subl %ebx, %eax -; X86-NOSSE-NEXT: movl %eax, %esi -; X86-NOSSE-NEXT: andl %edx, %esi -; X86-NOSSE-NEXT: shrl $2, %eax -; X86-NOSSE-NEXT: andl %edx, %eax -; X86-NOSSE-NEXT: addl %esi, %eax ; X86-NOSSE-NEXT: movl %eax, %edx -; X86-NOSSE-NEXT: shrl $4, %edx -; X86-NOSSE-NEXT: addl %eax, %edx ; X86-NOSSE-NEXT: andl %ecx, %edx -; X86-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $2, %eax 
+; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: addl %edx, %eax +; X86-NOSSE-NEXT: movl %eax, %ecx +; X86-NOSSE-NEXT: shrl $4, %ecx +; X86-NOSSE-NEXT: addl %eax, %ecx +; X86-NOSSE-NEXT: andl %edi, %ecx +; X86-NOSSE-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %eax -; X86-NOSSE-NEXT: addl %edi, %eax +; X86-NOSSE-NEXT: addl %esi, %eax ; X86-NOSSE-NEXT: xorl %edx, %edx ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %edi @@ -1274,35 +1275,36 @@ ; X86-NOSSE-NEXT: shrl $2, %esi ; X86-NOSSE-NEXT: andl %ecx, %esi ; X86-NOSSE-NEXT: addl %eax, %esi -; X86-NOSSE-NEXT: movl %esi, %eax -; X86-NOSSE-NEXT: shrl $4, %eax -; X86-NOSSE-NEXT: addl %esi, %eax -; X86-NOSSE-NEXT: movl %edx, %esi -; X86-NOSSE-NEXT: shrl %esi -; X86-NOSSE-NEXT: andl %ebp, %esi -; X86-NOSSE-NEXT: subl %esi, %edx -; X86-NOSSE-NEXT: movl %edx, %esi -; X86-NOSSE-NEXT: andl %ecx, %esi +; X86-NOSSE-NEXT: movl %esi, %ebp +; X86-NOSSE-NEXT: shrl $4, %ebp +; X86-NOSSE-NEXT: addl %esi, %ebp +; X86-NOSSE-NEXT: movl %edx, %eax +; X86-NOSSE-NEXT: shrl %eax +; X86-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %esi, %eax +; X86-NOSSE-NEXT: subl %eax, %edx +; X86-NOSSE-NEXT: movl %edx, %eax +; X86-NOSSE-NEXT: andl %ecx, %eax ; X86-NOSSE-NEXT: shrl $2, %edx ; X86-NOSSE-NEXT: andl %ecx, %edx -; X86-NOSSE-NEXT: addl %esi, %edx -; X86-NOSSE-NEXT: movl %edx, %ecx -; X86-NOSSE-NEXT: shrl $4, %ecx -; X86-NOSSE-NEXT: addl %edx, %ecx +; X86-NOSSE-NEXT: addl %eax, %edx +; X86-NOSSE-NEXT: movl %edx, %eax +; X86-NOSSE-NEXT: shrl $4, %eax +; X86-NOSSE-NEXT: addl %edx, %eax +; X86-NOSSE-NEXT: andl %ebx, %ebp ; X86-NOSSE-NEXT: andl %ebx, %eax -; X86-NOSSE-NEXT: andl %ebx, %ecx -; X86-NOSSE-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %eax -; X86-NOSSE-NEXT: imull $16843009, %ecx, %ecx # imm = 0x1010101 +; X86-NOSSE-NEXT: imull $16843009, %ebp, %ecx # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %ecx -; X86-NOSSE-NEXT: addl %eax, %ecx +; X86-NOSSE-NEXT: imull $16843009, %eax, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edx +; X86-NOSSE-NEXT: addl %ecx, %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: addl %edi, %ecx -; X86-NOSSE-NEXT: xorl %edx, %edx -; X86-NOSSE-NEXT: movl %edx, 12(%eax) -; X86-NOSSE-NEXT: movl %edx, 8(%eax) -; X86-NOSSE-NEXT: movl %edx, 4(%eax) -; X86-NOSSE-NEXT: movl %ecx, (%eax) +; X86-NOSSE-NEXT: addl %edi, %edx +; X86-NOSSE-NEXT: xorl %ecx, %ecx +; X86-NOSSE-NEXT: movl %ecx, 12(%eax) +; X86-NOSSE-NEXT: movl %ecx, 8(%eax) +; X86-NOSSE-NEXT: movl %ecx, 4(%eax) +; X86-NOSSE-NEXT: movl %edx, (%eax) ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %edi ; X86-NOSSE-NEXT: popl %ebx @@ -1379,42 +1381,42 @@ ; X86-SSE2-LABEL: cnt128_pgso: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $1, %xmm1 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: psubb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE2-NEXT: pand %xmm1, %xmm3 -; X86-SSE2-NEXT: psrlw $2, %xmm0 +; X86-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $1, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] ; X86-SSE2-NEXT: 
pand %xmm1, %xmm0 -; X86-SSE2-NEXT: paddb %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE2-NEXT: psrlw $4, %xmm3 -; X86-SSE2-NEXT: paddb %xmm0, %xmm3 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE2-NEXT: psubb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE2-NEXT: pand %xmm0, %xmm3 -; X86-SSE2-NEXT: pxor %xmm4, %xmm4 -; X86-SSE2-NEXT: psadbw %xmm4, %xmm3 -; X86-SSE2-NEXT: movd %xmm3, %ecx -; X86-SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE2-NEXT: psrlw $2, %xmm2 +; X86-SSE2-NEXT: pand %xmm0, %xmm2 +; X86-SSE2-NEXT: paddb %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: psrlw $4, %xmm4 +; X86-SSE2-NEXT: paddb %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pxor %xmm3, %xmm3 +; X86-SSE2-NEXT: psadbw %xmm3, %xmm4 +; X86-SSE2-NEXT: movd %xmm4, %ecx +; X86-SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 ; X86-SSE2-NEXT: psrlw $1, %xmm5 -; X86-SSE2-NEXT: pand %xmm2, %xmm5 -; X86-SSE2-NEXT: psubb %xmm5, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlw $2, %xmm3 -; X86-SSE2-NEXT: pand %xmm1, %xmm3 -; X86-SSE2-NEXT: paddb %xmm2, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: paddb %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm5 +; X86-SSE2-NEXT: psubb %xmm5, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: psadbw %xmm4, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %edx +; X86-SSE2-NEXT: psrlw $2, %xmm4 +; X86-SSE2-NEXT: pand %xmm0, %xmm4 +; X86-SSE2-NEXT: paddb %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: psrlw $4, %xmm0 +; X86-SSE2-NEXT: paddb %xmm4, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: psadbw %xmm3, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %edx ; X86-SSE2-NEXT: addl %ecx, %edx ; X86-SSE2-NEXT: xorl %ecx, %ecx ; X86-SSE2-NEXT: movl %ecx, 12(%eax) @@ -1426,32 +1428,32 @@ ; X86-SSSE3-LABEL: cnt128_pgso: ; X86-SSSE3: # %bb.0: ; X86-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X86-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X86-SSSE3-NEXT: movdqa %xmm1, %xmm2 -; X86-SSSE3-NEXT: pand %xmm0, %xmm2 -; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X86-SSSE3-NEXT: movdqa %xmm3, %xmm4 -; X86-SSSE3-NEXT: pshufb %xmm2, %xmm4 -; X86-SSSE3-NEXT: psrlw $4, %xmm1 -; X86-SSSE3-NEXT: pand %xmm0, %xmm1 -; X86-SSSE3-NEXT: movdqa %xmm3, %xmm2 -; X86-SSSE3-NEXT: pshufb %xmm1, %xmm2 -; X86-SSSE3-NEXT: paddb %xmm4, %xmm2 -; X86-SSSE3-NEXT: pxor %xmm1, %xmm1 -; X86-SSSE3-NEXT: psadbw %xmm1, %xmm2 -; X86-SSSE3-NEXT: movd %xmm2, %ecx +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; X86-SSSE3-NEXT: movdqa %xmm2, %xmm4 -; X86-SSSE3-NEXT: pand %xmm0, %xmm4 -; X86-SSSE3-NEXT: movdqa %xmm3, %xmm5 -; X86-SSSE3-NEXT: pshufb %xmm4, %xmm5 +; X86-SSSE3-NEXT: movdqa %xmm2, %xmm3 +; X86-SSSE3-NEXT: pand %xmm1, %xmm3 +; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X86-SSSE3-NEXT: movdqa %xmm0, %xmm4 +; X86-SSSE3-NEXT: pshufb 
%xmm3, %xmm4 ; X86-SSSE3-NEXT: psrlw $4, %xmm2 -; X86-SSSE3-NEXT: pand %xmm0, %xmm2 +; X86-SSSE3-NEXT: pand %xmm1, %xmm2 +; X86-SSSE3-NEXT: movdqa %xmm0, %xmm3 ; X86-SSSE3-NEXT: pshufb %xmm2, %xmm3 -; X86-SSSE3-NEXT: paddb %xmm5, %xmm3 -; X86-SSSE3-NEXT: psadbw %xmm1, %xmm3 -; X86-SSSE3-NEXT: movd %xmm3, %edx +; X86-SSSE3-NEXT: paddb %xmm4, %xmm3 +; X86-SSSE3-NEXT: pxor %xmm2, %xmm2 +; X86-SSSE3-NEXT: psadbw %xmm2, %xmm3 +; X86-SSSE3-NEXT: movd %xmm3, %ecx +; X86-SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; X86-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; X86-SSSE3-NEXT: pand %xmm1, %xmm4 +; X86-SSSE3-NEXT: movdqa %xmm0, %xmm5 +; X86-SSSE3-NEXT: pshufb %xmm4, %xmm5 +; X86-SSSE3-NEXT: psrlw $4, %xmm3 +; X86-SSSE3-NEXT: pand %xmm1, %xmm3 +; X86-SSSE3-NEXT: pshufb %xmm3, %xmm0 +; X86-SSSE3-NEXT: paddb %xmm5, %xmm0 +; X86-SSSE3-NEXT: psadbw %xmm2, %xmm0 +; X86-SSSE3-NEXT: movd %xmm0, %edx ; X86-SSSE3-NEXT: addl %ecx, %edx ; X86-SSSE3-NEXT: xorl %ecx, %ecx ; X86-SSSE3-NEXT: movl %ecx, 12(%eax) Index: llvm/test/CodeGen/X86/pr31088.ll =================================================================== --- llvm/test/CodeGen/X86/pr31088.ll +++ llvm/test/CodeGen/X86/pr31088.ll @@ -132,18 +132,18 @@ ; X64-NEXT: pushq %r14 ; X64-NEXT: pushq %rbx ; X64-NEXT: subq $32, %rsp -; X64-NEXT: movl %edx, %ebx -; X64-NEXT: movl %esi, %ebp +; X64-NEXT: movl %edx, %ebp +; X64-NEXT: movl %esi, %ebx ; X64-NEXT: movl %edi, %r14d ; X64-NEXT: movzwl %cx, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; X64-NEXT: movzwl %bp, %edi +; X64-NEXT: movzwl %bx, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; X64-NEXT: callq __gnu_f2h_ieee@PLT ; X64-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; X64-NEXT: movzwl %bx, %edi +; X64-NEXT: movzwl %bp, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: movzwl %r14w, %edi Index: llvm/test/CodeGen/X86/pr32284.ll =================================================================== --- llvm/test/CodeGen/X86/pr32284.ll +++ llvm/test/CodeGen/X86/pr32284.ll @@ -213,41 +213,46 @@ ; ; X86-LABEL: f1: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi +; X86-NEXT: pushl %ebx ; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 12 ; X86-NEXT: subl $1, %esp -; X86-NEXT: .cfi_def_cfa_offset 9 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl var_5, %edx -; X86-NEXT: movl %edx, %eax -; X86-NEXT: xorl $208307499, %eax # imm = 0xC6A852B -; X86-NEXT: movl %edx, %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: xorl $-2, %ecx -; X86-NEXT: orl %eax, %ecx +; X86-NEXT: .cfi_def_cfa_offset 13 +; X86-NEXT: .cfi_offset %esi, -12 +; X86-NEXT: .cfi_offset %ebx, -8 +; X86-NEXT: movl var_5, %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: xorl $208307499, %edx # imm = 0xC6A852B +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: xorl $-2, %esi +; X86-NEXT: orl %edx, %esi ; X86-NEXT: setne (%esp) -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: andl %esi, %ecx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $-1, %ecx -; X86-NEXT: sete %al -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpl $-1, %edx -; X86-NEXT: sete %cl -; X86-NEXT: addl $7093, %edx # imm = 0x1BB5 -; X86-NEXT: adcl $0, %esi -; X86-NEXT: cmpl %ecx, %edx -; X86-NEXT: sbbl $0, %esi -; X86-NEXT: setl %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movl %ecx, var_57 +; X86-NEXT: 
movl %eax, %esi +; X86-NEXT: andl %ecx, %esi +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl $-1, %esi +; X86-NEXT: sete %dl +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: sete %bl +; X86-NEXT: addl $7093, %eax # imm = 0x1BB5 +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: cmpl %ebx, %eax +; X86-NEXT: sbbl $0, %ecx +; X86-NEXT: setl %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl %eax, var_57 ; X86-NEXT: movl $0, var_57+4 -; X86-NEXT: movl %eax, _ZN8struct_210member_2_0E +; X86-NEXT: movl %edx, _ZN8struct_210member_2_0E ; X86-NEXT: movl $0, _ZN8struct_210member_2_0E+4 ; X86-NEXT: addl $1, %esp -; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_def_cfa_offset 12 ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: popl %ebx ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl entry: Index: llvm/test/CodeGen/X86/pr32329.ll =================================================================== --- llvm/test/CodeGen/X86/pr32329.ll +++ llvm/test/CodeGen/X86/pr32329.ll @@ -30,29 +30,29 @@ ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movsbl var_27, %eax -; X86-NEXT: movzwl var_2, %esi +; X86-NEXT: movzwl var_2, %ebx ; X86-NEXT: movl var_310, %ecx ; X86-NEXT: imull %eax, %ecx ; X86-NEXT: addl var_24, %ecx -; X86-NEXT: movl $4194303, %edi # imm = 0x3FFFFF -; X86-NEXT: andl obj, %edi -; X86-NEXT: leal (%edi,%edi), %edx +; X86-NEXT: movl $4194303, %esi # imm = 0x3FFFFF +; X86-NEXT: andl obj, %esi +; X86-NEXT: leal (%esi,%esi), %edx ; X86-NEXT: subl %eax, %edx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: subl %esi, %ebx -; X86-NEXT: imull %ebx, %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: subl %ebx, %edi +; X86-NEXT: imull %edi, %ecx ; X86-NEXT: addb $113, %cl -; X86-NEXT: movl $9, %esi +; X86-NEXT: movl $9, %ebx ; X86-NEXT: xorl %ebp, %ebp -; X86-NEXT: shldl %cl, %esi, %ebp -; X86-NEXT: shll %cl, %esi +; X86-NEXT: shldl %cl, %ebx, %ebp +; X86-NEXT: shll %cl, %ebx ; X86-NEXT: testb $32, %cl -; X86-NEXT: cmovnel %esi, %ebp +; X86-NEXT: cmovnel %ebx, %ebp ; X86-NEXT: movl $0, %ecx -; X86-NEXT: cmovnel %ecx, %esi -; X86-NEXT: cmpl %edi, %ebx +; X86-NEXT: cmovnel %ecx, %ebx +; X86-NEXT: cmpl %esi, %edi ; X86-NEXT: movl %ebp, var_50+4 -; X86-NEXT: movl %esi, var_50 +; X86-NEXT: movl %ebx, var_50 ; X86-NEXT: setge var_205 ; X86-NEXT: imull %eax, %edx ; X86-NEXT: movb %dl, var_218 Index: llvm/test/CodeGen/X86/pr32610.ll =================================================================== --- llvm/test/CodeGen/X86/pr32610.ll +++ llvm/test/CodeGen/X86/pr32610.ll @@ -13,20 +13,20 @@ ; CHECK-NEXT: pushl %ebp ; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: pushl %esi -; CHECK-NEXT: movl 8(%ebp), %ecx -; CHECK-NEXT: movl L_b$non_lazy_ptr, %edx -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpl (%edx), %ecx -; CHECK-NEXT: sete %al +; CHECK-NEXT: movl 8(%ebp), %edx +; CHECK-NEXT: movl L_b$non_lazy_ptr, %eax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: cmpl (%eax), %edx +; CHECK-NEXT: sete %cl ; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: incl %esi ; CHECK-NEXT: cmpl $0, 12(%ebp) -; CHECK-NEXT: cmovel %esi, %eax -; CHECK-NEXT: cmpl (%edx), %ecx -; CHECK-NEXT: cmovnel %esi, %eax -; CHECK-NEXT: movl L_c$non_lazy_ptr, %ecx -; CHECK-NEXT: movl %eax, (%ecx) -; CHECK-NEXT: movl (%edx), %eax +; CHECK-NEXT: cmovel %esi, %ecx +; CHECK-NEXT: cmpl (%eax), %edx +; CHECK-NEXT: cmovnel %esi, %ecx +; CHECK-NEXT: movl L_c$non_lazy_ptr, %edx +; CHECK-NEXT: movl %ecx, (%edx) +; CHECK-NEXT: movl (%eax), %eax ; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: movl $2, %ecx ; 
CHECK-NEXT: cmovnel %eax, %ecx Index: llvm/test/CodeGen/X86/pr34080-2.ll =================================================================== --- llvm/test/CodeGen/X86/pr34080-2.ll +++ llvm/test/CodeGen/X86/pr34080-2.ll @@ -58,26 +58,26 @@ ; CHECK-NEXT: fistpll {{[0-9]+}}(%esp) ; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) ; CHECK-NEXT: movb $1, 36(%ebx) -; CHECK-NEXT: imull $3600000, 20(%ebx), %eax # imm = 0x36EE80 -; CHECK-NEXT: imull $60000, 24(%ebx), %ecx # imm = 0xEA60 -; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: imull $3600000, 20(%ebx), %ecx # imm = 0x36EE80 +; CHECK-NEXT: imull $60000, 24(%ebx), %eax # imm = 0xEA60 +; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: fldl 28(%ebx) ; CHECK-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}} ; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: orl $3072, %eax # imm = 0xC00 -; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: sarl $31, %eax +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: orl $3072, %ecx # imm = 0xC00 +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: sarl $31, %ecx ; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) ; CHECK-NEXT: fistpll {{[0-9]+}}(%esp) ; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) -; CHECK-NEXT: addl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: adcl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: adcl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %ecx, (%ebx) -; CHECK-NEXT: movl %eax, 4(%ebx) +; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: adcl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: adcl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %eax, (%ebx) +; CHECK-NEXT: movl %ecx, 4(%ebx) ; CHECK-NEXT: leal -12(%ebp), %esp ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %edi Index: llvm/test/CodeGen/X86/pr46527.ll =================================================================== --- llvm/test/CodeGen/X86/pr46527.ll +++ llvm/test/CodeGen/X86/pr46527.ll @@ -7,11 +7,11 @@ ; CHECK-NEXT: calll .L0$pb ; CHECK-NEXT: .cfi_adjust_cfa_offset 4 ; CHECK-NEXT: .L0$pb: -; CHECK-NEXT: popl %eax +; CHECK-NEXT: popl %ecx ; CHECK-NEXT: .cfi_adjust_cfa_offset -4 ; CHECK-NEXT: .Ltmp0: -; CHECK-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movb {{[0-9]+}}(%esp), %dl ; CHECK-NEXT: notb %dl ; CHECK-NEXT: andb $1, %dl @@ -22,8 +22,8 @@ ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; CHECK-NEXT: paddb %xmm1, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}@GOTOFF(%eax), %xmm1 -; CHECK-NEXT: movdqa %xmm1, (%ecx) +; CHECK-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}@GOTOFF(%ecx), %xmm1 +; CHECK-NEXT: movdqa %xmm1, (%eax) ; CHECK-NEXT: retl entry: %0 = select i1 %flag, i8 0, i8 2 Index: llvm/test/CodeGen/X86/sadd_sat.ll =================================================================== --- llvm/test/CodeGen/X86/sadd_sat.ll +++ llvm/test/CodeGen/X86/sadd_sat.ll @@ -43,23 +43,21 @@ ; X86-LABEL: func2: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi +; X86-NEXT: addl {{[0-9]+}}(%esp), %edx +; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: seto %bl -; X86-NEXT: movl %esi, %eax +; 
X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $31, %eax ; X86-NEXT: testb %bl, %bl -; X86-NEXT: cmovel %ecx, %eax +; X86-NEXT: cmovel %edx, %eax ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: testl %esi, %esi +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setns %dl ; X86-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF ; X86-NEXT: testb %bl, %bl -; X86-NEXT: cmovel %esi, %edx -; X86-NEXT: popl %esi +; X86-NEXT: cmovel %ecx, %edx ; X86-NEXT: popl %ebx ; X86-NEXT: retl ; Index: llvm/test/CodeGen/X86/sadd_sat_plus.ll =================================================================== --- llvm/test/CodeGen/X86/sadd_sat_plus.ll +++ llvm/test/CodeGen/X86/sadd_sat_plus.ll @@ -45,23 +45,21 @@ ; X86-LABEL: func64: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi +; X86-NEXT: addl {{[0-9]+}}(%esp), %edx +; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: seto %bl -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $31, %eax ; X86-NEXT: testb %bl, %bl -; X86-NEXT: cmovel %ecx, %eax +; X86-NEXT: cmovel %edx, %eax ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: testl %esi, %esi +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setns %dl ; X86-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF ; X86-NEXT: testb %bl, %bl -; X86-NEXT: cmovel %esi, %edx -; X86-NEXT: popl %esi +; X86-NEXT: cmovel %ecx, %edx ; X86-NEXT: popl %ebx ; X86-NEXT: retl ; Index: llvm/test/CodeGen/X86/scheduler-backtracking.ll =================================================================== --- llvm/test/CodeGen/X86/scheduler-backtracking.ll +++ llvm/test/CodeGen/X86/scheduler-backtracking.ll @@ -76,14 +76,14 @@ ; HYBRID-NEXT: movl %r10d, %ecx ; HYBRID-NEXT: shldq %cl, %rdi, %r11 ; HYBRID-NEXT: addb $-125, %sil -; HYBRID-NEXT: xorl %edx, %edx +; HYBRID-NEXT: xorl %ebx, %ebx ; HYBRID-NEXT: movl %esi, %ecx -; HYBRID-NEXT: shldq %cl, %rdi, %rdx -; HYBRID-NEXT: movl $1, %ebx -; HYBRID-NEXT: shlq %cl, %rbx +; HYBRID-NEXT: shldq %cl, %rdi, %rbx +; HYBRID-NEXT: movl $1, %edx +; HYBRID-NEXT: shlq %cl, %rdx ; HYBRID-NEXT: testb $64, %sil -; HYBRID-NEXT: cmovneq %rbx, %rdx -; HYBRID-NEXT: cmovneq %r8, %rbx +; HYBRID-NEXT: cmovneq %rdx, %rbx +; HYBRID-NEXT: cmovneq %r8, %rdx ; HYBRID-NEXT: movl %r10d, %ecx ; HYBRID-NEXT: shlq %cl, %rdi ; HYBRID-NEXT: testb $64, %r10b @@ -94,12 +94,12 @@ ; HYBRID-NEXT: movq %r11, 8(%rax) ; HYBRID-NEXT: cmovsq %r8, %rdi ; HYBRID-NEXT: movq %rdi, (%rax) -; HYBRID-NEXT: cmovnsq %r8, %rdx -; HYBRID-NEXT: cmoveq %r8, %rdx -; HYBRID-NEXT: movq %rdx, 24(%rax) -; HYBRID-NEXT: cmovnsq %r9, %rbx +; HYBRID-NEXT: cmovnsq %r8, %rbx ; HYBRID-NEXT: cmoveq %r8, %rbx -; HYBRID-NEXT: movq %rbx, 16(%rax) +; HYBRID-NEXT: movq %rbx, 24(%rax) +; HYBRID-NEXT: cmovnsq %r9, %rdx +; HYBRID-NEXT: cmoveq %r8, %rdx +; HYBRID-NEXT: movq %rdx, 16(%rax) ; HYBRID-NEXT: popq %rbx ; HYBRID-NEXT: retq ; @@ -121,14 +121,14 @@ ; BURR-NEXT: movl %r10d, %ecx ; BURR-NEXT: shldq %cl, %rdi, %r11 ; BURR-NEXT: addb $-125, %sil -; BURR-NEXT: xorl %edx, %edx +; BURR-NEXT: xorl %ebx, %ebx ; BURR-NEXT: movl %esi, %ecx -; BURR-NEXT: shldq %cl, %rdi, %rdx -; BURR-NEXT: movl $1, %ebx -; BURR-NEXT: shlq %cl, %rbx +; BURR-NEXT: shldq %cl, %rdi, %rbx +; BURR-NEXT: movl $1, %edx +; BURR-NEXT: shlq %cl, %rdx ; BURR-NEXT: testb $64, %sil -; BURR-NEXT: cmovneq %rbx, %rdx -; BURR-NEXT: cmovneq %r8, %rbx +; BURR-NEXT: cmovneq %rdx, %rbx +; BURR-NEXT: cmovneq %r8, 
%rdx ; BURR-NEXT: movl %r10d, %ecx ; BURR-NEXT: shlq %cl, %rdi ; BURR-NEXT: testb $64, %r10b @@ -139,12 +139,12 @@ ; BURR-NEXT: movq %r11, 8(%rax) ; BURR-NEXT: cmovsq %r8, %rdi ; BURR-NEXT: movq %rdi, (%rax) -; BURR-NEXT: cmovnsq %r8, %rdx -; BURR-NEXT: cmoveq %r8, %rdx -; BURR-NEXT: movq %rdx, 24(%rax) -; BURR-NEXT: cmovnsq %r9, %rbx +; BURR-NEXT: cmovnsq %r8, %rbx ; BURR-NEXT: cmoveq %r8, %rbx -; BURR-NEXT: movq %rbx, 16(%rax) +; BURR-NEXT: movq %rbx, 24(%rax) +; BURR-NEXT: cmovnsq %r9, %rdx +; BURR-NEXT: cmoveq %r8, %rdx +; BURR-NEXT: movq %rdx, 16(%rax) ; BURR-NEXT: popq %rbx ; BURR-NEXT: retq ; Index: llvm/test/CodeGen/X86/sdiv_fix.ll =================================================================== --- llvm/test/CodeGen/X86/sdiv_fix.ll +++ llvm/test/CodeGen/X86/sdiv_fix.ll @@ -95,14 +95,14 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movsbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $14, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cltd -; X86-NEXT: idivl %esi -; X86-NEXT: leal -1(%eax), %edi -; X86-NEXT: testl %esi, %esi +; X86-NEXT: idivl %edi +; X86-NEXT: leal -1(%eax), %esi +; X86-NEXT: testl %edi, %edi ; X86-NEXT: sets %bl ; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sets %cl @@ -110,9 +110,9 @@ ; X86-NEXT: testl %edx, %edx ; X86-NEXT: setne %dl ; X86-NEXT: testb %cl, %dl -; X86-NEXT: cmovel %eax, %edi -; X86-NEXT: addl %edi, %edi -; X86-NEXT: movswl %di, %eax +; X86-NEXT: cmovel %eax, %esi +; X86-NEXT: addl %esi, %esi +; X86-NEXT: movswl %si, %eax ; X86-NEXT: shrl %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi @@ -222,31 +222,33 @@ ; ; X86-LABEL: func4: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: shlb $4, %cl +; X86-NEXT: sarb $4, %cl ; X86-NEXT: movb {{[0-9]+}}(%esp), %dl ; X86-NEXT: shlb $4, %dl ; X86-NEXT: sarb $4, %dl -; X86-NEXT: movb {{[0-9]+}}(%esp), %dh -; X86-NEXT: shlb $4, %dh -; X86-NEXT: sarb $4, %dh -; X86-NEXT: shlb $2, %dh -; X86-NEXT: movsbl %dh, %eax -; X86-NEXT: idivb %dl -; X86-NEXT: movsbl %ah, %ecx +; X86-NEXT: shlb $2, %dl +; X86-NEXT: movsbl %dl, %eax +; X86-NEXT: idivb %cl +; X86-NEXT: movsbl %ah, %ebx ; X86-NEXT: movzbl %al, %esi ; X86-NEXT: decb %al ; X86-NEXT: movzbl %al, %eax +; X86-NEXT: testb %cl, %cl +; X86-NEXT: sets %cl ; X86-NEXT: testb %dl, %dl ; X86-NEXT: sets %dl -; X86-NEXT: testb %dh, %dh -; X86-NEXT: sets %dh -; X86-NEXT: xorb %dl, %dh -; X86-NEXT: testb %cl, %cl +; X86-NEXT: xorb %cl, %dl +; X86-NEXT: testb %bl, %bl ; X86-NEXT: setne %cl -; X86-NEXT: testb %dh, %cl +; X86-NEXT: testb %dl, %cl ; X86-NEXT: cmovel %esi, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: popl %esi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %tmp = call i4 @llvm.sdiv.fix.i4(i4 %x, i4 %y, i32 2) ret i4 %tmp @@ -617,26 +619,26 @@ ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: sarl $31, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: shll $31, %ebp ; X86-NEXT: movl %ecx, %esi -; X86-NEXT: shrl $31, %esi -; X86-NEXT: shldl $31, %ecx, %esi +; X86-NEXT: shll $31, %esi +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: shrl $31, %ebp +; X86-NEXT: shldl $31, %ecx, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %eax -; X86-NEXT: pushl %esi ; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %esi ; X86-NEXT: calll __moddi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %esi ; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %esi ; X86-NEXT: calll __divdi3 ; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %esi, %esi +; X86-NEXT: testl %ebp, %ebp ; X86-NEXT: sets %cl ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: sets %dl Index: llvm/test/CodeGen/X86/sdiv_fix_sat.ll =================================================================== --- llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -45,14 +45,14 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movswl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movswl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $8, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cltd -; X86-NEXT: idivl %esi -; X86-NEXT: leal -1(%eax), %edi -; X86-NEXT: testl %esi, %esi +; X86-NEXT: idivl %edi +; X86-NEXT: leal -1(%eax), %esi +; X86-NEXT: testl %edi, %edi ; X86-NEXT: sets %bl ; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sets %cl @@ -60,10 +60,10 @@ ; X86-NEXT: testl %edx, %edx ; X86-NEXT: setne %dl ; X86-NEXT: testb %cl, %dl -; X86-NEXT: cmovel %eax, %edi -; X86-NEXT: cmpl $65535, %edi # imm = 0xFFFF +; X86-NEXT: cmovel %eax, %esi +; X86-NEXT: cmpl $65535, %esi # imm = 0xFFFF ; X86-NEXT: movl $65535, %ecx # imm = 0xFFFF -; X86-NEXT: cmovll %edi, %ecx +; X86-NEXT: cmovll %esi, %ecx ; X86-NEXT: cmpl $-65535, %ecx # imm = 0xFFFF0001 ; X86-NEXT: movl $-65536, %eax # imm = 0xFFFF0000 ; X86-NEXT: cmovgel %ecx, %eax @@ -114,14 +114,14 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movsbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $14, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cltd -; X86-NEXT: idivl %esi -; X86-NEXT: leal -1(%eax), %edi -; X86-NEXT: testl %esi, %esi +; X86-NEXT: idivl %edi +; X86-NEXT: leal -1(%eax), %esi +; X86-NEXT: testl %edi, %edi ; X86-NEXT: sets %bl ; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sets %cl @@ -129,10 +129,10 @@ ; X86-NEXT: testl %edx, %edx ; X86-NEXT: setne %dl ; X86-NEXT: testb %cl, %dl -; X86-NEXT: cmovel %eax, %edi -; X86-NEXT: cmpl $16383, %edi # imm = 0x3FFF +; X86-NEXT: cmovel %eax, %esi +; X86-NEXT: cmpl $16383, %esi # imm = 0x3FFF ; X86-NEXT: movl $16383, %ecx # imm = 0x3FFF -; X86-NEXT: cmovll %edi, %ecx +; X86-NEXT: cmovll %esi, %ecx ; X86-NEXT: cmpl $-16383, %ecx # imm = 0xC001 ; X86-NEXT: movl $-16384, %eax # imm = 0xC000 ; X86-NEXT: cmovgel %ecx, %eax @@ -188,27 +188,27 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shll $8, %eax -; X86-NEXT: movswl %ax, %esi +; X86-NEXT: movswl %ax, %edi ; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: shrl $4, %esi +; X86-NEXT: shrl $4, %edi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cwtd -; X86-NEXT: idivw %si +; X86-NEXT: idivw %di ; X86-NEXT: # kill: def $ax killed $ax def $eax -; X86-NEXT: leal -1(%eax), %edi +; X86-NEXT: leal -1(%eax), %esi ; X86-NEXT: testw %cx, %cx ; X86-NEXT: sets %cl -; X86-NEXT: testw %si, %si +; X86-NEXT: testw %di, %di ; X86-NEXT: sets %ch ; X86-NEXT: xorb %cl, %ch ; X86-NEXT: testw %dx, %dx ; X86-NEXT: setne %cl ; X86-NEXT: testb %ch, %cl -; X86-NEXT: cmovel %eax, %edi -; X86-NEXT: movswl %di, %eax +; X86-NEXT: cmovel %eax, %esi +; X86-NEXT: movswl %si, %eax ; X86-NEXT: cmpl $16383, %eax # imm = 0x3FFF ; X86-NEXT: movl $16383, %ecx # imm = 0x3FFF -; X86-NEXT: cmovll %edi, %ecx +; X86-NEXT: 
cmovll %esi, %ecx ; X86-NEXT: movswl %cx, %eax ; X86-NEXT: cmpl $-16383, %eax # imm = 0xC001 ; X86-NEXT: movl $49152, %eax # imm = 0xC000 @@ -262,28 +262,29 @@ ; ; X86-LABEL: func4: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: shlb $4, %cl +; X86-NEXT: sarb $4, %cl ; X86-NEXT: movb {{[0-9]+}}(%esp), %dl ; X86-NEXT: shlb $4, %dl ; X86-NEXT: sarb $4, %dl -; X86-NEXT: movb {{[0-9]+}}(%esp), %dh -; X86-NEXT: shlb $4, %dh -; X86-NEXT: sarb $4, %dh -; X86-NEXT: shlb $2, %dh -; X86-NEXT: movsbl %dh, %eax -; X86-NEXT: idivb %dl -; X86-NEXT: movsbl %ah, %ecx +; X86-NEXT: shlb $2, %dl +; X86-NEXT: movsbl %dl, %eax +; X86-NEXT: idivb %cl +; X86-NEXT: movsbl %ah, %ebx ; X86-NEXT: movzbl %al, %esi ; X86-NEXT: decb %al ; X86-NEXT: movzbl %al, %eax +; X86-NEXT: testb %cl, %cl +; X86-NEXT: sets %cl ; X86-NEXT: testb %dl, %dl ; X86-NEXT: sets %dl -; X86-NEXT: testb %dh, %dh -; X86-NEXT: sets %dh -; X86-NEXT: xorb %dl, %dh -; X86-NEXT: testb %cl, %cl +; X86-NEXT: xorb %cl, %dl +; X86-NEXT: testb %bl, %bl ; X86-NEXT: setne %cl -; X86-NEXT: testb %dh, %cl +; X86-NEXT: testb %dl, %cl ; X86-NEXT: cmovel %esi, %eax ; X86-NEXT: cmpb $7, %al ; X86-NEXT: movl $7, %ecx @@ -293,6 +294,7 @@ ; X86-NEXT: cmovgel %ecx, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: popl %esi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %tmp = call i4 @llvm.sdiv.fix.sat.i4(i4 %x, i4 %y, i32 2) ret i4 %tmp @@ -533,14 +535,14 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movswl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movswl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $7, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cltd -; X86-NEXT: idivl %esi -; X86-NEXT: leal -1(%eax), %edi -; X86-NEXT: testl %esi, %esi +; X86-NEXT: idivl %edi +; X86-NEXT: leal -1(%eax), %esi +; X86-NEXT: testl %edi, %edi ; X86-NEXT: sets %bl ; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sets %cl @@ -548,10 +550,10 @@ ; X86-NEXT: testl %edx, %edx ; X86-NEXT: setne %dl ; X86-NEXT: testb %cl, %dl -; X86-NEXT: cmovel %eax, %edi -; X86-NEXT: cmpl $131071, %edi # imm = 0x1FFFF +; X86-NEXT: cmovel %eax, %esi +; X86-NEXT: cmpl $131071, %esi # imm = 0x1FFFF ; X86-NEXT: movl $131071, %ecx # imm = 0x1FFFF -; X86-NEXT: cmovll %edi, %ecx +; X86-NEXT: cmovll %esi, %ecx ; X86-NEXT: cmpl $-131071, %ecx # imm = 0xFFFE0001 ; X86-NEXT: movl $-131072, %eax # imm = 0xFFFE0000 ; X86-NEXT: cmovgel %ecx, %eax @@ -983,10 +985,9 @@ ; X86-NEXT: pushl %ecx ; X86-NEXT: pushl %ecx ; X86-NEXT: pushl 36(%ebp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl %eax @@ -997,30 +998,30 @@ ; X86-NEXT: subl $1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %eax ; X86-NEXT: 
sbbl $0, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: testl %edi, %edi -; X86-NEXT: sets %al +; X86-NEXT: sets %cl ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: sets %ah -; X86-NEXT: xorb %al, %ah +; X86-NEXT: sets %ch +; X86-NEXT: xorb %cl, %ch ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: orl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: orl {{[0-9]+}}(%esp), %edx ; X86-NEXT: orl %edi, %edx -; X86-NEXT: setne %al -; X86-NEXT: testb %ah, %al -; X86-NEXT: cmovel %esi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: setne %cl +; X86-NEXT: testb %ch, %cl +; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -1034,15 +1035,15 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: subl $1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %ecx ; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: testl %ebx, %ebx @@ -1050,23 +1051,23 @@ ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: sets %bh ; X86-NEXT: xorb %bl, %bh -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: orl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: orl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl %esi, %eax ; X86-NEXT: setne %al ; X86-NEXT: testb %bh, %al -; X86-NEXT: cmovel %edx, %ecx +; X86-NEXT: cmovel %edi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: cmovel %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill Index: llvm/test/CodeGen/X86/select.ll =================================================================== --- llvm/test/CodeGen/X86/select.ll +++ llvm/test/CodeGen/X86/select.ll @@ -295,39 +295,39 @@ ; ; ATHLON-LABEL: test6: ; ATHLON: ## %bb.0: -; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %eax ; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %ecx -; ATHLON-NEXT: flds 12(%ecx) -; ATHLON-NEXT: flds 8(%ecx) -; ATHLON-NEXT: flds 4(%ecx) -; ATHLON-NEXT: flds (%ecx) +; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %eax +; ATHLON-NEXT: flds 12(%eax) +; ATHLON-NEXT: flds 8(%eax) +; ATHLON-NEXT: flds 4(%eax) ; ATHLON-NEXT: flds (%eax) +; ATHLON-NEXT: flds (%ecx) ; ATHLON-NEXT: fmul %st, %st(0) ; ATHLON-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; ATHLON-NEXT: fxch %st(1) ; ATHLON-NEXT: fcmove %st(1), %st ; ATHLON-NEXT: fstp %st(1) -; ATHLON-NEXT: flds 4(%eax) +; ATHLON-NEXT: flds 4(%ecx) ; ATHLON-NEXT: fmul %st, %st(0) ; ATHLON-NEXT: fxch %st(2) ; ATHLON-NEXT: fcmove %st(2), %st ; ATHLON-NEXT: fstp %st(2) -; ATHLON-NEXT: flds 8(%eax) +; ATHLON-NEXT: flds 8(%ecx) ; ATHLON-NEXT: fmul %st, %st(0) ; ATHLON-NEXT: fxch %st(3) ; ATHLON-NEXT: fcmove %st(3), %st ; ATHLON-NEXT: fstp %st(3) -; ATHLON-NEXT: flds 12(%eax) +; ATHLON-NEXT: flds 12(%ecx) ; ATHLON-NEXT: fmul %st, %st(0) ; ATHLON-NEXT: fxch %st(4) ; ATHLON-NEXT: fcmove %st(4), %st ; ATHLON-NEXT: fstp %st(4) ; ATHLON-NEXT: fxch %st(3) -; ATHLON-NEXT: fstps 12(%ecx) +; ATHLON-NEXT: fstps 12(%eax) ; ATHLON-NEXT: fxch %st(1) -; ATHLON-NEXT: fstps 8(%ecx) -; ATHLON-NEXT: fstps 4(%ecx) -; ATHLON-NEXT: fstps (%ecx) +; ATHLON-NEXT: fstps 8(%eax) +; ATHLON-NEXT: fstps 4(%eax) +; ATHLON-NEXT: fstps (%eax) ; ATHLON-NEXT: retl ; ; MCU-LABEL: test6: @@ -508,43 +508,43 @@ ; ATHLON-NEXT: pushl %edi ; ATHLON-NEXT: pushl %esi ; ATHLON-NEXT: testb $1, {{[0-9]+}}(%esp) -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %ecx -; ATHLON-NEXT: cmovnel %eax, %ecx ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax +; ATHLON-NEXT: cmovnel %ecx, %eax +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edx +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %ecx +; ATHLON-NEXT: cmovnel %edx, %ecx ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edx -; ATHLON-NEXT: cmovnel %eax, %edx -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %esi -; ATHLON-NEXT: cmovnel %eax, %esi -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax +; ATHLON-NEXT: cmovnel %edx, %esi +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edx ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edi -; ATHLON-NEXT: cmovnel %eax, %edi -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax +; ATHLON-NEXT: cmovnel %edx, %edi +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edx ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %ebx -; ATHLON-NEXT: cmovnel %eax, %ebx -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax +; ATHLON-NEXT: cmovnel %edx, %ebx +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edx ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %ebp -; ATHLON-NEXT: cmovnel %eax, %ebp -; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %eax +; ATHLON-NEXT: cmovnel %edx, %ebp +; ATHLON-NEXT: movl (%eax), %eax ; ATHLON-NEXT: movl (%ecx), %ecx -; ATHLON-NEXT: movl (%edx), %edx -; ATHLON-NEXT: movl (%esi), %esi -; ATHLON-NEXT: movl (%edi), %edi +; ATHLON-NEXT: movl (%esi), %edx +; ATHLON-NEXT: movl (%edi), %esi ; ATHLON-NEXT: movl (%ebx), %ebx -; ATHLON-NEXT: movl (%ebp), %ebp +; ATHLON-NEXT: movl (%ebp), %edi +; ATHLON-NEXT: decl %eax +; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %ebp +; ATHLON-NEXT: movl %eax, 20(%ebp) ; 
ATHLON-NEXT: decl %ecx -; ATHLON-NEXT: movl %ecx, 20(%eax) +; ATHLON-NEXT: movl %ecx, 16(%ebp) ; ATHLON-NEXT: decl %edx -; ATHLON-NEXT: movl %edx, 16(%eax) +; ATHLON-NEXT: movl %edx, 12(%ebp) ; ATHLON-NEXT: decl %esi -; ATHLON-NEXT: movl %esi, 12(%eax) -; ATHLON-NEXT: decl %edi -; ATHLON-NEXT: movl %edi, 8(%eax) +; ATHLON-NEXT: movl %esi, 8(%ebp) ; ATHLON-NEXT: decl %ebx -; ATHLON-NEXT: movl %ebx, 4(%eax) -; ATHLON-NEXT: decl %ebp -; ATHLON-NEXT: movl %ebp, (%eax) +; ATHLON-NEXT: movl %ebx, 4(%ebp) +; ATHLON-NEXT: decl %edi +; ATHLON-NEXT: movl %edi, (%ebp) ; ATHLON-NEXT: popl %esi ; ATHLON-NEXT: popl %edi ; ATHLON-NEXT: popl %ebx Index: llvm/test/CodeGen/X86/setcc-wide-types.ll =================================================================== --- llvm/test/CodeGen/X86/setcc-wide-types.ll +++ llvm/test/CodeGen/X86/setcc-wide-types.ll @@ -236,45 +236,45 @@ ; SSE2-LABEL: ne_i512: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm8, %rax +; SSE2-NEXT: movq %xmm8, %rdx ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] -; SSE2-NEXT: movq %xmm8, %rcx +; SSE2-NEXT: movq %xmm8, %rsi ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm8, %rdx +; SSE2-NEXT: movq %xmm8, %rdi ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3] -; SSE2-NEXT: movq %xmm8, %rsi +; SSE2-NEXT: movq %xmm8, %rax ; SSE2-NEXT: movq %xmm0, %r11 -; SSE2-NEXT: movq %xmm2, %r8 +; SSE2-NEXT: movq %xmm2, %r10 ; SSE2-NEXT: movq %xmm1, %r9 -; SSE2-NEXT: movq %xmm3, %r10 +; SSE2-NEXT: movq %xmm3, %r8 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rdi -; SSE2-NEXT: xorq %rax, %rdi -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: xorq %rcx, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: xorq %rdx, %rcx -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdx ; SSE2-NEXT: xorq %rsi, %rdx -; SSE2-NEXT: orq %rcx, %rdx -; SSE2-NEXT: orq %rax, %rdx -; SSE2-NEXT: orq %rdi, %rdx +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: xorq %rdi, %rsi +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE2-NEXT: movq %xmm0, %rdi +; SSE2-NEXT: xorq %rax, %rdi +; SSE2-NEXT: orq %rsi, %rdi +; SSE2-NEXT: orq %rdx, %rdi +; SSE2-NEXT: orq %rcx, %rdi ; SSE2-NEXT: movq %xmm4, %rax ; SSE2-NEXT: xorq %r11, %rax ; SSE2-NEXT: movq %xmm6, %rcx -; SSE2-NEXT: xorq %r8, %rcx -; SSE2-NEXT: movq %xmm5, %rsi -; SSE2-NEXT: xorq %r9, %rsi -; SSE2-NEXT: movq %xmm7, %rdi -; SSE2-NEXT: xorq %r10, %rdi -; SSE2-NEXT: orq %rsi, %rdi -; SSE2-NEXT: orq %rcx, %rdi -; SSE2-NEXT: orq %rax, %rdi +; SSE2-NEXT: xorq %r10, %rcx +; SSE2-NEXT: movq %xmm5, %rdx +; SSE2-NEXT: xorq %r9, %rdx +; SSE2-NEXT: movq %xmm7, %rsi +; SSE2-NEXT: xorq %r8, %rsi +; SSE2-NEXT: orq %rdx, %rsi +; SSE2-NEXT: orq %rcx, %rsi +; SSE2-NEXT: orq %rax, %rsi ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rdx, %rdi +; SSE2-NEXT: orq %rdi, %rsi ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -317,84 +317,84 @@ ; ; AVX1-LABEL: ne_i512: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vmovq %xmm1, %rcx +; AVX1-NEXT: vmovq %xmm0, %rdx +; AVX1-NEXT: vmovq %xmm1, %rsi ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovq %xmm4, %rdx +; AVX1-NEXT: vmovq %xmm4, %rdi ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vmovq %xmm5, %rsi +; AVX1-NEXT: vmovq %xmm5, %rax ; AVX1-NEXT: vpextrq $1, 
%xmm0, %r11 -; AVX1-NEXT: vpextrq $1, %xmm1, %r8 +; AVX1-NEXT: vpextrq $1, %xmm1, %r10 ; AVX1-NEXT: vpextrq $1, %xmm4, %r9 -; AVX1-NEXT: vpextrq $1, %xmm5, %r10 -; AVX1-NEXT: vmovq %xmm2, %rdi -; AVX1-NEXT: xorq %rax, %rdi -; AVX1-NEXT: vmovq %xmm3, %rax -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rcx +; AVX1-NEXT: vpextrq $1, %xmm5, %r8 +; AVX1-NEXT: vmovq %xmm2, %rcx ; AVX1-NEXT: xorq %rdx, %rcx -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %rdx +; AVX1-NEXT: vmovq %xmm3, %rdx ; AVX1-NEXT: xorq %rsi, %rdx -; AVX1-NEXT: orq %rcx, %rdx -; AVX1-NEXT: orq %rax, %rdx -; AVX1-NEXT: orq %rdi, %rdx +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rsi +; AVX1-NEXT: xorq %rdi, %rsi +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rdi +; AVX1-NEXT: xorq %rax, %rdi +; AVX1-NEXT: orq %rsi, %rdi +; AVX1-NEXT: orq %rdx, %rdi +; AVX1-NEXT: orq %rcx, %rdi ; AVX1-NEXT: vpextrq $1, %xmm2, %rax ; AVX1-NEXT: xorq %r11, %rax ; AVX1-NEXT: vpextrq $1, %xmm3, %rcx -; AVX1-NEXT: xorq %r8, %rcx -; AVX1-NEXT: vpextrq $1, %xmm0, %rsi -; AVX1-NEXT: xorq %r9, %rsi -; AVX1-NEXT: vpextrq $1, %xmm1, %rdi -; AVX1-NEXT: xorq %r10, %rdi -; AVX1-NEXT: orq %rsi, %rdi -; AVX1-NEXT: orq %rcx, %rdi -; AVX1-NEXT: orq %rax, %rdi +; AVX1-NEXT: xorq %r10, %rcx +; AVX1-NEXT: vpextrq $1, %xmm0, %rdx +; AVX1-NEXT: xorq %r9, %rdx +; AVX1-NEXT: vpextrq $1, %xmm1, %rsi +; AVX1-NEXT: xorq %r8, %rsi +; AVX1-NEXT: orq %rdx, %rsi +; AVX1-NEXT: orq %rcx, %rsi +; AVX1-NEXT: orq %rax, %rsi ; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: orq %rdx, %rdi +; AVX1-NEXT: orq %rdi, %rsi ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: ne_i512: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: vmovq %xmm0, %rdx +; AVX2-NEXT: vmovq %xmm1, %rsi ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vmovq %xmm4, %rdx +; AVX2-NEXT: vmovq %xmm4, %rdi ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-NEXT: vmovq %xmm5, %rsi +; AVX2-NEXT: vmovq %xmm5, %rax ; AVX2-NEXT: vpextrq $1, %xmm0, %r11 -; AVX2-NEXT: vpextrq $1, %xmm1, %r8 +; AVX2-NEXT: vpextrq $1, %xmm1, %r10 ; AVX2-NEXT: vpextrq $1, %xmm4, %r9 -; AVX2-NEXT: vpextrq $1, %xmm5, %r10 -; AVX2-NEXT: vmovq %xmm2, %rdi -; AVX2-NEXT: xorq %rax, %rdi -; AVX2-NEXT: vmovq %xmm3, %rax -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: vpextrq $1, %xmm5, %r8 +; AVX2-NEXT: vmovq %xmm2, %rcx ; AVX2-NEXT: xorq %rdx, %rcx -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 -; AVX2-NEXT: vmovq %xmm1, %rdx +; AVX2-NEXT: vmovq %xmm3, %rdx ; AVX2-NEXT: xorq %rsi, %rdx -; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: orq %rax, %rdx -; AVX2-NEXT: orq %rdi, %rdx +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rsi +; AVX2-NEXT: xorq %rdi, %rsi +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %rdi +; AVX2-NEXT: xorq %rax, %rdi +; AVX2-NEXT: orq %rsi, %rdi +; AVX2-NEXT: orq %rdx, %rdi +; AVX2-NEXT: orq %rcx, %rdi ; AVX2-NEXT: vpextrq $1, %xmm2, %rax ; AVX2-NEXT: xorq %r11, %rax ; AVX2-NEXT: vpextrq $1, %xmm3, %rcx -; AVX2-NEXT: xorq %r8, %rcx -; AVX2-NEXT: vpextrq $1, %xmm0, %rsi -; AVX2-NEXT: xorq %r9, %rsi -; AVX2-NEXT: vpextrq $1, %xmm1, %rdi -; AVX2-NEXT: xorq %r10, %rdi -; AVX2-NEXT: orq %rsi, %rdi -; AVX2-NEXT: orq %rcx, %rdi -; AVX2-NEXT: orq %rax, %rdi +; AVX2-NEXT: xorq %r10, %rcx +; AVX2-NEXT: vpextrq $1, %xmm0, %rdx +; 
AVX2-NEXT: xorq %r9, %rdx +; AVX2-NEXT: vpextrq $1, %xmm1, %rsi +; AVX2-NEXT: xorq %r8, %rsi +; AVX2-NEXT: orq %rdx, %rsi +; AVX2-NEXT: orq %rcx, %rsi +; AVX2-NEXT: orq %rax, %rsi ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: orq %rdx, %rdi +; AVX2-NEXT: orq %rdi, %rsi ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -427,45 +427,45 @@ ; SSE2-LABEL: eq_i512: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm8, %rax +; SSE2-NEXT: movq %xmm8, %rdx ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] -; SSE2-NEXT: movq %xmm8, %rcx +; SSE2-NEXT: movq %xmm8, %rsi ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] -; SSE2-NEXT: movq %xmm8, %rdx +; SSE2-NEXT: movq %xmm8, %rdi ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3] -; SSE2-NEXT: movq %xmm8, %rsi +; SSE2-NEXT: movq %xmm8, %rax ; SSE2-NEXT: movq %xmm0, %r11 -; SSE2-NEXT: movq %xmm2, %r8 +; SSE2-NEXT: movq %xmm2, %r10 ; SSE2-NEXT: movq %xmm1, %r9 -; SSE2-NEXT: movq %xmm3, %r10 +; SSE2-NEXT: movq %xmm3, %r8 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rdi -; SSE2-NEXT: xorq %rax, %rdi -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: xorq %rcx, %rax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: xorq %rdx, %rcx -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdx ; SSE2-NEXT: xorq %rsi, %rdx -; SSE2-NEXT: orq %rcx, %rdx -; SSE2-NEXT: orq %rax, %rdx -; SSE2-NEXT: orq %rdi, %rdx +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: xorq %rdi, %rsi +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE2-NEXT: movq %xmm0, %rdi +; SSE2-NEXT: xorq %rax, %rdi +; SSE2-NEXT: orq %rsi, %rdi +; SSE2-NEXT: orq %rdx, %rdi +; SSE2-NEXT: orq %rcx, %rdi ; SSE2-NEXT: movq %xmm4, %rax ; SSE2-NEXT: xorq %r11, %rax ; SSE2-NEXT: movq %xmm6, %rcx -; SSE2-NEXT: xorq %r8, %rcx -; SSE2-NEXT: movq %xmm5, %rsi -; SSE2-NEXT: xorq %r9, %rsi -; SSE2-NEXT: movq %xmm7, %rdi -; SSE2-NEXT: xorq %r10, %rdi -; SSE2-NEXT: orq %rsi, %rdi -; SSE2-NEXT: orq %rcx, %rdi -; SSE2-NEXT: orq %rax, %rdi +; SSE2-NEXT: xorq %r10, %rcx +; SSE2-NEXT: movq %xmm5, %rdx +; SSE2-NEXT: xorq %r9, %rdx +; SSE2-NEXT: movq %xmm7, %rsi +; SSE2-NEXT: xorq %r8, %rsi +; SSE2-NEXT: orq %rdx, %rsi +; SSE2-NEXT: orq %rcx, %rsi +; SSE2-NEXT: orq %rax, %rsi ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rdx, %rdi +; SSE2-NEXT: orq %rdi, %rsi ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -508,84 +508,84 @@ ; ; AVX1-LABEL: eq_i512: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vmovq %xmm1, %rcx +; AVX1-NEXT: vmovq %xmm0, %rdx +; AVX1-NEXT: vmovq %xmm1, %rsi ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovq %xmm4, %rdx +; AVX1-NEXT: vmovq %xmm4, %rdi ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vmovq %xmm5, %rsi +; AVX1-NEXT: vmovq %xmm5, %rax ; AVX1-NEXT: vpextrq $1, %xmm0, %r11 -; AVX1-NEXT: vpextrq $1, %xmm1, %r8 +; AVX1-NEXT: vpextrq $1, %xmm1, %r10 ; AVX1-NEXT: vpextrq $1, %xmm4, %r9 -; AVX1-NEXT: vpextrq $1, %xmm5, %r10 -; AVX1-NEXT: vmovq %xmm2, %rdi -; AVX1-NEXT: xorq %rax, %rdi -; AVX1-NEXT: vmovq %xmm3, %rax -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rcx +; AVX1-NEXT: vpextrq $1, %xmm5, %r8 +; AVX1-NEXT: vmovq %xmm2, %rcx ; AVX1-NEXT: xorq %rdx, %rcx -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-NEXT: 
vmovq %xmm1, %rdx +; AVX1-NEXT: vmovq %xmm3, %rdx ; AVX1-NEXT: xorq %rsi, %rdx -; AVX1-NEXT: orq %rcx, %rdx -; AVX1-NEXT: orq %rax, %rdx -; AVX1-NEXT: orq %rdi, %rdx +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rsi +; AVX1-NEXT: xorq %rdi, %rsi +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rdi +; AVX1-NEXT: xorq %rax, %rdi +; AVX1-NEXT: orq %rsi, %rdi +; AVX1-NEXT: orq %rdx, %rdi +; AVX1-NEXT: orq %rcx, %rdi ; AVX1-NEXT: vpextrq $1, %xmm2, %rax ; AVX1-NEXT: xorq %r11, %rax ; AVX1-NEXT: vpextrq $1, %xmm3, %rcx -; AVX1-NEXT: xorq %r8, %rcx -; AVX1-NEXT: vpextrq $1, %xmm0, %rsi -; AVX1-NEXT: xorq %r9, %rsi -; AVX1-NEXT: vpextrq $1, %xmm1, %rdi -; AVX1-NEXT: xorq %r10, %rdi -; AVX1-NEXT: orq %rsi, %rdi -; AVX1-NEXT: orq %rcx, %rdi -; AVX1-NEXT: orq %rax, %rdi +; AVX1-NEXT: xorq %r10, %rcx +; AVX1-NEXT: vpextrq $1, %xmm0, %rdx +; AVX1-NEXT: xorq %r9, %rdx +; AVX1-NEXT: vpextrq $1, %xmm1, %rsi +; AVX1-NEXT: xorq %r8, %rsi +; AVX1-NEXT: orq %rdx, %rsi +; AVX1-NEXT: orq %rcx, %rsi +; AVX1-NEXT: orq %rax, %rsi ; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: orq %rdx, %rdi +; AVX1-NEXT: orq %rdi, %rsi ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: eq_i512: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: vmovq %xmm0, %rdx +; AVX2-NEXT: vmovq %xmm1, %rsi ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vmovq %xmm4, %rdx +; AVX2-NEXT: vmovq %xmm4, %rdi ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-NEXT: vmovq %xmm5, %rsi +; AVX2-NEXT: vmovq %xmm5, %rax ; AVX2-NEXT: vpextrq $1, %xmm0, %r11 -; AVX2-NEXT: vpextrq $1, %xmm1, %r8 +; AVX2-NEXT: vpextrq $1, %xmm1, %r10 ; AVX2-NEXT: vpextrq $1, %xmm4, %r9 -; AVX2-NEXT: vpextrq $1, %xmm5, %r10 -; AVX2-NEXT: vmovq %xmm2, %rdi -; AVX2-NEXT: xorq %rax, %rdi -; AVX2-NEXT: vmovq %xmm3, %rax -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: vpextrq $1, %xmm5, %r8 +; AVX2-NEXT: vmovq %xmm2, %rcx ; AVX2-NEXT: xorq %rdx, %rcx -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 -; AVX2-NEXT: vmovq %xmm1, %rdx +; AVX2-NEXT: vmovq %xmm3, %rdx ; AVX2-NEXT: xorq %rsi, %rdx -; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: orq %rax, %rdx -; AVX2-NEXT: orq %rdi, %rdx +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rsi +; AVX2-NEXT: xorq %rdi, %rsi +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %rdi +; AVX2-NEXT: xorq %rax, %rdi +; AVX2-NEXT: orq %rsi, %rdi +; AVX2-NEXT: orq %rdx, %rdi +; AVX2-NEXT: orq %rcx, %rdi ; AVX2-NEXT: vpextrq $1, %xmm2, %rax ; AVX2-NEXT: xorq %r11, %rax ; AVX2-NEXT: vpextrq $1, %xmm3, %rcx -; AVX2-NEXT: xorq %r8, %rcx -; AVX2-NEXT: vpextrq $1, %xmm0, %rsi -; AVX2-NEXT: xorq %r9, %rsi -; AVX2-NEXT: vpextrq $1, %xmm1, %rdi -; AVX2-NEXT: xorq %r10, %rdi -; AVX2-NEXT: orq %rsi, %rdi -; AVX2-NEXT: orq %rcx, %rdi -; AVX2-NEXT: orq %rax, %rdi +; AVX2-NEXT: xorq %r10, %rcx +; AVX2-NEXT: vpextrq $1, %xmm0, %rdx +; AVX2-NEXT: xorq %r9, %rdx +; AVX2-NEXT: vpextrq $1, %xmm1, %rsi +; AVX2-NEXT: xorq %r8, %rsi +; AVX2-NEXT: orq %rdx, %rsi +; AVX2-NEXT: orq %rcx, %rsi +; AVX2-NEXT: orq %rax, %rsi ; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: orq %rdx, %rdi +; AVX2-NEXT: orq %rdi, %rsi ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -744,22 +744,22 @@ ; SSE2-NEXT: xorq 24(%rsi), %r11 ; SSE2-NEXT: xorq (%rsi), %r8 ; SSE2-NEXT: xorq 16(%rsi), %r9 -; SSE2-NEXT: movq 48(%rdi), %rdx +; SSE2-NEXT: movq 48(%rdi), %rcx ; 
SSE2-NEXT: movq 32(%rdi), %rax -; SSE2-NEXT: movq 56(%rdi), %rcx +; SSE2-NEXT: movq 56(%rdi), %rdx ; SSE2-NEXT: movq 40(%rdi), %rdi ; SSE2-NEXT: xorq 40(%rsi), %rdi -; SSE2-NEXT: xorq 56(%rsi), %rcx -; SSE2-NEXT: orq %r11, %rcx -; SSE2-NEXT: orq %rdi, %rcx -; SSE2-NEXT: orq %r10, %rcx +; SSE2-NEXT: xorq 56(%rsi), %rdx +; SSE2-NEXT: orq %r11, %rdx +; SSE2-NEXT: orq %rdi, %rdx +; SSE2-NEXT: orq %r10, %rdx ; SSE2-NEXT: xorq 32(%rsi), %rax -; SSE2-NEXT: xorq 48(%rsi), %rdx -; SSE2-NEXT: orq %r9, %rdx -; SSE2-NEXT: orq %rax, %rdx -; SSE2-NEXT: orq %r8, %rdx +; SSE2-NEXT: xorq 48(%rsi), %rcx +; SSE2-NEXT: orq %r9, %rcx +; SSE2-NEXT: orq %rax, %rcx +; SSE2-NEXT: orq %r8, %rcx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rcx, %rdx +; SSE2-NEXT: orq %rdx, %rcx ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -773,22 +773,22 @@ ; SSE41-NEXT: xorq 24(%rsi), %r11 ; SSE41-NEXT: xorq (%rsi), %r8 ; SSE41-NEXT: xorq 16(%rsi), %r9 -; SSE41-NEXT: movq 48(%rdi), %rdx +; SSE41-NEXT: movq 48(%rdi), %rcx ; SSE41-NEXT: movq 32(%rdi), %rax -; SSE41-NEXT: movq 56(%rdi), %rcx +; SSE41-NEXT: movq 56(%rdi), %rdx ; SSE41-NEXT: movq 40(%rdi), %rdi ; SSE41-NEXT: xorq 40(%rsi), %rdi -; SSE41-NEXT: xorq 56(%rsi), %rcx -; SSE41-NEXT: orq %r11, %rcx -; SSE41-NEXT: orq %rdi, %rcx -; SSE41-NEXT: orq %r10, %rcx +; SSE41-NEXT: xorq 56(%rsi), %rdx +; SSE41-NEXT: orq %r11, %rdx +; SSE41-NEXT: orq %rdi, %rdx +; SSE41-NEXT: orq %r10, %rdx ; SSE41-NEXT: xorq 32(%rsi), %rax -; SSE41-NEXT: xorq 48(%rsi), %rdx -; SSE41-NEXT: orq %r9, %rdx -; SSE41-NEXT: orq %rax, %rdx -; SSE41-NEXT: orq %r8, %rdx +; SSE41-NEXT: xorq 48(%rsi), %rcx +; SSE41-NEXT: orq %r9, %rcx +; SSE41-NEXT: orq %rax, %rcx +; SSE41-NEXT: orq %r8, %rcx ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: orq %rcx, %rdx +; SSE41-NEXT: orq %rdx, %rcx ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; @@ -858,22 +858,22 @@ ; SSE2-NEXT: xorq 24(%rsi), %r11 ; SSE2-NEXT: xorq (%rsi), %r8 ; SSE2-NEXT: xorq 16(%rsi), %r9 -; SSE2-NEXT: movq 48(%rdi), %rdx +; SSE2-NEXT: movq 48(%rdi), %rcx ; SSE2-NEXT: movq 32(%rdi), %rax -; SSE2-NEXT: movq 56(%rdi), %rcx +; SSE2-NEXT: movq 56(%rdi), %rdx ; SSE2-NEXT: movq 40(%rdi), %rdi ; SSE2-NEXT: xorq 40(%rsi), %rdi -; SSE2-NEXT: xorq 56(%rsi), %rcx -; SSE2-NEXT: orq %r11, %rcx -; SSE2-NEXT: orq %rdi, %rcx -; SSE2-NEXT: orq %r10, %rcx +; SSE2-NEXT: xorq 56(%rsi), %rdx +; SSE2-NEXT: orq %r11, %rdx +; SSE2-NEXT: orq %rdi, %rdx +; SSE2-NEXT: orq %r10, %rdx ; SSE2-NEXT: xorq 32(%rsi), %rax -; SSE2-NEXT: xorq 48(%rsi), %rdx -; SSE2-NEXT: orq %r9, %rdx -; SSE2-NEXT: orq %rax, %rdx -; SSE2-NEXT: orq %r8, %rdx +; SSE2-NEXT: xorq 48(%rsi), %rcx +; SSE2-NEXT: orq %r9, %rcx +; SSE2-NEXT: orq %rax, %rcx +; SSE2-NEXT: orq %r8, %rcx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rcx, %rdx +; SSE2-NEXT: orq %rdx, %rcx ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -887,22 +887,22 @@ ; SSE41-NEXT: xorq 24(%rsi), %r11 ; SSE41-NEXT: xorq (%rsi), %r8 ; SSE41-NEXT: xorq 16(%rsi), %r9 -; SSE41-NEXT: movq 48(%rdi), %rdx +; SSE41-NEXT: movq 48(%rdi), %rcx ; SSE41-NEXT: movq 32(%rdi), %rax -; SSE41-NEXT: movq 56(%rdi), %rcx +; SSE41-NEXT: movq 56(%rdi), %rdx ; SSE41-NEXT: movq 40(%rdi), %rdi ; SSE41-NEXT: xorq 40(%rsi), %rdi -; SSE41-NEXT: xorq 56(%rsi), %rcx -; SSE41-NEXT: orq %r11, %rcx -; SSE41-NEXT: orq %rdi, %rcx -; SSE41-NEXT: orq %r10, %rcx +; SSE41-NEXT: xorq 56(%rsi), %rdx +; SSE41-NEXT: orq %r11, %rdx +; SSE41-NEXT: orq %rdi, %rdx +; SSE41-NEXT: orq %r10, %rdx ; SSE41-NEXT: xorq 32(%rsi), %rax -; SSE41-NEXT: xorq 48(%rsi), %rdx -; SSE41-NEXT: orq %r9, %rdx -; SSE41-NEXT: 
orq %rax, %rdx -; SSE41-NEXT: orq %r8, %rdx +; SSE41-NEXT: xorq 48(%rsi), %rcx +; SSE41-NEXT: orq %r9, %rcx +; SSE41-NEXT: orq %rax, %rcx +; SSE41-NEXT: orq %r8, %rcx ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: orq %rcx, %rdx +; SSE41-NEXT: orq %rdx, %rcx ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; Index: llvm/test/CodeGen/X86/shrink_vmul.ll =================================================================== --- llvm/test/CodeGen/X86/shrink_vmul.ll +++ llvm/test/CodeGen/X86/shrink_vmul.ll @@ -18,20 +18,20 @@ ; X86-SSE-LABEL: mul_2xi8: ; X86-SSE: # %bb.0: # %entry ; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: movl c, %esi -; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx +; X86-SSE-NEXT: movzwl (%edx,%eax), %edx ; X86-SSE-NEXT: movd %edx, %xmm0 -; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax -; X86-SSE-NEXT: movd %eax, %xmm1 +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm1 ; X86-SSE-NEXT: pxor %xmm2, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; X86-SSE-NEXT: pmullw %xmm0, %xmm1 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: movq %xmm1, (%esi,%eax,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -184,10 +184,10 @@ ; X86-SSE-LABEL: mul_8xi8: ; X86-SSE: # %bb.0: # %entry ; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE-NEXT: movl c, %ecx ; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-SSE-NEXT: pxor %xmm2, %xmm2 @@ -197,8 +197,8 @@ ; X86-SSE-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm1, 16(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm0, (%ecx,%eax,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -300,41 +300,41 @@ ; X86-SSE-LABEL: mul_16xi8: ; X86-SSE: # %bb.0: # %entry ; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl c, %esi -; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 -; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1 -; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE-NEXT: movl c, %ecx +; X86-SSE-NEXT: movdqu (%esi,%eax), %xmm3 +; X86-SSE-NEXT: movdqu (%edx,%eax), %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; X86-SSE-NEXT: pmullw %xmm4, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; X86-SSE-NEXT: pmullw %xmm3, %xmm0 ; X86-SSE-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X86-SSE-NEXT: movdqa %xmm1, %xmm4 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; X86-SSE-NEXT: pmullw %xmm3, %xmm4 -; X86-SSE-NEXT: movdqa %xmm4, %xmm3 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; X86-SSE-NEXT: pmullw %xmm0, %xmm1 -; X86-SSE-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm0, 32(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm4, 16(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm3, (%esi,%ecx,4) +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: movdqu %xmm0, 48(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm3, 32(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm2, 16(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm4, (%ecx,%eax,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; ; X86-AVX1-LABEL: mul_16xi8: ; X86-AVX1: # %bb.0: # %entry ; X86-AVX1-NEXT: pushl %esi -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-AVX1-NEXT: movl c, %ecx ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X86-AVX1-NEXT: vpmovzxbd 
{{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero @@ -347,10 +347,10 @@ ; X86-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X86-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%ecx,%eax,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%ecx,%eax,4) +; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%ecx,%eax,4) +; X86-AVX1-NEXT: vmovdqu %xmm3, (%ecx,%eax,4) ; X86-AVX1-NEXT: popl %esi ; X86-AVX1-NEXT: retl ; @@ -728,40 +728,40 @@ ; X86-SSE-LABEL: mul_16xi16: ; X86-SSE: # %bb.0: # %entry ; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl c, %esi -; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 -; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1 -; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2 -; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3 -; X86-SSE-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4 -; X86-SSE-NEXT: pmullw %xmm0, %xmm2 -; X86-SSE-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4 -; X86-SSE-NEXT: pmullw %xmm1, %xmm3 -; X86-SSE-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE-NEXT: movl c, %ecx +; X86-SSE-NEXT: movdqu (%esi,%eax), %xmm2 +; X86-SSE-NEXT: movdqu 16(%esi,%eax), %xmm3 +; X86-SSE-NEXT: movdqu (%edx,%eax), %xmm0 +; X86-SSE-NEXT: movdqu 16(%edx,%eax), %xmm1 +; X86-SSE-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE-NEXT: pmulhuw %xmm2, %xmm4 +; X86-SSE-NEXT: pmullw %xmm2, %xmm0 +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X86-SSE-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE-NEXT: pmulhuw %xmm3, %xmm4 +; X86-SSE-NEXT: pmullw %xmm3, %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X86-SSE-NEXT: movdqu %xmm1, 32(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm3, 48(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm0, (%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm2, 16(%ecx,%eax,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; ; X86-AVX1-LABEL: 
mul_16xi16: ; X86-AVX1: # %bb.0: # %entry ; X86-AVX1-NEXT: pushl %esi -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-AVX1-NEXT: movl c, %ecx ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero @@ -774,10 +774,10 @@ ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%ecx,%eax,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%ecx,%eax,4) +; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%ecx,%eax,4) +; X86-AVX1-NEXT: vmovdqu %xmm3, (%ecx,%eax,4) ; X86-AVX1-NEXT: popl %esi ; X86-AVX1-NEXT: retl ; @@ -886,14 +886,14 @@ ; X86-SSE-LABEL: mul_2xi8_sext: ; X86-SSE: # %bb.0: # %entry ; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl c, %esi -; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx -; X86-SSE-NEXT: movd %edx, %xmm0 -; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax -; X86-SSE-NEXT: movd %eax, %xmm1 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE-NEXT: movl c, %ecx +; X86-SSE-NEXT: movzwl (%esi,%eax), %esi +; X86-SSE-NEXT: movd %esi, %xmm0 +; X86-SSE-NEXT: movzwl (%edx,%eax), %edx +; X86-SSE-NEXT: movd %edx, %xmm1 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: psraw $8, %xmm0 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -901,7 +901,7 @@ ; X86-SSE-NEXT: pmullw %xmm0, %xmm1 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: movq %xmm0, (%ecx,%eax,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -979,14 +979,14 @@ ; X86-SSE-LABEL: mul_2xi8_sext_zext: ; X86-SSE: # %bb.0: # %entry ; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl c, %esi -; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx -; X86-SSE-NEXT: movd %edx, %xmm0 -; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax -; X86-SSE-NEXT: movd %eax, %xmm1 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE-NEXT: movl c, %ecx +; X86-SSE-NEXT: movzwl (%esi,%eax), %esi +; X86-SSE-NEXT: movd %esi, %xmm0 +; X86-SSE-NEXT: movzwl (%edx,%eax), %edx +; X86-SSE-NEXT: movd %edx, %xmm1 ; X86-SSE-NEXT: pxor %xmm2, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -995,7 +995,7 @@ ; X86-SSE-NEXT: pmulhw %xmm0, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 ; X86-SSE-NEXT: 
punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: movq %xmm0, (%ecx,%eax,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -1153,10 +1153,10 @@ ; X86-SSE-LABEL: mul_2xi16_sext_zext: ; X86-SSE: # %bb.0: # %entry ; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE-NEXT: movl c, %ecx ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $16, %xmm0 @@ -1168,7 +1168,7 @@ ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: movq %xmm1, (%ecx,%eax,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -1242,56 +1242,56 @@ ; X86-SSE-LABEL: mul_16xi16_sext: ; X86-SSE: # %bb.0: # %entry ; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl c, %esi -; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 -; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1 -; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2 -; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3 -; X86-SSE-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE-NEXT: pmulhw %xmm0, %xmm4 -; X86-SSE-NEXT: pmullw %xmm0, %xmm2 -; X86-SSE-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: pmulhw %xmm1, %xmm4 -; X86-SSE-NEXT: pmullw %xmm1, %xmm3 -; X86-SSE-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE-NEXT: movl c, %ecx +; X86-SSE-NEXT: movdqu (%esi,%eax), %xmm2 +; X86-SSE-NEXT: movdqu 16(%esi,%eax), %xmm3 +; X86-SSE-NEXT: movdqu (%edx,%eax), %xmm0 +; X86-SSE-NEXT: movdqu 16(%edx,%eax), %xmm1 +; X86-SSE-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE-NEXT: pmulhw %xmm2, %xmm4 +; X86-SSE-NEXT: pmullw %xmm2, %xmm0 +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X86-SSE-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE-NEXT: pmulhw %xmm3, %xmm4 +; X86-SSE-NEXT: pmullw %xmm3, %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X86-SSE-NEXT: movdqu %xmm1, 32(%ecx,%eax,4) +; X86-SSE-NEXT: 
movdqu %xmm3, 48(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm0, (%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm2, 16(%ecx,%eax,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; ; X86-AVX1-LABEL: mul_16xi16_sext: ; X86-AVX1: # %bb.0: # %entry ; X86-AVX1-NEXT: pushl %esi -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX1-NEXT: movl c, %esi -; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%ecx), %xmm0 -; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%ecx), %xmm1 -; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%ecx), %xmm2 -; X86-AVX1-NEXT: vpmovsxwd (%edx,%ecx), %xmm3 -; X86-AVX1-NEXT: vpmovsxwd 24(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-AVX1-NEXT: movl c, %ecx +; X86-AVX1-NEXT: vpmovsxwd 24(%esi,%eax), %xmm0 +; X86-AVX1-NEXT: vpmovsxwd 16(%esi,%eax), %xmm1 +; X86-AVX1-NEXT: vpmovsxwd 8(%esi,%eax), %xmm2 +; X86-AVX1-NEXT: vpmovsxwd (%esi,%eax), %xmm3 +; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%eax), %xmm4 ; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 -; X86-AVX1-NEXT: vpmovsxwd 16(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%eax), %xmm4 ; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 -; X86-AVX1-NEXT: vpmovsxwd 8(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%eax), %xmm4 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 -; X86-AVX1-NEXT: vpmovsxwd (%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmovsxwd (%edx,%eax), %xmm4 ; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4) +; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%ecx,%eax,4) +; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%ecx,%eax,4) +; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%ecx,%eax,4) +; X86-AVX1-NEXT: vmovdqu %xmm3, (%ecx,%eax,4) ; X86-AVX1-NEXT: popl %esi ; X86-AVX1-NEXT: retl ; Index: llvm/test/CodeGen/X86/smax.ll =================================================================== --- llvm/test/CodeGen/X86/smax.ll +++ llvm/test/CodeGen/X86/smax.ll @@ -158,50 +158,50 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl %ebx, %edx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: cmoval %edx, %eax -; X86-NEXT: cmpl %esi, %ecx -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: cmoval %edx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %ecx, %esi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cmoval %esi, %eax +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: cmoval %esi, %ebp ; X86-NEXT: cmovel %eax, %ebp -; X86-NEXT: movl %esi, %eax -; X86-NEXT: cmoval %ecx, %eax +; X86-NEXT: movl %edx, %eax +; X86-NEXT: cmoval %edi, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sbbl %edi, %ecx -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: cmovll {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebx, %edi ; X86-NEXT: xorl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: cmovel %ebp, %ebx -; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl %eax, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: cmoval %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: xorl %esi, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: cmovel %ebp, %ecx +; X86-NEXT: cmovel (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmpl %esi, %ebx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: cmoval %ebx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmpl %edx, %ebp -; X86-NEXT: cmovgl %edi, %eax -; X86-NEXT: cmovel %ecx, %eax -; X86-NEXT: cmovgl %ebp, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %edx, 12(%ecx) -; X86-NEXT: movl %eax, 8(%ecx) -; X86-NEXT: movl %esi, 4(%ecx) -; X86-NEXT: movl %ebx, (%ecx) -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cmpl %eax, %ebp +; X86-NEXT: cmovgl %ebx, %esi +; X86-NEXT: cmovel %edi, %esi +; X86-NEXT: cmovgl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %eax, 12(%edi) +; X86-NEXT: movl %esi, 8(%edi) +; X86-NEXT: movl %edx, 4(%edi) +; X86-NEXT: movl %ecx, (%edi) +; X86-NEXT: movl %edi, %eax ; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi Index: llvm/test/CodeGen/X86/smin.ll =================================================================== --- llvm/test/CodeGen/X86/smin.ll +++ llvm/test/CodeGen/X86/smin.ll @@ -161,47 +161,46 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmpl %ecx, %edi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cmovbl %edi, %eax +; X86-NEXT: cmpl %esi, %ebp ; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: cmovbl %eax, %ebx -; X86-NEXT: cmpl %esi, %edi -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: cmovbl %eax, %ebp -; X86-NEXT: cmovel %ebx, %ebp +; X86-NEXT: cmovbl %edi, %ebx +; X86-NEXT: cmovel %eax, %ebx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: cmovbl %edi, %eax +; X86-NEXT: cmovbl %ebp, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: cmpl %edx, %edi ; X86-NEXT: movl %edx, %eax ; X86-NEXT: cmovbl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: sbbl %edi, %ebp ; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi ; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: xorl %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %edx, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: cmovel %ebp, %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: xorl %edi, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: xorl %edx, %eax +; X86-NEXT: orl %ebp, %eax +; X86-NEXT: cmovel %ebx, %ecx ; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
X86-NEXT: cmpl %edi, %eax ; X86-NEXT: cmovll {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: cmovll %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %eax, 12(%edi) -; X86-NEXT: movl %edx, 8(%edi) -; X86-NEXT: movl %esi, 4(%edi) -; X86-NEXT: movl %ecx, (%edi) -; X86-NEXT: movl %edi, %eax +; X86-NEXT: cmovll %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi Index: llvm/test/CodeGen/X86/smul_fix.ll =================================================================== --- llvm/test/CodeGen/X86/smul_fix.ll +++ llvm/test/CodeGen/X86/smul_fix.ll @@ -51,36 +51,36 @@ ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: addl %ebp, %eax -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: movl %edi, %esi -; X86-NEXT: imull {{[0-9]+}}(%esp), %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: movl %esi, %ebp +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: imull {{[0-9]+}}(%esp), %edi +; X86-NEXT: addl %edx, %edi +; X86-NEXT: movl %edi, %ebp ; X86-NEXT: subl %ecx, %ebp -; X86-NEXT: testl %edi, %edi -; X86-NEXT: cmovnsl %esi, %ebp +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: cmovnsl %edi, %ebp ; X86-NEXT: movl %ebp, %edx ; X86-NEXT: subl {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X86-NEXT: cmovnsl %ebp, %edx ; X86-NEXT: shldl $30, %eax, %edx -; X86-NEXT: shldl $30, %ebx, %eax +; X86-NEXT: shldl $30, %esi, %eax ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi @@ -165,28 +165,28 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: shldl $30, %eax, %ebp +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shldl $30, %eax, %esi ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: shldl $30, %eax, %ebx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: shldl $30, %eax, %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: shldl $30, %eax, %ebp +; X86-NEXT: movl %edi, %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) ; X86-NEXT: shldl $30, %eax, %edx ; X86-NEXT: movl %edx, 12(%ecx) -; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl 
%ebp, 8(%ecx) ; X86-NEXT: movl %ebx, 4(%ecx) -; X86-NEXT: movl %ebp, (%ecx) +; X86-NEXT: movl %esi, (%ecx) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -318,26 +318,26 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %ebp +; X86-NEXT: mull %esi ; X86-NEXT: addl %edx, %ebx ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %esi ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: adcl %edi, %edx -; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: imull {{[0-9]+}}(%esp), %edi ; X86-NEXT: addl %edx, %edi ; X86-NEXT: movl %edi, %ebx -; X86-NEXT: subl %ebp, %ebx -; X86-NEXT: testl %esi, %esi +; X86-NEXT: subl %esi, %ebx +; X86-NEXT: testl %ebp, %ebp ; X86-NEXT: cmovnsl %edi, %ebx ; X86-NEXT: movl %ebx, %edx ; X86-NEXT: subl %ecx, %edx @@ -368,31 +368,32 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: imull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: mull %esi +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull %edi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: adcl %edi, %edx -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebp, %esi +; X86-NEXT: subl %esi, %ecx +; X86-NEXT: movl %edi, %esi ; X86-NEXT: sbbl $0, %esi ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %ebp, %esi +; X86-NEXT: cmovnsl %edi, %esi ; X86-NEXT: cmovnsl %edx, %ecx ; X86-NEXT: movl %ecx, %edi ; X86-NEXT: subl {{[0-9]+}}(%esp), %edi Index: llvm/test/CodeGen/X86/smul_fix_sat.ll =================================================================== --- llvm/test/CodeGen/X86/smul_fix_sat.ll +++ llvm/test/CodeGen/X86/smul_fix_sat.ll @@ -60,69 +60,75 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: adcl $0, %edi -; X86-NEXT: imull %esi -; 
X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: imull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: adcl %edi, %edx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl %ebx, %edx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: adcl %ebp, %edx +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl %edi, %edx +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: sbbl $0, %ebx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: sbbl $0, %ebp ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %esi, %ebx +; X86-NEXT: cmovnsl %ebx, %ebp ; X86-NEXT: cmovnsl %edx, %edi -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: subl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: sbbl $0, %edx ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %ebx, %esi -; X86-NEXT: cmovnsl %edi, %ebp -; X86-NEXT: testl %esi, %esi +; X86-NEXT: cmovnsl %ebp, %edx +; X86-NEXT: cmovnsl %edi, %ecx +; X86-NEXT: testl %edx, %edx ; X86-NEXT: setg %bl ; X86-NEXT: sete %bh -; X86-NEXT: cmpl $2, %ebp -; X86-NEXT: setae %dl -; X86-NEXT: andb %bh, %dl -; X86-NEXT: orb %bl, %dl -; X86-NEXT: shrdl $2, %eax, %ecx -; X86-NEXT: shrdl $2, %ebp, %eax -; X86-NEXT: testb %dl, %dl +; X86-NEXT: cmpl $2, %ecx +; X86-NEXT: setae %al +; X86-NEXT: andb %bh, %al +; X86-NEXT: orb %bl, %al +; X86-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-NEXT: shrdl $2, %esi, %ebx +; X86-NEXT: shrdl $2, %ecx, %esi +; X86-NEXT: testb %al, %al ; X86-NEXT: movl $2147483647, %edi # imm = 0x7FFFFFFF -; X86-NEXT: cmovel %eax, %edi +; X86-NEXT: cmovel %esi, %edi ; X86-NEXT: movl $-1, %eax -; X86-NEXT: cmovnel %eax, %ecx -; X86-NEXT: cmpl $-1, %esi -; X86-NEXT: setl %al +; X86-NEXT: cmovnel %eax, %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: cmpl $-1, %edx +; X86-NEXT: setl %bl ; X86-NEXT: sete %dl -; X86-NEXT: cmpl $-2, %ebp -; X86-NEXT: setb %ah -; X86-NEXT: andb %dl, %ah +; X86-NEXT: cmpl $-2, %ecx +; X86-NEXT: setb %cl +; X86-NEXT: andb %dl, %cl ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: orb %al, %ah -; X86-NEXT: cmovnel %edx, %ecx +; X86-NEXT: orb %bl, %cl +; X86-NEXT: cmovnel %edx, %eax ; X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000 ; X86-NEXT: cmovel %edi, %edx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -477,31 +483,31 @@ ; X64-NEXT: imull %edx, %ecx ; X64-NEXT: cmovol %edi, %ecx ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-NEXT: movd %xmm2, %edx -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X64-NEXT: movd %xmm2, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: imull %edx, %edi +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X64-NEXT: movd %xmm2, %edx +; X64-NEXT: movl %edx, %edi +; X64-NEXT: imull %esi, %edi ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: testl %edi, %edi ; X64-NEXT: setns %al ; X64-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-NEXT: imull %edx, %esi -; X64-NEXT: cmovol %eax, %esi +; X64-NEXT: imull %esi, %edx 
+; X64-NEXT: cmovol %eax, %edx ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; X64-NEXT: movd %xmm1, %r9d ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; X64-NEXT: movd %xmm0, %edx -; X64-NEXT: movl %edx, %edi +; X64-NEXT: movd %xmm0, %esi +; X64-NEXT: movl %esi, %edi ; X64-NEXT: imull %r9d, %edi ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: testl %edi, %edi ; X64-NEXT: setns %al ; X64-NEXT: addl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-NEXT: imull %r9d, %edx -; X64-NEXT: cmovol %eax, %edx -; X64-NEXT: movd %edx, %xmm0 -; X64-NEXT: movd %esi, %xmm1 +; X64-NEXT: imull %r9d, %esi +; X64-NEXT: cmovol %eax, %esi +; X64-NEXT: movd %esi, %xmm0 +; X64-NEXT: movd %edx, %xmm1 ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X64-NEXT: movd %ecx, %xmm0 ; X64-NEXT: movd %r8d, %xmm2 @@ -589,62 +595,65 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: imull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: mull %esi +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull %edi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: adcl %edi, %edx -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebp, %esi +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: subl %esi, %ebx +; X86-NEXT: movl %edi, %esi ; X86-NEXT: sbbl $0, %esi ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %ebp, %esi -; X86-NEXT: cmovnsl %edx, %ecx -; X86-NEXT: movl %ecx, %edx +; X86-NEXT: cmovnsl %edi, %esi +; X86-NEXT: cmovnsl %edx, %ebx +; X86-NEXT: movl %ebx, %edx ; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %esi, %edi -; X86-NEXT: sbbl $0, %edi +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %esi, %edi -; X86-NEXT: cmovnsl %ecx, %edx +; X86-NEXT: cmovnsl %esi, %ecx +; X86-NEXT: cmovnsl %ebx, %edx ; X86-NEXT: testl %edx, %edx -; X86-NEXT: setns %cl -; X86-NEXT: sets %ch -; X86-NEXT: testl %edi, %edi -; X86-NEXT: setg %bl -; X86-NEXT: sete %bh -; X86-NEXT: andb %ch, %bh -; X86-NEXT: orb %bl, %bh +; X86-NEXT: setns {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: sets %bh +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: setg {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: sete %bl +; X86-NEXT: andb %bh, %bl +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload ; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF ; X86-NEXT: cmovnel %esi, %edx ; X86-NEXT: movl $-1, %esi ; X86-NEXT: cmovnel %esi, %eax -; X86-NEXT: cmpl $-1, %edi -; X86-NEXT: setl %ch -; X86-NEXT: sete %bl -; X86-NEXT: andb %cl, %bl +; X86-NEXT: 
cmpl $-1, %ecx +; X86-NEXT: setl %cl +; X86-NEXT: sete %ch +; X86-NEXT: andb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload ; X86-NEXT: xorl %esi, %esi -; X86-NEXT: orb %ch, %bl +; X86-NEXT: orb %cl, %ch ; X86-NEXT: cmovnel %esi, %eax ; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 ; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -678,49 +687,50 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: imull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: mull %esi +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull %edi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: adcl %edi, %edx -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebp, %esi +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: subl %esi, %ebx +; X86-NEXT: movl %edi, %esi ; X86-NEXT: sbbl $0, %esi ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %ebp, %esi -; X86-NEXT: cmovnsl %edx, %ecx -; X86-NEXT: movl %ecx, %edx +; X86-NEXT: cmovnsl %edi, %esi +; X86-NEXT: cmovnsl %edx, %ebx +; X86-NEXT: movl %ebx, %edx ; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %esi, %edi -; X86-NEXT: sbbl $0, %edi +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %esi, %edi -; X86-NEXT: cmovnsl %ecx, %edx +; X86-NEXT: cmovnsl %esi, %ecx +; X86-NEXT: cmovnsl %ebx, %edx ; X86-NEXT: shrdl $31, %edx, %eax -; X86-NEXT: shrdl $31, %edi, %edx -; X86-NEXT: cmpl $1073741824, %edi # imm = 0x40000000 -; X86-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF -; X86-NEXT: cmovgel %ecx, %edx -; X86-NEXT: movl $-1, %ecx -; X86-NEXT: cmovgel %ecx, %eax -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpl $-1073741824, %edi # imm = 0xC0000000 -; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: shrdl $31, %ecx, %edx +; X86-NEXT: cmpl $1073741824, %ecx # imm = 0x40000000 +; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF +; X86-NEXT: cmovgel %esi, %edx +; X86-NEXT: movl $-1, %esi +; X86-NEXT: cmovgel %esi, %eax +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: cmpl $-1073741824, %ecx # imm = 0xC0000000 +; X86-NEXT: cmovll %esi, %eax ; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 ; X86-NEXT: cmovll %ecx, %edx ; X86-NEXT: popl %esi Index: llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll =================================================================== --- llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll +++ llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll @@ -170,30 +170,30 @@ ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ebx ; X86-NEXT: sarl $31, %ebx -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: imull {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: imull %ebx, %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl %esi, %edx +; X86-NEXT: imull %ebx, %ecx +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: imull {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ecx, %edx +; X86-NEXT: addl %edi, %edx ; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: imull %edi, %ebp +; X86-NEXT: imull %ecx, %ebp ; X86-NEXT: addl %edx, %ebp ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ecx @@ -203,6 +203,7 @@ ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %edi, %ebx +; X86-NEXT: movl %esi, %edx ; X86-NEXT: adcl $0, %edx ; X86-NEXT: addl %ecx, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill @@ -456,28 +457,28 @@ ; X64-NEXT: imulq %rsi, %rcx ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rcx, %rdx ; X64-NEXT: imulq %rdi, %r8 ; X64-NEXT: addq %rdx, %r8 -; X64-NEXT: movq %rdi, %rbp -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Reload -; X64-NEXT: imulq %r10, %rbp +; X64-NEXT: movq %rdi, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 ## 8-byte Reload +; X64-NEXT: imulq %r9, %rcx ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload -; X64-NEXT: mulq %rbx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp ## 8-byte Reload +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rbp, %rdx +; X64-NEXT: addq %rcx, %rdx ; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: imulq %rbx, %rcx -; X64-NEXT: movq %rbx, %rax +; X64-NEXT: imulq %rbp, %rcx +; X64-NEXT: movq %rbp, %rax ; X64-NEXT: addq %rdx, %rcx -; X64-NEXT: addq %r9, %r13 +; X64-NEXT: addq %rbx, %r13 ; X64-NEXT: adcq %r8, %rcx ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: addq %rbx, %r8 @@ -1247,24 +1248,24 @@ ; X86-NEXT: adcl %edx, %edi ; X86-NEXT: addl %esi, %ebp ; X86-NEXT: adcl %ecx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %ebx, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: imull %ebx, %ecx -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %ebx, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded 
Reload -; X86-NEXT: movl %eax, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull %ebx, %eax -; X86-NEXT: addl %edx, %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull %ebx, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull %ebx, %ecx +; X86-NEXT: addl %edx, %ecx ; X86-NEXT: movl (%esp), %edx ## 4-byte Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi @@ -1407,16 +1408,15 @@ ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %edi, %eax ; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: imull %ebp, %esi -; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %esi, %edx ; X86-NEXT: imull %ebp, %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: imull %ebp, %edi +; X86-NEXT: addl %edx, %edi ; X86-NEXT: movl %ebp, %ecx ; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ebp, %eax Index: llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll =================================================================== --- llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -3136,34 +3136,34 @@ ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10] ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] -; X86-SSE-NEXT: movaps (%esi), %xmm0 # encoding: [0x0f,0x28,0x06] -; X86-SSE-NEXT: movaps (%edx), %xmm1 # encoding: [0x0f,0x28,0x0a] -; X86-SSE-NEXT: movaps (%ecx), %xmm2 # encoding: [0x0f,0x28,0x11] +; X86-SSE-NEXT: movaps (%esi), %xmm1 # encoding: [0x0f,0x28,0x0e] +; X86-SSE-NEXT: movaps (%edx), %xmm2 # encoding: [0x0f,0x28,0x12] +; X86-SSE-NEXT: movaps (%ecx), %xmm0 # encoding: [0x0f,0x28,0x01] ; X86-SSE-NEXT: movaps (%eax), %xmm3 # encoding: [0x0f,0x28,0x18] -; X86-SSE-NEXT: movaps %xmm0, %xmm4 # encoding: [0x0f,0x28,0xe0] -; X86-SSE-NEXT: unpcklps %xmm1, %xmm4 # encoding: [0x0f,0x14,0xe1] -; X86-SSE-NEXT: # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; X86-SSE-NEXT: movaps %xmm2, %xmm5 # encoding: [0x0f,0x28,0xea] +; X86-SSE-NEXT: movaps %xmm1, %xmm4 # encoding: [0x0f,0x28,0xe1] +; X86-SSE-NEXT: unpcklps %xmm2, %xmm4 # encoding: [0x0f,0x14,0xe2] +; X86-SSE-NEXT: # xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; X86-SSE-NEXT: movaps %xmm0, %xmm5 # encoding: [0x0f,0x28,0xe8] ; X86-SSE-NEXT: unpcklps %xmm3, %xmm5 # encoding: [0x0f,0x14,0xeb] ; X86-SSE-NEXT: # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; X86-SSE-NEXT: unpckhps %xmm1, %xmm0 # encoding: [0x0f,0x15,0xc1] -; X86-SSE-NEXT: # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-SSE-NEXT: unpckhps 
%xmm3, %xmm2 # encoding: [0x0f,0x15,0xd3] -; X86-SSE-NEXT: # xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; X86-SSE-NEXT: movaps %xmm4, %xmm1 # encoding: [0x0f,0x28,0xcc] -; X86-SSE-NEXT: movlhps %xmm5, %xmm1 # encoding: [0x0f,0x16,0xcd] -; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm5[0] +; X86-SSE-NEXT: unpckhps %xmm2, %xmm1 # encoding: [0x0f,0x15,0xca] +; X86-SSE-NEXT: # xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: unpckhps %xmm3, %xmm0 # encoding: [0x0f,0x15,0xc3] +; X86-SSE-NEXT: # xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; X86-SSE-NEXT: movaps %xmm4, %xmm2 # encoding: [0x0f,0x28,0xd4] +; X86-SSE-NEXT: movlhps %xmm5, %xmm2 # encoding: [0x0f,0x16,0xd5] +; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm5[0] ; X86-SSE-NEXT: movhlps %xmm4, %xmm5 # encoding: [0x0f,0x12,0xec] ; X86-SSE-NEXT: # xmm5 = xmm4[1],xmm5[1] -; X86-SSE-NEXT: movaps %xmm0, %xmm3 # encoding: [0x0f,0x28,0xd8] -; X86-SSE-NEXT: movlhps %xmm2, %xmm3 # encoding: [0x0f,0x16,0xda] -; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm2[0] -; X86-SSE-NEXT: movhlps %xmm0, %xmm2 # encoding: [0x0f,0x12,0xd0] -; X86-SSE-NEXT: # xmm2 = xmm0[1],xmm2[1] -; X86-SSE-NEXT: movaps %xmm1, (%esi) # encoding: [0x0f,0x29,0x0e] +; X86-SSE-NEXT: movaps %xmm1, %xmm3 # encoding: [0x0f,0x28,0xd9] +; X86-SSE-NEXT: movlhps %xmm0, %xmm3 # encoding: [0x0f,0x16,0xd8] +; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm0[0] +; X86-SSE-NEXT: movhlps %xmm1, %xmm0 # encoding: [0x0f,0x12,0xc1] +; X86-SSE-NEXT: # xmm0 = xmm1[1],xmm0[1] +; X86-SSE-NEXT: movaps %xmm2, (%esi) # encoding: [0x0f,0x29,0x16] ; X86-SSE-NEXT: movaps %xmm5, (%edx) # encoding: [0x0f,0x29,0x2a] ; X86-SSE-NEXT: movaps %xmm3, (%ecx) # encoding: [0x0f,0x29,0x19] -; X86-SSE-NEXT: movaps %xmm2, (%eax) # encoding: [0x0f,0x29,0x10] +; X86-SSE-NEXT: movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00] ; X86-SSE-NEXT: popl %esi # encoding: [0x5e] ; X86-SSE-NEXT: retl # encoding: [0xc3] ; Index: llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll =================================================================== --- llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -3343,59 +3343,59 @@ ; X86-SSE-NEXT: punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8] ; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c] -; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x10] ; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] -; X86-SSE-NEXT: punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0] -; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X86-SSE-NEXT: punpcklwd %xmm1, %xmm2 # encoding: [0x66,0x0f,0x61,0xd1] -; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x14] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x10] ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] +; X86-SSE-NEXT: punpcklbw %xmm2, %xmm0 # encoding: [0x66,0x0f,0x60,0xc2] +; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X86-SSE-NEXT: punpcklwd %xmm1, %xmm0 
# encoding: [0x66,0x0f,0x61,0xc1] +; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x14] +; X86-SSE-NEXT: movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x18] -; X86-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] -; X86-SSE-NEXT: punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8] -; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] +; X86-SSE-NEXT: punpcklbw %xmm1, %xmm2 # encoding: [0x66,0x0f,0x60,0xd1] +; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x1c] -; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] +; X86-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x20] ; X86-SSE-NEXT: movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8] -; X86-SSE-NEXT: punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8] -; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X86-SSE-NEXT: punpcklwd %xmm3, %xmm1 # encoding: [0x66,0x0f,0x61,0xcb] -; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; X86-SSE-NEXT: punpckldq %xmm2, %xmm1 # encoding: [0x66,0x0f,0x62,0xca] -; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE-NEXT: punpcklbw %xmm3, %xmm1 # encoding: [0x66,0x0f,0x60,0xcb] +; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; X86-SSE-NEXT: punpcklwd %xmm2, %xmm1 # encoding: [0x66,0x0f,0x61,0xca] +; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: punpckldq %xmm0, %xmm1 # encoding: [0x66,0x0f,0x62,0xc8] +; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x24] ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x28] -; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] -; X86-SSE-NEXT: punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0] -; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x2c] -; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x30] ; X86-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] ; X86-SSE-NEXT: punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8] ; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X86-SSE-NEXT: punpcklwd %xmm2, %xmm3 # encoding: [0x66,0x0f,0x61,0xda] -; X86-SSE-NEXT: # xmm3 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x34] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x2c] ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x38] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x30] ; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] ; X86-SSE-NEXT: punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0] ; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; X86-SSE-NEXT: punpcklwd %xmm3, %xmm2 # encoding: [0x66,0x0f,0x61,0xd3] +; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x34] +; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x38] +; X86-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] +; X86-SSE-NEXT: punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8] +; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x3c] ; X86-SSE-NEXT: movd %eax, %xmm4 # encoding: [0x66,0x0f,0x6e,0xe0] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x40] ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X86-SSE-NEXT: punpcklbw %xmm4, %xmm0 # encoding: [0x66,0x0f,0x60,0xc4] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd %xmm2, %xmm0 # encoding: [0x66,0x0f,0x61,0xc2] -; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: punpckldq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x62,0xc3] -; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X86-SSE-NEXT: punpcklwd %xmm3, %xmm0 # encoding: [0x66,0x0f,0x61,0xc3] +; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; X86-SSE-NEXT: punpckldq %xmm2, %xmm0 # encoding: [0x66,0x0f,0x62,0xc2] +; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X86-SSE-NEXT: punpcklqdq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc1] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0] ; X86-SSE-NEXT: retl # encoding: [0xc3] @@ -3775,9 +3775,9 @@ ; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x08] ; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] ; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x0c] -; X86-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x10] ; X86-SSE-NEXT: movd %eax, %xmm4 # encoding: [0x66,0x0f,0x6e,0xe0] +; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x10] +; X86-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] ; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x14] ; X86-SSE-NEXT: movd %eax, %xmm5 # encoding: [0x66,0x0f,0x6e,0xe8] ; X86-SSE-NEXT: movzwl 
{{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x18] @@ -3788,18 +3788,18 @@ ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X86-SSE-NEXT: punpcklwd %xmm1, %xmm2 # encoding: [0x66,0x0f,0x61,0xd1] ; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; X86-SSE-NEXT: punpcklwd %xmm3, %xmm4 # encoding: [0x66,0x0f,0x61,0xe3] -; X86-SSE-NEXT: # xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; X86-SSE-NEXT: punpckldq %xmm2, %xmm4 # encoding: [0x66,0x0f,0x62,0xe2] -; X86-SSE-NEXT: # xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; X86-SSE-NEXT: punpcklwd %xmm4, %xmm3 # encoding: [0x66,0x0f,0x61,0xdc] +; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X86-SSE-NEXT: punpckldq %xmm2, %xmm3 # encoding: [0x66,0x0f,0x62,0xda] +; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; X86-SSE-NEXT: punpcklwd %xmm5, %xmm6 # encoding: [0x66,0x0f,0x61,0xf5] ; X86-SSE-NEXT: # xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; X86-SSE-NEXT: punpcklwd %xmm7, %xmm0 # encoding: [0x66,0x0f,0x61,0xc7] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; X86-SSE-NEXT: punpckldq %xmm6, %xmm0 # encoding: [0x66,0x0f,0x62,0xc6] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; X86-SSE-NEXT: punpcklqdq %xmm4, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc4] -; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm4[0] +; X86-SSE-NEXT: punpcklqdq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc3] +; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0] ; X86-SSE-NEXT: retl # encoding: [0xc3] ; ; X86-AVX1-LABEL: test_mm_set_epi16: @@ -4760,59 +4760,59 @@ ; X86-SSE-NEXT: punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8] ; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x38] -; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x34] ; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] -; X86-SSE-NEXT: punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0] -; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X86-SSE-NEXT: punpcklwd %xmm1, %xmm2 # encoding: [0x66,0x0f,0x61,0xd1] -; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x30] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x34] ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] +; X86-SSE-NEXT: punpcklbw %xmm2, %xmm0 # encoding: [0x66,0x0f,0x60,0xc2] +; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X86-SSE-NEXT: punpcklwd %xmm1, %xmm0 # encoding: [0x66,0x0f,0x61,0xc1] +; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x30] +; X86-SSE-NEXT: movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x2c] -; X86-SSE-NEXT: movd %eax, %xmm3 # encoding: 
[0x66,0x0f,0x6e,0xd8] -; X86-SSE-NEXT: punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8] -; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] +; X86-SSE-NEXT: punpcklbw %xmm1, %xmm2 # encoding: [0x66,0x0f,0x60,0xd1] +; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x28] -; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] +; X86-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x24] ; X86-SSE-NEXT: movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8] -; X86-SSE-NEXT: punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8] -; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X86-SSE-NEXT: punpcklwd %xmm3, %xmm1 # encoding: [0x66,0x0f,0x61,0xcb] -; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; X86-SSE-NEXT: punpckldq %xmm2, %xmm1 # encoding: [0x66,0x0f,0x62,0xca] -; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE-NEXT: punpcklbw %xmm3, %xmm1 # encoding: [0x66,0x0f,0x60,0xcb] +; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; X86-SSE-NEXT: punpcklwd %xmm2, %xmm1 # encoding: [0x66,0x0f,0x61,0xca] +; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: punpckldq %xmm0, %xmm1 # encoding: [0x66,0x0f,0x62,0xc8] +; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x20] ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x1c] -; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] -; X86-SSE-NEXT: punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0] -; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x18] -; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x14] ; X86-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] ; X86-SSE-NEXT: punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8] ; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X86-SSE-NEXT: punpcklwd %xmm2, %xmm3 # encoding: [0x66,0x0f,0x61,0xda] -; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x10] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x18] ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] -; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c] +; X86-SSE-NEXT: movzbl 
{{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x14] ; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] ; X86-SSE-NEXT: punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0] ; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; X86-SSE-NEXT: punpcklwd %xmm3, %xmm2 # encoding: [0x66,0x0f,0x61,0xd3] +; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x10] +; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c] +; X86-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] +; X86-SSE-NEXT: punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8] +; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-SSE-NEXT: movd %eax, %xmm4 # encoding: [0x66,0x0f,0x6e,0xe0] ; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X86-SSE-NEXT: punpcklbw %xmm4, %xmm0 # encoding: [0x66,0x0f,0x60,0xc4] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd %xmm2, %xmm0 # encoding: [0x66,0x0f,0x61,0xc2] -; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: punpckldq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x62,0xc3] -; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X86-SSE-NEXT: punpcklwd %xmm3, %xmm0 # encoding: [0x66,0x0f,0x61,0xc3] +; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; X86-SSE-NEXT: punpckldq %xmm2, %xmm0 # encoding: [0x66,0x0f,0x62,0xc2] +; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X86-SSE-NEXT: punpcklqdq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc1] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0] ; X86-SSE-NEXT: retl # encoding: [0xc3] @@ -5192,9 +5192,9 @@ ; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x1c] ; X86-SSE-NEXT: movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0] ; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x18] -; X86-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x14] ; X86-SSE-NEXT: movd %eax, %xmm4 # encoding: [0x66,0x0f,0x6e,0xe0] +; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x14] +; X86-SSE-NEXT: movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8] ; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x10] ; X86-SSE-NEXT: movd %eax, %xmm5 # encoding: [0x66,0x0f,0x6e,0xe8] ; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x0c] @@ -5205,18 +5205,18 @@ ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X86-SSE-NEXT: punpcklwd %xmm1, %xmm2 # encoding: [0x66,0x0f,0x61,0xd1] ; X86-SSE-NEXT: # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; X86-SSE-NEXT: punpcklwd %xmm3, %xmm4 # encoding: [0x66,0x0f,0x61,0xe3] -; X86-SSE-NEXT: # xmm4 
= xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; X86-SSE-NEXT: punpckldq %xmm2, %xmm4 # encoding: [0x66,0x0f,0x62,0xe2] -; X86-SSE-NEXT: # xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; X86-SSE-NEXT: punpcklwd %xmm4, %xmm3 # encoding: [0x66,0x0f,0x61,0xdc] +; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X86-SSE-NEXT: punpckldq %xmm2, %xmm3 # encoding: [0x66,0x0f,0x62,0xda] +; X86-SSE-NEXT: # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; X86-SSE-NEXT: punpcklwd %xmm5, %xmm6 # encoding: [0x66,0x0f,0x61,0xf5] ; X86-SSE-NEXT: # xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; X86-SSE-NEXT: punpcklwd %xmm7, %xmm0 # encoding: [0x66,0x0f,0x61,0xc7] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; X86-SSE-NEXT: punpckldq %xmm6, %xmm0 # encoding: [0x66,0x0f,0x62,0xc6] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; X86-SSE-NEXT: punpcklqdq %xmm4, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc4] -; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm4[0] +; X86-SSE-NEXT: punpcklqdq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc3] +; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm3[0] ; X86-SSE-NEXT: retl # encoding: [0xc3] ; ; X86-AVX1-LABEL: test_mm_setr_epi16: Index: llvm/test/CodeGen/X86/sshl_sat.ll =================================================================== --- llvm/test/CodeGen/X86/sshl_sat.ll +++ llvm/test/CodeGen/X86/sshl_sat.ll @@ -76,26 +76,24 @@ ; ; X86-LABEL: func2: ; X86: # %bb.0: -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-NEXT: movsbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi -; X86-NEXT: sarl %cl, %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %dx, %dx -; X86-NEXT: sets %al -; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %di, %dx -; X86-NEXT: cmovel %esi, %eax -; X86-NEXT: cwtl +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl %eax, %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movswl %dx, %esi +; X86-NEXT: sarl %cl, %esi +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: testw %ax, %ax +; X86-NEXT: sets %cl +; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: cmpw %si, %ax +; X86-NEXT: cmovel %edx, %ecx +; X86-NEXT: movswl %cx, %eax ; X86-NEXT: shrl %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl %x2 = sext i8 %x to i15 %y2 = sext i8 %y to i15 @@ -128,28 +126,26 @@ ; ; X86-LABEL: func3: ; X86: # %bb.0: -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $7, %ecx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi +; X86-NEXT: addl %eax, %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movswl %dx, %esi ; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: sarl %cl, %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %dx, %dx -; X86-NEXT: sets %al -; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %di, %dx -; X86-NEXT: cmovel %esi, %eax -; X86-NEXT: cwtl +; X86-NEXT: sarl %cl, %esi +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: testw %ax, %ax +; X86-NEXT: sets %cl +; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: cmpw %si, %ax +; X86-NEXT: cmovel %edx, %ecx +; X86-NEXT: 
movswl %cx, %eax ; X86-NEXT: shrl %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl %y2 = sext i8 %y to i15 %y3 = shl i15 %y2, 7 @@ -230,38 +226,38 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: shll %cl, %ebp -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: xorl %edx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: testb $32, %cl -; X86-NEXT: cmovnel %ebp, %ebx -; X86-NEXT: cmovnel %edx, %ebp -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: sarl %cl, %edx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: sarl $31, %edi +; X86-NEXT: cmovnel %ebx, %edi +; X86-NEXT: cmovnel %eax, %ebx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: sarl %cl, %eax +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: sarl $31, %ebp ; X86-NEXT: testb $32, %cl -; X86-NEXT: cmovel %edx, %edi -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: shrdl %cl, %ebx, %esi +; X86-NEXT: cmovel %eax, %ebp +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: shrdl %cl, %edi, %esi ; X86-NEXT: testb $32, %cl -; X86-NEXT: cmovnel %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %ecx, %edi -; X86-NEXT: xorl %eax, %esi +; X86-NEXT: cmovnel %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: xorl %eax, %ebp +; X86-NEXT: xorl %edx, %esi ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: testl %eax, %eax ; X86-NEXT: movl $-1, %eax ; X86-NEXT: movl $0, %ecx ; X86-NEXT: cmovsl %ecx, %eax ; X86-NEXT: sets %dl ; X86-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF -; X86-NEXT: orl %edi, %esi -; X86-NEXT: cmovel %ebp, %eax -; X86-NEXT: cmovel %ebx, %edx +; X86-NEXT: orl %ebp, %esi +; X86-NEXT: cmovel %ebx, %eax +; X86-NEXT: cmovel %edi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx Index: llvm/test/CodeGen/X86/sshl_sat_vec.ll =================================================================== --- llvm/test/CodeGen/X86/sshl_sat_vec.ll +++ llvm/test/CodeGen/X86/sshl_sat_vec.ll @@ -85,28 +85,28 @@ ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: sarl %cl, %ebp +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: shll %cl, %ebp +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: sarl %cl, %edi ; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: testl %edx, %edx ; X86-NEXT: sets %bl ; X86-NEXT: addl $2147483647, %ebx # imm = 0x7FFFFFFF -; X86-NEXT: cmpl %ebp, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmovel %edi, %ebx -; X86-NEXT: movl %ebp, %edi +; X86-NEXT: cmpl %edi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmovel %ebp, %ebx +; X86-NEXT: movl %edi, %ebp ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: shll %cl, %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: sarl %cl, %eax ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: testl %edi, %edi ; X86-NEXT: sets %dl ; X86-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF -; X86-NEXT: cmpl %eax, %ebp -; X86-NEXT: cmovel %edi, %edx +; X86-NEXT: cmpl %eax, %edi +; X86-NEXT: cmovel %ebp, %edx ; X86-NEXT: movl %esi, %edi ; X86-NEXT: movb 
{{[0-9]+}}(%esp), %cl ; X86-NEXT: shll %cl, %edi Index: llvm/test/CodeGen/X86/ssub_sat.ll =================================================================== --- llvm/test/CodeGen/X86/ssub_sat.ll +++ llvm/test/CodeGen/X86/ssub_sat.ll @@ -39,23 +39,21 @@ ; X86-LABEL: func2: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: seto %bl -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $31, %eax ; X86-NEXT: testb %bl, %bl -; X86-NEXT: cmovel %ecx, %eax +; X86-NEXT: cmovel %edx, %eax ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: testl %esi, %esi +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setns %dl ; X86-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF ; X86-NEXT: testb %bl, %bl -; X86-NEXT: cmovel %esi, %edx -; X86-NEXT: popl %esi +; X86-NEXT: cmovel %ecx, %edx ; X86-NEXT: popl %ebx ; X86-NEXT: retl ; Index: llvm/test/CodeGen/X86/ssub_sat_plus.ll =================================================================== --- llvm/test/CodeGen/X86/ssub_sat_plus.ll +++ llvm/test/CodeGen/X86/ssub_sat_plus.ll @@ -41,23 +41,21 @@ ; X86-LABEL: func64: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: seto %bl -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $31, %eax ; X86-NEXT: testb %bl, %bl -; X86-NEXT: cmovel %ecx, %eax +; X86-NEXT: cmovel %edx, %eax ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: testl %esi, %esi +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setns %dl ; X86-NEXT: addl $2147483647, %edx # imm = 0x7FFFFFFF ; X86-NEXT: testb %bl, %bl -; X86-NEXT: cmovel %esi, %edx -; X86-NEXT: popl %esi +; X86-NEXT: cmovel %ecx, %edx ; X86-NEXT: popl %ebx ; X86-NEXT: retl ; Index: llvm/test/CodeGen/X86/stack-align-memcpy.ll =================================================================== --- llvm/test/CodeGen/X86/stack-align-memcpy.ll +++ llvm/test/CodeGen/X86/stack-align-memcpy.ll @@ -27,10 +27,10 @@ ; CHECK-NEXT: subl %edx, %eax ; CHECK-NEXT: movl %eax, %esp ; CHECK-NEXT: subl $4, %esp -; CHECK-NEXT: movl 84(%ecx), %edx +; CHECK-NEXT: movl 84(%ecx), %edi +; CHECK-NEXT: movl 80(%ecx), %ebx +; CHECK-NEXT: movl 76(%ecx), %edx ; CHECK-NEXT: movl %edx, 68(%esi) ## 4-byte Spill -; CHECK-NEXT: movl 80(%ecx), %edi -; CHECK-NEXT: movl 76(%ecx), %ebx ; CHECK-NEXT: movl 72(%ecx), %edx ; CHECK-NEXT: movl %edx, 64(%esi) ## 4-byte Spill ; CHECK-NEXT: movl 68(%ecx), %edx @@ -68,9 +68,9 @@ ; CHECK-NEXT: movl (%ecx), %edx ; CHECK-NEXT: movl %edx, 72(%esi) ## 4-byte Spill ; CHECK-NEXT: movl 4(%ecx), %ecx -; CHECK-NEXT: pushl 68(%esi) ## 4-byte Folded Reload ; CHECK-NEXT: pushl %edi ; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: pushl 68(%esi) ## 4-byte Folded Reload ; CHECK-NEXT: pushl 64(%esi) ## 4-byte Folded Reload ; CHECK-NEXT: pushl 60(%esi) ## 4-byte Folded Reload ; CHECK-NEXT: pushl 56(%esi) ## 4-byte Folded Reload @@ -119,10 +119,10 @@ ; CHECK-NEXT: movl 12(%ebp), %edi ; CHECK-NEXT: movl 8(%ebp), %eax ; CHECK-NEXT: subl $4, %esp -; CHECK-NEXT: movl 84(%eax), %ecx +; CHECK-NEXT: 
movl 84(%eax), %edx +; CHECK-NEXT: movl 80(%eax), %ebx +; CHECK-NEXT: movl 76(%eax), %ecx ; CHECK-NEXT: movl %ecx, 68(%esi) ## 4-byte Spill -; CHECK-NEXT: movl 80(%eax), %edx -; CHECK-NEXT: movl 76(%eax), %ebx ; CHECK-NEXT: movl 72(%eax), %ecx ; CHECK-NEXT: movl %ecx, 64(%esi) ## 4-byte Spill ; CHECK-NEXT: movl 68(%eax), %ecx @@ -160,9 +160,9 @@ ; CHECK-NEXT: movl (%eax), %ecx ; CHECK-NEXT: movl %ecx, 72(%esi) ## 4-byte Spill ; CHECK-NEXT: movl 4(%eax), %eax -; CHECK-NEXT: pushl 68(%esi) ## 4-byte Folded Reload ; CHECK-NEXT: pushl %edx ; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: pushl 68(%esi) ## 4-byte Folded Reload ; CHECK-NEXT: pushl 64(%esi) ## 4-byte Folded Reload ; CHECK-NEXT: pushl 60(%esi) ## 4-byte Folded Reload ; CHECK-NEXT: pushl 56(%esi) ## 4-byte Folded Reload Index: llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll =================================================================== --- llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll +++ llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll @@ -53,9 +53,9 @@ ; CHECK-PREG: renamable $rbx = COPY $r9 ; CHECK-PREG: MOV64mr %stack.6, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.6) -; CHECK-PREG: renamable $r15 = COPY $rcx -; CHECK-PREG: renamable $r12 = COPY $rdx -; CHECK-PREG: renamable $r14 = COPY $rsi +; CHECK-PREG: renamable $r12 = COPY $rcx +; CHECK-PREG: renamable $r14 = COPY $rdx +; CHECK-PREG: renamable $r15 = COPY $rsi ; CHECK-PREG: renamable $r13 = COPY $rdi ; CHECK-PREG: renamable $rax = MOV64rm %fixed-stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.11, align 16) ; CHECK-PREG: MOV64mr %stack.7, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.7) @@ -80,38 +80,38 @@ ; CHECK-PREG: MOV64mr %stack.9, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.9) ; CHECK-PREG: renamable $rax = MOV64rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.0) ; CHECK-PREG: MOV64mr %stack.10, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.10) -; CHECK-PREG: renamable $rbp, renamable $rbx, renamable $r15, renamable $r12, renamable $r14, renamable $r13 = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 18, 1, 8, %stack.10, 0, 1, 8, %stack.9, 0, 1, 8, %stack.8, 0, 1, 8, %stack.0, 0, 1, 8, %stack.1, 0, 1, 8, %stack.2, 0, 1, 8, %stack.3, 0, 1, 8, %stack.4, 0, 1, 8, %stack.5, 0, 1, 8, %stack.11, 0, killed renamable $rbp(tied-def 0), 1, 8, %stack.7, 0, killed renamable $rbx(tied-def 1), 1, 8, %stack.6, 0, killed renamable $r15(tied-def 2), killed renamable $r12(tied-def 3), killed renamable $r14(tied-def 4), killed renamable $r13(tied-def 5), 2, 0, 2, 18, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store (s64) on %stack.0), (load store (s64) on %stack.1), (load store (s64) on %stack.2), (load store (s64) on %stack.3), (load store (s64) on %stack.4), (load store (s64) on %stack.5), (load store (s64) on %stack.6), (load store (s64) on %stack.7), (load store (s64) on %stack.8), (load store (s64) on %stack.9), (load store (s64) on %stack.10), (load store (s64) on %stack.11) +; CHECK-PREG: renamable $rbp, renamable $rbx, renamable $r12, renamable $r14, renamable $r15, renamable $r13 = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 18, 1, 8, %stack.10, 0, 1, 8, %stack.9, 0, 1, 8, %stack.8, 0, 1, 8, %stack.0, 0, 1, 8, %stack.1, 0, 1, 8, %stack.2, 0, 1, 8, %stack.3, 0, 1, 8, 
%stack.4, 0, 1, 8, %stack.5, 0, 1, 8, %stack.11, 0, killed renamable $rbp(tied-def 0), 1, 8, %stack.7, 0, killed renamable $rbx(tied-def 1), 1, 8, %stack.6, 0, killed renamable $r12(tied-def 2), killed renamable $r14(tied-def 3), killed renamable $r15(tied-def 4), killed renamable $r13(tied-def 5), 2, 0, 2, 18, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store (s64) on %stack.0), (load store (s64) on %stack.1), (load store (s64) on %stack.2), (load store (s64) on %stack.3), (load store (s64) on %stack.4), (load store (s64) on %stack.5), (load store (s64) on %stack.6), (load store (s64) on %stack.7), (load store (s64) on %stack.8), (load store (s64) on %stack.9), (load store (s64) on %stack.10), (load store (s64) on %stack.11) ; CHECK-PREG: renamable $eax = MOV32rm killed renamable $r13, 1, $noreg, 4, $noreg :: (load (s32) from %ir.gep00, addrspace 1) -; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r14, 1, $noreg, 8, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep01, addrspace 1) -; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r12, 1, $noreg, 12, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep02, addrspace 1) -; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r15, 1, $noreg, 16, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep03, addrspace 1) -; CHECK-PREG: renamable $rdx = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) -; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdx, 1, $noreg, 20, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep04, addrspace 1) +; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r15, 1, $noreg, 8, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep01, addrspace 1) +; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r14, 1, $noreg, 12, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep02, addrspace 1) +; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $r12, 1, $noreg, 16, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep03, addrspace 1) +; CHECK-PREG: renamable $rdi = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) +; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 20, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep04, addrspace 1) ; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rbx, 1, $noreg, 24, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep05, addrspace 1) -; CHECK-PREG: renamable $rdx = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7) -; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdx, 1, $noreg, 28, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep06, addrspace 1) +; CHECK-PREG: renamable $rdi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7) +; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 28, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep06, addrspace 1) ; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rbp, 1, $noreg, 32, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep07, addrspace 1) ; 
CHECK-PREG: renamable $rcx = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11) ; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 36, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep08, addrspace 1) -; CHECK-PREG: renamable $rdx = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5) -; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdx, 1, $noreg, 40, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep09, addrspace 1) -; CHECK-PREG: renamable $rdx = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4) -; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdx, 1, $noreg, 44, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep10, addrspace 1) -; CHECK-PREG: renamable $rdx = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3) -; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdx, 1, $noreg, 48, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep11, addrspace 1) -; CHECK-PREG: renamable $rdx = MOV64rm %stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %stack.2) -; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdx, 1, $noreg, 52, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep12, addrspace 1) -; CHECK-PREG: renamable $rdx = MOV64rm %stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %stack.1) -; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdx, 1, $noreg, 56, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep13, addrspace 1) -; CHECK-PREG: renamable $rdx = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0) -; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdx, 1, $noreg, 60, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep14, addrspace 1) -; CHECK-PREG: renamable $rdx = MOV64rm %stack.8, 1, $noreg, 0, $noreg :: (load (s64) from %stack.8) -; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdx, 1, $noreg, 64, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep15, addrspace 1) +; CHECK-PREG: renamable $rdi = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5) +; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 40, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep09, addrspace 1) +; CHECK-PREG: renamable $rdi = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4) +; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 44, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep10, addrspace 1) +; CHECK-PREG: renamable $rdi = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3) +; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 48, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep11, addrspace 1) +; CHECK-PREG: renamable $rdi = MOV64rm %stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %stack.2) +; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 52, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep12, addrspace 1) +; CHECK-PREG: renamable $rdi = MOV64rm %stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %stack.1) +; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, 
$noreg, 56, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep13, addrspace 1) +; CHECK-PREG: renamable $rdi = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0) +; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 60, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep14, addrspace 1) +; CHECK-PREG: renamable $rsi = MOV64rm %stack.8, 1, $noreg, 0, $noreg :: (load (s64) from %stack.8) +; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rsi, 1, $noreg, 64, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep15, addrspace 1) ; CHECK-PREG: renamable $rdx = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9) ; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdx, 1, $noreg, 68, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep16, addrspace 1) ; CHECK-PREG: renamable $rcx = MOV64rm %stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %stack.10) ; CHECK-PREG: renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 72, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep17, addrspace 1) - + %token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @func, i32 0, i32 0, i32 0, i32 0) [ "gc-live"(i32 addrspace(1)* %arg00, i32 addrspace(1)* %arg01, i32 addrspace(1)* %arg02, i32 addrspace(1)* %arg03, i32 addrspace(1)* %arg04, i32 addrspace(1)* %arg05, i32 addrspace(1)* %arg06, i32 addrspace(1)* %arg07, i32 addrspace(1)* %arg08, i32 addrspace(1)* %arg09, i32 addrspace(1)* %arg10, i32 addrspace(1)* %arg11, i32 addrspace(1)* %arg12, i32 addrspace(1)* %arg13, i32 addrspace(1)* %arg14, i32 addrspace(1)* %arg15, i32 addrspace(1)* %arg16, i32 addrspace(1)* %arg17) ] %rel00 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %token, i32 0, i32 0) ; (%arg00, %arg00) Index: llvm/test/CodeGen/X86/subvector-broadcast.ll =================================================================== --- llvm/test/CodeGen/X86/subvector-broadcast.ll +++ llvm/test/CodeGen/X86/subvector-broadcast.ll @@ -803,24 +803,24 @@ define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) { ; X86-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64: ; X86-AVX1: # %bb.0: # %entry -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,2,0] -; X86-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm4 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,0,2,0] +; X86-AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm3 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [3,0,4,0] ; X86-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 ; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [1,0,2,0,3,0,4,0] ; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 ; X86-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm7 -; X86-AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 ; X86-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm5 -; X86-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; X86-AVX1-NEXT: vandps %ymm6, %ymm1, %ymm1 ; X86-AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2 ; X86-AVX1-NEXT: vmovdqu %xmm0, ga4+16 -; X86-AVX1-NEXT: vmovdqu %xmm4, ga4 +; X86-AVX1-NEXT: vmovdqu %xmm3, ga4 ; X86-AVX1-NEXT: vmovups %ymm2, gb4+32 ; X86-AVX1-NEXT: vmovups %ymm1, gb4 ; X86-AVX1-NEXT: vzeroupper Index: 
llvm/test/CodeGen/X86/uadd_sat.ll =================================================================== --- llvm/test/CodeGen/X86/uadd_sat.ll +++ llvm/test/CodeGen/X86/uadd_sat.ll @@ -126,23 +126,23 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $-1, %ebx -; X86-NEXT: cmovbl %ebx, %edi -; X86-NEXT: addl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovbl %ebx, %esi +; X86-NEXT: cmovbl %ebx, %ecx ; X86-NEXT: addl {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmovbl %ebx, %edx -; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovbl %ebx, %ecx -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: addl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovbl %ebx, %esi +; X86-NEXT: addl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmovbl %ebx, %edi +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx Index: llvm/test/CodeGen/X86/udiv_fix_sat.ll =================================================================== --- llvm/test/CodeGen/X86/udiv_fix_sat.ll +++ llvm/test/CodeGen/X86/udiv_fix_sat.ll @@ -392,16 +392,16 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: setb %al -; X86-NEXT: shldl $31, %ecx, %eax -; X86-NEXT: shll $31, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: addl %eax, %eax +; X86-NEXT: setb %cl +; X86-NEXT: shldl $31, %eax, %ecx +; X86-NEXT: shll $31, %eax ; X86-NEXT: pushl $0 ; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax ; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %eax ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -444,32 +444,32 @@ ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: cmpl $2, %edx -; X86-NEXT: movl $-1, %ecx -; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: movl $-1, %esi +; X86-NEXT: cmovael %esi, %eax ; X86-NEXT: cmpl $1, %edx -; X86-NEXT: movl $1, %esi -; X86-NEXT: cmovael %esi, %edx +; X86-NEXT: movl $1, %ecx +; X86-NEXT: cmovael %ecx, %edx ; X86-NEXT: shldl $31, %eax, %edx ; X86-NEXT: cmpl $2, %edi -; X86-NEXT: cmovael %ecx, %ebx +; X86-NEXT: cmovael %esi, %ebx ; X86-NEXT: cmpl $1, %edi -; X86-NEXT: cmovael %esi, %edi +; X86-NEXT: cmovael %ecx, %edi ; X86-NEXT: shldl $31, %ebx, %edi ; X86-NEXT: cmpl $2, %ebp ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: cmovael %esi, %eax ; X86-NEXT: cmpl $1, %ebp -; X86-NEXT: cmovael %esi, %ebp +; X86-NEXT: cmovael %ecx, %ebp ; X86-NEXT: shldl $31, %eax, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: cmpl $2, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: cmovael %esi, %eax ; X86-NEXT: cmpl $1, %ebx -; X86-NEXT: cmovbl %ebx, %esi -; X86-NEXT: shldl $31, %eax, 
%esi +; X86-NEXT: cmovbl %ebx, %ecx +; X86-NEXT: shldl $31, %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %esi, 12(%eax) +; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: movl %ebp, 8(%eax) ; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: movl %edx, (%eax) Index: llvm/test/CodeGen/X86/umax.ll =================================================================== --- llvm/test/CodeGen/X86/umax.ll +++ llvm/test/CodeGen/X86/umax.ll @@ -154,50 +154,50 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl %ebx, %edx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: cmoval %edx, %eax -; X86-NEXT: cmpl %esi, %ecx -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: cmoval %edx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %ecx, %esi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cmoval %esi, %eax +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: cmoval %esi, %ebp ; X86-NEXT: cmovel %eax, %ebp -; X86-NEXT: movl %esi, %eax -; X86-NEXT: cmoval %ecx, %eax +; X86-NEXT: movl %edx, %eax +; X86-NEXT: cmoval %edi, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sbbl %edi, %ecx -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebx, %edi ; X86-NEXT: xorl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: cmovel %ebp, %ebx -; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl %eax, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: cmoval %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: xorl %esi, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: cmovel %ebp, %ecx +; X86-NEXT: cmovel (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmpl %esi, %ebx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: cmoval %ebx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmpl %edx, %ebp -; X86-NEXT: cmoval %edi, %eax -; X86-NEXT: cmovel %ecx, %eax -; X86-NEXT: cmoval %ebp, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %edx, 12(%ecx) -; X86-NEXT: movl %eax, 8(%ecx) -; X86-NEXT: movl %esi, 4(%ecx) -; X86-NEXT: movl %ebx, (%ecx) -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cmpl %eax, %ebp +; X86-NEXT: cmoval %ebx, %esi +; X86-NEXT: cmovel %edi, %esi +; X86-NEXT: cmoval %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %eax, 12(%edi) +; X86-NEXT: movl %esi, 8(%edi) +; X86-NEXT: movl %edx, 4(%edi) +; X86-NEXT: movl %ecx, (%edi) +; X86-NEXT: movl %edi, %eax ; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi Index: llvm/test/CodeGen/X86/umin.ll =================================================================== --- llvm/test/CodeGen/X86/umin.ll +++ 
llvm/test/CodeGen/X86/umin.ll @@ -157,47 +157,46 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmpl %ecx, %edi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cmovbl %edi, %eax +; X86-NEXT: cmpl %esi, %ebp ; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: cmovbl %eax, %ebx -; X86-NEXT: cmpl %esi, %edi -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: cmovbl %eax, %ebp -; X86-NEXT: cmovel %ebx, %ebp +; X86-NEXT: cmovbl %edi, %ebx +; X86-NEXT: cmovel %eax, %ebx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: cmovbl %edi, %eax +; X86-NEXT: cmovbl %ebp, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: cmpl %edx, %edi ; X86-NEXT: movl %edx, %eax ; X86-NEXT: cmovbl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: sbbl %edi, %ebp ; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: xorl %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %edx, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: cmovel %ebp, %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: xorl %edi, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: xorl %edx, %eax +; X86-NEXT: orl %ebp, %eax +; X86-NEXT: cmovel %ebx, %ecx ; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edi, %eax ; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: cmovbl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %eax, 12(%edi) -; X86-NEXT: movl %edx, 8(%edi) -; X86-NEXT: movl %esi, 4(%edi) -; X86-NEXT: movl %ecx, (%edi) -; X86-NEXT: movl %edi, %eax +; X86-NEXT: cmovbl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi Index: llvm/test/CodeGen/X86/umul-with-overflow.ll =================================================================== --- llvm/test/CodeGen/X86/umul-with-overflow.ll +++ llvm/test/CodeGen/X86/umul-with-overflow.ll @@ -92,7 +92,7 @@ ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi @@ -105,7 +105,7 @@ ; X86-NEXT: movl %ecx, %ebx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: adcl %edi, %esi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -141,12 +141,12 @@ ; X86-NEXT: addl %ebx, %esi ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; 
X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax @@ -175,13 +175,13 @@ ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %ecx ; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %edi # 4-byte Folded Reload ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload -; X86-NEXT: setb (%esp) # 1-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi @@ -200,16 +200,16 @@ ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: adcl %edi, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: setb (%esp) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %esi, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X86-NEXT: movzbl (%esp), %esi # 1-byte Folded Reload ; X86-NEXT: adcl %esi, %edx ; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl %ecx, %ebp ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NEXT: adcl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edx @@ -220,7 +220,8 @@ ; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebx @@ -229,75 +230,76 @@ ; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %ebp -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: 
movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzbl %bl, %ecx ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl %ecx, %ebp -; X86-NEXT: setb %cl +; X86-NEXT: adcl %esi, %ebp +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %ebp, %esi -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload @@ -365,7 +367,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi ; 
X86-NEXT: movl %edx, %ebx @@ -377,7 +379,7 @@ ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: adcl %ebx, %edi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -406,44 +408,43 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: adcl %edi, %ebp ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebp, %esi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edi ; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %ecx, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: imull %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: imull {{[0-9]+}}(%esp), %esi -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: adcl %edi, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -451,66 +452,66 @@ ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; 
X86-NEXT: imull %edx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edx ; X86-NEXT: movl %eax, %edi ; X86-NEXT: addl %edx, %ebp -; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx -; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: imull {{[0-9]+}}(%esp), %esi +; X86-NEXT: addl %ebp, %esi ; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: adcl %ebx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: imull %edx, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: imull {{[0-9]+}}(%esp), %esi +; X86-NEXT: addl %ecx, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: imull %edx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %edx ; X86-NEXT: addl %edx, %ebp -; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: addl %esi, %eax -; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: addl %edi, %eax -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, (%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, 4(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, 8(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, 12(%ecx) -; X86-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-NEXT: movl %edx, 16(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, 20(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, 24(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, 28(%ecx) -; X86-NEXT: movl %eax, 32(%ecx) -; X86-NEXT: andl $4095, %ebx # imm = 0xFFF -; X86-NEXT: movw %bx, 36(%ecx) -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, (%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, 4(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, 8(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, 12(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, 16(%edx) +; X86-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-NEXT: movl %esi, 20(%edx) +; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, 24(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, 28(%edx) +; X86-NEXT: movl %eax, 32(%edx) +; X86-NEXT: andl $4095, %ecx # imm = 0xFFF +; X86-NEXT: movw %cx, 36(%edx) +; X86-NEXT: movl %edx, %eax ; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi Index: llvm/test/CodeGen/X86/umul_fix.ll =================================================================== --- llvm/test/CodeGen/X86/umul_fix.ll +++ llvm/test/CodeGen/X86/umul_fix.ll @@ -43,27 +43,27 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: addl %ebx, %ebp ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: adcl %edi, %edx -; X86-NEXT: imull {{[0-9]+}}(%esp), %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: shldl $30, %eax, %esi -; X86-NEXT: shldl $30, %ecx, %eax -; X86-NEXT: movl %esi, %edx +; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: shldl $30, %eax, %ecx +; X86-NEXT: shldl $30, %esi, %eax +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -122,28 +122,28 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: shldl $30, %eax, %ebp +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shldl $30, %eax, %esi ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: shldl $30, %eax, %ebx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: shldl $30, %eax, %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: shldl $30, %eax, %ebp +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: shldl $30, %eax, %edx ; X86-NEXT: movl %edx, 12(%ecx) -; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl %ebp, 8(%ecx) ; X86-NEXT: movl %ebx, 4(%ecx) -; X86-NEXT: movl %ebp, (%ecx) +; X86-NEXT: movl %esi, (%ecx) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -267,23 +267,23 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebp ; X86-NEXT: addl %edx, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: adcl $0, 
%esi +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %ebp ; X86-NEXT: addl %ebx, %eax -; X86-NEXT: adcl %edi, %edx -; X86-NEXT: imull {{[0-9]+}}(%esp), %esi -; X86-NEXT: addl %esi, %edx +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -308,24 +308,24 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: mull %ebx +; X86-NEXT: addl %edx, %edi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: adcl %edi, %edx +; X86-NEXT: mull %ebx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: adcl %esi, %edx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: addl %ebp, %edx ; X86-NEXT: adcl $0, %ecx @@ -355,31 +355,30 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: adcl %edi, %edx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl %ebx, %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %edi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx Index: llvm/test/CodeGen/X86/umul_fix_sat.ll =================================================================== --- llvm/test/CodeGen/X86/umul_fix_sat.ll +++ llvm/test/CodeGen/X86/umul_fix_sat.ll @@ -52,32 +52,32 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: addl %esi, %ebp ; X86-NEXT: 
adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %esi +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: adcl %edi, %edx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl %esi, %edx -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: adcl $0, %esi ; X86-NEXT: shrdl $2, %eax, %ecx ; X86-NEXT: shrdl $2, %edx, %eax ; X86-NEXT: shrl $2, %edx -; X86-NEXT: orl %ebx, %edx +; X86-NEXT: orl %esi, %edx ; X86-NEXT: movl $-1, %edx ; X86-NEXT: cmovnel %edx, %ecx ; X86-NEXT: cmovel %eax, %edx @@ -195,8 +195,8 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %esi @@ -204,26 +204,26 @@ ; X86-NEXT: cmpl $4, %edx ; X86-NEXT: movl $-1, %ecx ; X86-NEXT: cmovael %ecx, %esi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: shrdl $2, %edx, %ebp -; X86-NEXT: cmpl $4, %edx -; X86-NEXT: cmovael %ecx, %ebp ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: shrdl $2, %edx, %ebx ; X86-NEXT: cmpl $4, %edx ; X86-NEXT: cmovael %ecx, %ebx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: shrdl $2, %edx, %ebp +; X86-NEXT: cmpl $4, %edx +; X86-NEXT: cmovael %ecx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: shrdl $2, %edx, %eax ; X86-NEXT: cmpl $4, %edx ; X86-NEXT: cmovael %ecx, %eax ; X86-NEXT: movl %eax, 12(%edi) -; X86-NEXT: movl %ebx, 8(%edi) -; X86-NEXT: movl %ebp, 4(%edi) +; X86-NEXT: movl %ebp, 8(%edi) +; X86-NEXT: movl %ebx, 4(%edi) ; X86-NEXT: movl %esi, (%edi) ; X86-NEXT: movl %edi, %eax ; X86-NEXT: popl %esi @@ -279,29 +279,31 @@ ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: testl %esi, %esi ; X86-NEXT: setne %dl ; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %cl -; X86-NEXT: andb %dl, %cl -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: setne %bl +; X86-NEXT: andb %dl, %bl +; X86-NEXT: mull %ebp ; X86-NEXT: movl %eax, %edi -; X86-NEXT: seto %bl +; X86-NEXT: seto %bh ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebp +; X86-NEXT: mull %ecx +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: seto %ch -; X86-NEXT: orb %bl, %ch +; X86-NEXT: seto %cl +; X86-NEXT: orb %bh, %cl ; X86-NEXT: addl %edi, %esi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: mull %ebp ; X86-NEXT: addl %esi, %edx -; X86-NEXT: setb %bl -; X86-NEXT: orb %ch, %bl -; X86-NEXT: orb %cl, %bl +; X86-NEXT: setb %ch +; X86-NEXT: orb %cl, %ch +; X86-NEXT: orb %bl, %ch ; X86-NEXT: movl $-1, %ecx ; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: cmovnel %ecx, %edx @@ -393,28 +395,28 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: movl $-1, %esi -; X86-NEXT: cmovol %esi, %ebp +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl $-1, %edi +; X86-NEXT: cmovol %edi, %esi ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: cmovol %esi, %ebx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: cmovol %edi, %ebx +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %edi -; X86-NEXT: cmovol %esi, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: cmovol %edi, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: cmovol %esi, %eax +; X86-NEXT: cmovol %edi, %eax ; X86-NEXT: movl %eax, 12(%ecx) -; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl %ebp, 8(%ecx) ; X86-NEXT: movl %ebx, 4(%ecx) -; X86-NEXT: movl %ebp, (%ecx) +; X86-NEXT: movl %esi, (%ecx) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -443,31 +445,30 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: adcl %edi, %edx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl %ebp, %edx -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpl $1, %ebx +; X86-NEXT: cmpl $1, %edi ; X86-NEXT: sbbl %ecx, %ecx ; X86-NEXT: notl %ecx ; X86-NEXT: orl %ecx, %eax @@ -500,24 +501,24 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ebp, %eax -; X86-NEXT: adcl %edi, %edx +; X86-NEXT: adcl %esi, %edx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: addl %ebx, %edx ; X86-NEXT: adcl $0, %ecx Index: llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll 
=================================================================== --- llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll +++ llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll @@ -16,25 +16,27 @@ ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: testl %esi, %esi ; X86-NEXT: setne %dl ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %bl ; X86-NEXT: andb %dl, %bl -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: mull %ebp ; X86-NEXT: movl %eax, %edi -; X86-NEXT: seto %cl +; X86-NEXT: seto %bh ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebp +; X86-NEXT: mull %ecx +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: seto %ch -; X86-NEXT: orb %cl, %ch +; X86-NEXT: orb %bh, %ch ; X86-NEXT: addl %edi, %esi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: mull %ebp ; X86-NEXT: addl %esi, %edx ; X86-NEXT: setb %cl ; X86-NEXT: orb %ch, %cl Index: llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll =================================================================== --- llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll +++ llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll @@ -993,16 +993,16 @@ ; CHECK-BASELINE-NEXT: notl %r14d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r14w ; CHECK-BASELINE-NEXT: orl %ebx, %r14d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: andw %r11w, %bx +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-BASELINE-NEXT: andw %r11w, %di ; CHECK-BASELINE-NEXT: notl %r11d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r11w -; CHECK-BASELINE-NEXT: orl %ebx, %r11d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: andw %r10w, %bx +; CHECK-BASELINE-NEXT: orl %edi, %r11d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-BASELINE-NEXT: andw %r10w, %di ; CHECK-BASELINE-NEXT: notl %r10d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w -; CHECK-BASELINE-NEXT: orl %ebx, %r10d +; CHECK-BASELINE-NEXT: orl %edi, %r10d ; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebx ; CHECK-BASELINE-NEXT: andl %ebx, %r9d ; CHECK-BASELINE-NEXT: notl %ebx @@ -1055,16 +1055,16 @@ ; CHECK-SSE1-NEXT: notl %r14d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r14w ; CHECK-SSE1-NEXT: orl %ebx, %r14d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: andw %r11w, %bx +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-SSE1-NEXT: andw %r11w, %di ; CHECK-SSE1-NEXT: notl %r11d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r11w -; CHECK-SSE1-NEXT: orl %ebx, %r11d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: andw %r10w, %bx +; CHECK-SSE1-NEXT: orl %edi, %r11d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-SSE1-NEXT: andw %r10w, %di ; CHECK-SSE1-NEXT: notl %r10d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w -; CHECK-SSE1-NEXT: orl %ebx, %r10d +; CHECK-SSE1-NEXT: orl %edi, %r10d ; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebx ; CHECK-SSE1-NEXT: andl %ebx, %r9d ; CHECK-SSE1-NEXT: notl %ebx @@ -1320,7 +1320,7 @@ ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movq %rcx, %r15 -; CHECK-BASELINE-NEXT: movq %rsi, %r14 +; CHECK-BASELINE-NEXT: movq %rsi, %r12 ; 
CHECK-BASELINE-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-BASELINE-NEXT: movb 16(%rcx), %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill @@ -1332,7 +1332,7 @@ ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movb 20(%rcx), %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 21(%rcx), %r12b +; CHECK-BASELINE-NEXT: movb 21(%rcx), %r14b ; CHECK-BASELINE-NEXT: movb 22(%rcx), %r9b ; CHECK-BASELINE-NEXT: movb 23(%rcx), %r10b ; CHECK-BASELINE-NEXT: movb 24(%rcx), %r11b @@ -1343,94 +1343,94 @@ ; CHECK-BASELINE-NEXT: movb 29(%rcx), %sil ; CHECK-BASELINE-NEXT: movb 30(%rcx), %bl ; CHECK-BASELINE-NEXT: movb 31(%rcx), %al -; CHECK-BASELINE-NEXT: movb 31(%r14), %cl +; CHECK-BASELINE-NEXT: movb 31(%r12), %cl ; CHECK-BASELINE-NEXT: andb %al, %cl ; CHECK-BASELINE-NEXT: notb %al ; CHECK-BASELINE-NEXT: andb 31(%rdx), %al ; CHECK-BASELINE-NEXT: orb %cl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 30(%r14), %al +; CHECK-BASELINE-NEXT: movb 30(%r12), %al ; CHECK-BASELINE-NEXT: andb %bl, %al ; CHECK-BASELINE-NEXT: notb %bl ; CHECK-BASELINE-NEXT: andb 30(%rdx), %bl ; CHECK-BASELINE-NEXT: orb %al, %bl ; CHECK-BASELINE-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 29(%r14), %al +; CHECK-BASELINE-NEXT: movb 29(%r12), %al ; CHECK-BASELINE-NEXT: andb %sil, %al ; CHECK-BASELINE-NEXT: notb %sil ; CHECK-BASELINE-NEXT: andb 29(%rdx), %sil ; CHECK-BASELINE-NEXT: orb %al, %sil ; CHECK-BASELINE-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 28(%r14), %al +; CHECK-BASELINE-NEXT: movb 28(%r12), %al ; CHECK-BASELINE-NEXT: andb %dil, %al ; CHECK-BASELINE-NEXT: notb %dil ; CHECK-BASELINE-NEXT: andb 28(%rdx), %dil ; CHECK-BASELINE-NEXT: orb %al, %dil ; CHECK-BASELINE-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 27(%r14), %al +; CHECK-BASELINE-NEXT: movb 27(%r12), %al ; CHECK-BASELINE-NEXT: andb %r8b, %al ; CHECK-BASELINE-NEXT: notb %r8b ; CHECK-BASELINE-NEXT: andb 27(%rdx), %r8b ; CHECK-BASELINE-NEXT: orb %al, %r8b ; CHECK-BASELINE-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 26(%r14), %al +; CHECK-BASELINE-NEXT: movb 26(%r12), %al ; CHECK-BASELINE-NEXT: andb %r13b, %al ; CHECK-BASELINE-NEXT: notb %r13b ; CHECK-BASELINE-NEXT: andb 26(%rdx), %r13b ; CHECK-BASELINE-NEXT: orb %al, %r13b ; CHECK-BASELINE-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 25(%r14), %al +; CHECK-BASELINE-NEXT: movb 25(%r12), %al ; CHECK-BASELINE-NEXT: andb %bpl, %al ; CHECK-BASELINE-NEXT: notb %bpl ; CHECK-BASELINE-NEXT: andb 25(%rdx), %bpl ; CHECK-BASELINE-NEXT: orb %al, %bpl ; CHECK-BASELINE-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 24(%r14), %al +; CHECK-BASELINE-NEXT: movb 24(%r12), %al ; CHECK-BASELINE-NEXT: andb %r11b, %al ; CHECK-BASELINE-NEXT: notb %r11b ; CHECK-BASELINE-NEXT: andb 24(%rdx), %r11b ; CHECK-BASELINE-NEXT: orb %al, %r11b ; CHECK-BASELINE-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 23(%r14), %al +; CHECK-BASELINE-NEXT: movb 23(%r12), %al ; CHECK-BASELINE-NEXT: andb %r10b, %al ; CHECK-BASELINE-NEXT: notb %r10b ; CHECK-BASELINE-NEXT: andb 23(%rdx), %r10b ; CHECK-BASELINE-NEXT: orb %al, %r10b ; CHECK-BASELINE-NEXT: movb %r10b, 
{{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 22(%r14), %al +; CHECK-BASELINE-NEXT: movb 22(%r12), %al ; CHECK-BASELINE-NEXT: andb %r9b, %al ; CHECK-BASELINE-NEXT: notb %r9b ; CHECK-BASELINE-NEXT: andb 22(%rdx), %r9b ; CHECK-BASELINE-NEXT: orb %al, %r9b ; CHECK-BASELINE-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 21(%r14), %al -; CHECK-BASELINE-NEXT: andb %r12b, %al -; CHECK-BASELINE-NEXT: notb %r12b -; CHECK-BASELINE-NEXT: andb 21(%rdx), %r12b -; CHECK-BASELINE-NEXT: orb %al, %r12b -; CHECK-BASELINE-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 20(%r14), %al +; CHECK-BASELINE-NEXT: movb 21(%r12), %al +; CHECK-BASELINE-NEXT: andb %r14b, %al +; CHECK-BASELINE-NEXT: notb %r14b +; CHECK-BASELINE-NEXT: andb 21(%rdx), %r14b +; CHECK-BASELINE-NEXT: orb %al, %r14b +; CHECK-BASELINE-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movb 20(%r12), %al ; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload ; CHECK-BASELINE-NEXT: andb %cl, %al ; CHECK-BASELINE-NEXT: notb %cl ; CHECK-BASELINE-NEXT: andb 20(%rdx), %cl ; CHECK-BASELINE-NEXT: orb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 19(%r14), %al +; CHECK-BASELINE-NEXT: movb 19(%r12), %al ; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload ; CHECK-BASELINE-NEXT: andb %cl, %al ; CHECK-BASELINE-NEXT: notb %cl ; CHECK-BASELINE-NEXT: andb 19(%rdx), %cl ; CHECK-BASELINE-NEXT: orb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 18(%r14), %al +; CHECK-BASELINE-NEXT: movb 18(%r12), %al ; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload ; CHECK-BASELINE-NEXT: andb %cl, %al ; CHECK-BASELINE-NEXT: notb %cl ; CHECK-BASELINE-NEXT: andb 18(%rdx), %cl ; CHECK-BASELINE-NEXT: orb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 17(%r14), %al +; CHECK-BASELINE-NEXT: movb 17(%r12), %al ; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload ; CHECK-BASELINE-NEXT: andb %cl, %al ; CHECK-BASELINE-NEXT: notb %cl @@ -1438,7 +1438,7 @@ ; CHECK-BASELINE-NEXT: andb 17(%rdx), %cl ; CHECK-BASELINE-NEXT: orb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movb 16(%r14), %al +; CHECK-BASELINE-NEXT: movb 16(%r12), %al ; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload ; CHECK-BASELINE-NEXT: andb %cl, %al ; CHECK-BASELINE-NEXT: notb %cl @@ -1446,105 +1446,105 @@ ; CHECK-BASELINE-NEXT: orb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movb 15(%r15), %cl -; CHECK-BASELINE-NEXT: movb 15(%r14), %al +; CHECK-BASELINE-NEXT: movb 15(%r12), %al ; CHECK-BASELINE-NEXT: andb %cl, %al ; CHECK-BASELINE-NEXT: notb %cl ; CHECK-BASELINE-NEXT: andb 15(%rdx), %cl ; CHECK-BASELINE-NEXT: orb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movb 14(%r15), %cl -; CHECK-BASELINE-NEXT: movb 14(%r14), %al +; CHECK-BASELINE-NEXT: movb 14(%r12), %al ; CHECK-BASELINE-NEXT: andb %cl, %al ; CHECK-BASELINE-NEXT: notb %cl ; CHECK-BASELINE-NEXT: andb 14(%rdx), %cl ; CHECK-BASELINE-NEXT: orb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movb 13(%r15), %cl -; 
CHECK-BASELINE-NEXT: movb 13(%r14), %al +; CHECK-BASELINE-NEXT: movb 13(%r12), %al ; CHECK-BASELINE-NEXT: andb %cl, %al ; CHECK-BASELINE-NEXT: notb %cl ; CHECK-BASELINE-NEXT: andb 13(%rdx), %cl ; CHECK-BASELINE-NEXT: orb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movb 12(%r15), %cl -; CHECK-BASELINE-NEXT: movb 12(%r14), %al +; CHECK-BASELINE-NEXT: movb 12(%r12), %al ; CHECK-BASELINE-NEXT: andb %cl, %al ; CHECK-BASELINE-NEXT: notb %cl ; CHECK-BASELINE-NEXT: andb 12(%rdx), %cl ; CHECK-BASELINE-NEXT: orb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movb 11(%r15), %r13b -; CHECK-BASELINE-NEXT: movb 11(%r14), %al +; CHECK-BASELINE-NEXT: movb 11(%r12), %al ; CHECK-BASELINE-NEXT: andb %r13b, %al ; CHECK-BASELINE-NEXT: notb %r13b ; CHECK-BASELINE-NEXT: andb 11(%rdx), %r13b ; CHECK-BASELINE-NEXT: orb %al, %r13b -; CHECK-BASELINE-NEXT: movb 10(%r15), %r12b -; CHECK-BASELINE-NEXT: movb 10(%r14), %al -; CHECK-BASELINE-NEXT: andb %r12b, %al -; CHECK-BASELINE-NEXT: notb %r12b -; CHECK-BASELINE-NEXT: andb 10(%rdx), %r12b -; CHECK-BASELINE-NEXT: orb %al, %r12b +; CHECK-BASELINE-NEXT: movb 10(%r15), %r14b +; CHECK-BASELINE-NEXT: movb 10(%r12), %al +; CHECK-BASELINE-NEXT: andb %r14b, %al +; CHECK-BASELINE-NEXT: notb %r14b +; CHECK-BASELINE-NEXT: andb 10(%rdx), %r14b +; CHECK-BASELINE-NEXT: orb %al, %r14b ; CHECK-BASELINE-NEXT: movb 9(%r15), %bpl -; CHECK-BASELINE-NEXT: movb 9(%r14), %al +; CHECK-BASELINE-NEXT: movb 9(%r12), %al ; CHECK-BASELINE-NEXT: andb %bpl, %al ; CHECK-BASELINE-NEXT: notb %bpl ; CHECK-BASELINE-NEXT: andb 9(%rdx), %bpl ; CHECK-BASELINE-NEXT: orb %al, %bpl ; CHECK-BASELINE-NEXT: movb 8(%r15), %r11b -; CHECK-BASELINE-NEXT: movb 8(%r14), %al +; CHECK-BASELINE-NEXT: movb 8(%r12), %al ; CHECK-BASELINE-NEXT: andb %r11b, %al ; CHECK-BASELINE-NEXT: notb %r11b ; CHECK-BASELINE-NEXT: andb 8(%rdx), %r11b ; CHECK-BASELINE-NEXT: orb %al, %r11b ; CHECK-BASELINE-NEXT: movb 7(%r15), %r10b -; CHECK-BASELINE-NEXT: movb 7(%r14), %al +; CHECK-BASELINE-NEXT: movb 7(%r12), %al ; CHECK-BASELINE-NEXT: andb %r10b, %al ; CHECK-BASELINE-NEXT: notb %r10b ; CHECK-BASELINE-NEXT: andb 7(%rdx), %r10b ; CHECK-BASELINE-NEXT: orb %al, %r10b ; CHECK-BASELINE-NEXT: movb 6(%r15), %r9b -; CHECK-BASELINE-NEXT: movb 6(%r14), %al +; CHECK-BASELINE-NEXT: movb 6(%r12), %al ; CHECK-BASELINE-NEXT: andb %r9b, %al ; CHECK-BASELINE-NEXT: notb %r9b ; CHECK-BASELINE-NEXT: andb 6(%rdx), %r9b ; CHECK-BASELINE-NEXT: orb %al, %r9b ; CHECK-BASELINE-NEXT: movb 5(%r15), %r8b -; CHECK-BASELINE-NEXT: movb 5(%r14), %al +; CHECK-BASELINE-NEXT: movb 5(%r12), %al ; CHECK-BASELINE-NEXT: andb %r8b, %al ; CHECK-BASELINE-NEXT: notb %r8b ; CHECK-BASELINE-NEXT: andb 5(%rdx), %r8b ; CHECK-BASELINE-NEXT: orb %al, %r8b ; CHECK-BASELINE-NEXT: movb 4(%r15), %dil -; CHECK-BASELINE-NEXT: movb 4(%r14), %al +; CHECK-BASELINE-NEXT: movb 4(%r12), %al ; CHECK-BASELINE-NEXT: andb %dil, %al ; CHECK-BASELINE-NEXT: notb %dil ; CHECK-BASELINE-NEXT: andb 4(%rdx), %dil ; CHECK-BASELINE-NEXT: orb %al, %dil ; CHECK-BASELINE-NEXT: movb 3(%r15), %sil -; CHECK-BASELINE-NEXT: movb 3(%r14), %al +; CHECK-BASELINE-NEXT: movb 3(%r12), %al ; CHECK-BASELINE-NEXT: andb %sil, %al ; CHECK-BASELINE-NEXT: notb %sil ; CHECK-BASELINE-NEXT: andb 3(%rdx), %sil ; CHECK-BASELINE-NEXT: orb %al, %sil ; CHECK-BASELINE-NEXT: movb 2(%r15), %dl -; CHECK-BASELINE-NEXT: movb 2(%r14), %al +; CHECK-BASELINE-NEXT: movb 2(%r12), %al ; CHECK-BASELINE-NEXT: andb %dl, %al ; 
CHECK-BASELINE-NEXT: notb %dl ; CHECK-BASELINE-NEXT: andb 2(%rbx), %dl ; CHECK-BASELINE-NEXT: orb %al, %dl ; CHECK-BASELINE-NEXT: movb 1(%r15), %al -; CHECK-BASELINE-NEXT: movb 1(%r14), %cl +; CHECK-BASELINE-NEXT: movb 1(%r12), %cl ; CHECK-BASELINE-NEXT: andb %al, %cl ; CHECK-BASELINE-NEXT: notb %al ; CHECK-BASELINE-NEXT: andb 1(%rbx), %al ; CHECK-BASELINE-NEXT: orb %cl, %al ; CHECK-BASELINE-NEXT: movb (%r15), %r15b -; CHECK-BASELINE-NEXT: movb (%r14), %r14b -; CHECK-BASELINE-NEXT: andb %r15b, %r14b +; CHECK-BASELINE-NEXT: movb (%r12), %cl +; CHECK-BASELINE-NEXT: andb %r15b, %cl ; CHECK-BASELINE-NEXT: notb %r15b ; CHECK-BASELINE-NEXT: andb (%rbx), %r15b -; CHECK-BASELINE-NEXT: orb %r14b, %r15b +; CHECK-BASELINE-NEXT: orb %cl, %r15b ; CHECK-BASELINE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload ; CHECK-BASELINE-NEXT: movb %bl, 31(%rcx) @@ -1587,7 +1587,7 @@ ; CHECK-BASELINE-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload ; CHECK-BASELINE-NEXT: movb %bl, 12(%rcx) ; CHECK-BASELINE-NEXT: movb %r13b, 11(%rcx) -; CHECK-BASELINE-NEXT: movb %r12b, 10(%rcx) +; CHECK-BASELINE-NEXT: movb %r14b, 10(%rcx) ; CHECK-BASELINE-NEXT: movb %bpl, 9(%rcx) ; CHECK-BASELINE-NEXT: movb %r11b, 8(%rcx) ; CHECK-BASELINE-NEXT: movb %r10b, 7(%rcx) @@ -1616,7 +1616,7 @@ ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movq %rcx, %r15 -; CHECK-SSE1-NEXT: movq %rsi, %r14 +; CHECK-SSE1-NEXT: movq %rsi, %r12 ; CHECK-SSE1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-SSE1-NEXT: movb 16(%rcx), %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill @@ -1628,7 +1628,7 @@ ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movb 20(%rcx), %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 21(%rcx), %r12b +; CHECK-SSE1-NEXT: movb 21(%rcx), %r14b ; CHECK-SSE1-NEXT: movb 22(%rcx), %r9b ; CHECK-SSE1-NEXT: movb 23(%rcx), %r10b ; CHECK-SSE1-NEXT: movb 24(%rcx), %r11b @@ -1639,94 +1639,94 @@ ; CHECK-SSE1-NEXT: movb 29(%rcx), %sil ; CHECK-SSE1-NEXT: movb 30(%rcx), %bl ; CHECK-SSE1-NEXT: movb 31(%rcx), %al -; CHECK-SSE1-NEXT: movb 31(%r14), %cl +; CHECK-SSE1-NEXT: movb 31(%r12), %cl ; CHECK-SSE1-NEXT: andb %al, %cl ; CHECK-SSE1-NEXT: notb %al ; CHECK-SSE1-NEXT: andb 31(%rdx), %al ; CHECK-SSE1-NEXT: orb %cl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 30(%r14), %al +; CHECK-SSE1-NEXT: movb 30(%r12), %al ; CHECK-SSE1-NEXT: andb %bl, %al ; CHECK-SSE1-NEXT: notb %bl ; CHECK-SSE1-NEXT: andb 30(%rdx), %bl ; CHECK-SSE1-NEXT: orb %al, %bl ; CHECK-SSE1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 29(%r14), %al +; CHECK-SSE1-NEXT: movb 29(%r12), %al ; CHECK-SSE1-NEXT: andb %sil, %al ; CHECK-SSE1-NEXT: notb %sil ; CHECK-SSE1-NEXT: andb 29(%rdx), %sil ; CHECK-SSE1-NEXT: orb %al, %sil ; CHECK-SSE1-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 28(%r14), %al +; CHECK-SSE1-NEXT: movb 28(%r12), %al ; CHECK-SSE1-NEXT: andb %dil, %al ; CHECK-SSE1-NEXT: notb %dil ; CHECK-SSE1-NEXT: andb 28(%rdx), %dil ; CHECK-SSE1-NEXT: orb %al, %dil ; CHECK-SSE1-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 27(%r14), %al +; CHECK-SSE1-NEXT: movb 27(%r12), %al ; CHECK-SSE1-NEXT: andb %r8b, %al ; CHECK-SSE1-NEXT: notb %r8b ; CHECK-SSE1-NEXT: andb 27(%rdx), %r8b ; 
CHECK-SSE1-NEXT: orb %al, %r8b ; CHECK-SSE1-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 26(%r14), %al +; CHECK-SSE1-NEXT: movb 26(%r12), %al ; CHECK-SSE1-NEXT: andb %r13b, %al ; CHECK-SSE1-NEXT: notb %r13b ; CHECK-SSE1-NEXT: andb 26(%rdx), %r13b ; CHECK-SSE1-NEXT: orb %al, %r13b ; CHECK-SSE1-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 25(%r14), %al +; CHECK-SSE1-NEXT: movb 25(%r12), %al ; CHECK-SSE1-NEXT: andb %bpl, %al ; CHECK-SSE1-NEXT: notb %bpl ; CHECK-SSE1-NEXT: andb 25(%rdx), %bpl ; CHECK-SSE1-NEXT: orb %al, %bpl ; CHECK-SSE1-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 24(%r14), %al +; CHECK-SSE1-NEXT: movb 24(%r12), %al ; CHECK-SSE1-NEXT: andb %r11b, %al ; CHECK-SSE1-NEXT: notb %r11b ; CHECK-SSE1-NEXT: andb 24(%rdx), %r11b ; CHECK-SSE1-NEXT: orb %al, %r11b ; CHECK-SSE1-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 23(%r14), %al +; CHECK-SSE1-NEXT: movb 23(%r12), %al ; CHECK-SSE1-NEXT: andb %r10b, %al ; CHECK-SSE1-NEXT: notb %r10b ; CHECK-SSE1-NEXT: andb 23(%rdx), %r10b ; CHECK-SSE1-NEXT: orb %al, %r10b ; CHECK-SSE1-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 22(%r14), %al +; CHECK-SSE1-NEXT: movb 22(%r12), %al ; CHECK-SSE1-NEXT: andb %r9b, %al ; CHECK-SSE1-NEXT: notb %r9b ; CHECK-SSE1-NEXT: andb 22(%rdx), %r9b ; CHECK-SSE1-NEXT: orb %al, %r9b ; CHECK-SSE1-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 21(%r14), %al -; CHECK-SSE1-NEXT: andb %r12b, %al -; CHECK-SSE1-NEXT: notb %r12b -; CHECK-SSE1-NEXT: andb 21(%rdx), %r12b -; CHECK-SSE1-NEXT: orb %al, %r12b -; CHECK-SSE1-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 20(%r14), %al +; CHECK-SSE1-NEXT: movb 21(%r12), %al +; CHECK-SSE1-NEXT: andb %r14b, %al +; CHECK-SSE1-NEXT: notb %r14b +; CHECK-SSE1-NEXT: andb 21(%rdx), %r14b +; CHECK-SSE1-NEXT: orb %al, %r14b +; CHECK-SSE1-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movb 20(%r12), %al ; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload ; CHECK-SSE1-NEXT: andb %cl, %al ; CHECK-SSE1-NEXT: notb %cl ; CHECK-SSE1-NEXT: andb 20(%rdx), %cl ; CHECK-SSE1-NEXT: orb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 19(%r14), %al +; CHECK-SSE1-NEXT: movb 19(%r12), %al ; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload ; CHECK-SSE1-NEXT: andb %cl, %al ; CHECK-SSE1-NEXT: notb %cl ; CHECK-SSE1-NEXT: andb 19(%rdx), %cl ; CHECK-SSE1-NEXT: orb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 18(%r14), %al +; CHECK-SSE1-NEXT: movb 18(%r12), %al ; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload ; CHECK-SSE1-NEXT: andb %cl, %al ; CHECK-SSE1-NEXT: notb %cl ; CHECK-SSE1-NEXT: andb 18(%rdx), %cl ; CHECK-SSE1-NEXT: orb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 17(%r14), %al +; CHECK-SSE1-NEXT: movb 17(%r12), %al ; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload ; CHECK-SSE1-NEXT: andb %cl, %al ; CHECK-SSE1-NEXT: notb %cl @@ -1734,7 +1734,7 @@ ; CHECK-SSE1-NEXT: andb 17(%rdx), %cl ; CHECK-SSE1-NEXT: orb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movb 16(%r14), %al +; CHECK-SSE1-NEXT: movb 16(%r12), %al ; CHECK-SSE1-NEXT: movb 
{{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload ; CHECK-SSE1-NEXT: andb %cl, %al ; CHECK-SSE1-NEXT: notb %cl @@ -1742,105 +1742,105 @@ ; CHECK-SSE1-NEXT: orb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movb 15(%r15), %cl -; CHECK-SSE1-NEXT: movb 15(%r14), %al +; CHECK-SSE1-NEXT: movb 15(%r12), %al ; CHECK-SSE1-NEXT: andb %cl, %al ; CHECK-SSE1-NEXT: notb %cl ; CHECK-SSE1-NEXT: andb 15(%rdx), %cl ; CHECK-SSE1-NEXT: orb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movb 14(%r15), %cl -; CHECK-SSE1-NEXT: movb 14(%r14), %al +; CHECK-SSE1-NEXT: movb 14(%r12), %al ; CHECK-SSE1-NEXT: andb %cl, %al ; CHECK-SSE1-NEXT: notb %cl ; CHECK-SSE1-NEXT: andb 14(%rdx), %cl ; CHECK-SSE1-NEXT: orb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movb 13(%r15), %cl -; CHECK-SSE1-NEXT: movb 13(%r14), %al +; CHECK-SSE1-NEXT: movb 13(%r12), %al ; CHECK-SSE1-NEXT: andb %cl, %al ; CHECK-SSE1-NEXT: notb %cl ; CHECK-SSE1-NEXT: andb 13(%rdx), %cl ; CHECK-SSE1-NEXT: orb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movb 12(%r15), %cl -; CHECK-SSE1-NEXT: movb 12(%r14), %al +; CHECK-SSE1-NEXT: movb 12(%r12), %al ; CHECK-SSE1-NEXT: andb %cl, %al ; CHECK-SSE1-NEXT: notb %cl ; CHECK-SSE1-NEXT: andb 12(%rdx), %cl ; CHECK-SSE1-NEXT: orb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movb 11(%r15), %r13b -; CHECK-SSE1-NEXT: movb 11(%r14), %al +; CHECK-SSE1-NEXT: movb 11(%r12), %al ; CHECK-SSE1-NEXT: andb %r13b, %al ; CHECK-SSE1-NEXT: notb %r13b ; CHECK-SSE1-NEXT: andb 11(%rdx), %r13b ; CHECK-SSE1-NEXT: orb %al, %r13b -; CHECK-SSE1-NEXT: movb 10(%r15), %r12b -; CHECK-SSE1-NEXT: movb 10(%r14), %al -; CHECK-SSE1-NEXT: andb %r12b, %al -; CHECK-SSE1-NEXT: notb %r12b -; CHECK-SSE1-NEXT: andb 10(%rdx), %r12b -; CHECK-SSE1-NEXT: orb %al, %r12b +; CHECK-SSE1-NEXT: movb 10(%r15), %r14b +; CHECK-SSE1-NEXT: movb 10(%r12), %al +; CHECK-SSE1-NEXT: andb %r14b, %al +; CHECK-SSE1-NEXT: notb %r14b +; CHECK-SSE1-NEXT: andb 10(%rdx), %r14b +; CHECK-SSE1-NEXT: orb %al, %r14b ; CHECK-SSE1-NEXT: movb 9(%r15), %bpl -; CHECK-SSE1-NEXT: movb 9(%r14), %al +; CHECK-SSE1-NEXT: movb 9(%r12), %al ; CHECK-SSE1-NEXT: andb %bpl, %al ; CHECK-SSE1-NEXT: notb %bpl ; CHECK-SSE1-NEXT: andb 9(%rdx), %bpl ; CHECK-SSE1-NEXT: orb %al, %bpl ; CHECK-SSE1-NEXT: movb 8(%r15), %r11b -; CHECK-SSE1-NEXT: movb 8(%r14), %al +; CHECK-SSE1-NEXT: movb 8(%r12), %al ; CHECK-SSE1-NEXT: andb %r11b, %al ; CHECK-SSE1-NEXT: notb %r11b ; CHECK-SSE1-NEXT: andb 8(%rdx), %r11b ; CHECK-SSE1-NEXT: orb %al, %r11b ; CHECK-SSE1-NEXT: movb 7(%r15), %r10b -; CHECK-SSE1-NEXT: movb 7(%r14), %al +; CHECK-SSE1-NEXT: movb 7(%r12), %al ; CHECK-SSE1-NEXT: andb %r10b, %al ; CHECK-SSE1-NEXT: notb %r10b ; CHECK-SSE1-NEXT: andb 7(%rdx), %r10b ; CHECK-SSE1-NEXT: orb %al, %r10b ; CHECK-SSE1-NEXT: movb 6(%r15), %r9b -; CHECK-SSE1-NEXT: movb 6(%r14), %al +; CHECK-SSE1-NEXT: movb 6(%r12), %al ; CHECK-SSE1-NEXT: andb %r9b, %al ; CHECK-SSE1-NEXT: notb %r9b ; CHECK-SSE1-NEXT: andb 6(%rdx), %r9b ; CHECK-SSE1-NEXT: orb %al, %r9b ; CHECK-SSE1-NEXT: movb 5(%r15), %r8b -; CHECK-SSE1-NEXT: movb 5(%r14), %al +; CHECK-SSE1-NEXT: movb 5(%r12), %al ; CHECK-SSE1-NEXT: andb %r8b, %al ; CHECK-SSE1-NEXT: notb %r8b ; CHECK-SSE1-NEXT: andb 5(%rdx), %r8b ; CHECK-SSE1-NEXT: orb %al, %r8b ; CHECK-SSE1-NEXT: movb 4(%r15), %dil -; CHECK-SSE1-NEXT: movb 4(%r14), %al +; CHECK-SSE1-NEXT: movb 
4(%r12), %al ; CHECK-SSE1-NEXT: andb %dil, %al ; CHECK-SSE1-NEXT: notb %dil ; CHECK-SSE1-NEXT: andb 4(%rdx), %dil ; CHECK-SSE1-NEXT: orb %al, %dil ; CHECK-SSE1-NEXT: movb 3(%r15), %sil -; CHECK-SSE1-NEXT: movb 3(%r14), %al +; CHECK-SSE1-NEXT: movb 3(%r12), %al ; CHECK-SSE1-NEXT: andb %sil, %al ; CHECK-SSE1-NEXT: notb %sil ; CHECK-SSE1-NEXT: andb 3(%rdx), %sil ; CHECK-SSE1-NEXT: orb %al, %sil ; CHECK-SSE1-NEXT: movb 2(%r15), %dl -; CHECK-SSE1-NEXT: movb 2(%r14), %al +; CHECK-SSE1-NEXT: movb 2(%r12), %al ; CHECK-SSE1-NEXT: andb %dl, %al ; CHECK-SSE1-NEXT: notb %dl ; CHECK-SSE1-NEXT: andb 2(%rbx), %dl ; CHECK-SSE1-NEXT: orb %al, %dl ; CHECK-SSE1-NEXT: movb 1(%r15), %al -; CHECK-SSE1-NEXT: movb 1(%r14), %cl +; CHECK-SSE1-NEXT: movb 1(%r12), %cl ; CHECK-SSE1-NEXT: andb %al, %cl ; CHECK-SSE1-NEXT: notb %al ; CHECK-SSE1-NEXT: andb 1(%rbx), %al ; CHECK-SSE1-NEXT: orb %cl, %al ; CHECK-SSE1-NEXT: movb (%r15), %r15b -; CHECK-SSE1-NEXT: movb (%r14), %r14b -; CHECK-SSE1-NEXT: andb %r15b, %r14b +; CHECK-SSE1-NEXT: movb (%r12), %cl +; CHECK-SSE1-NEXT: andb %r15b, %cl ; CHECK-SSE1-NEXT: notb %r15b ; CHECK-SSE1-NEXT: andb (%rbx), %r15b -; CHECK-SSE1-NEXT: orb %r14b, %r15b +; CHECK-SSE1-NEXT: orb %cl, %r15b ; CHECK-SSE1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload ; CHECK-SSE1-NEXT: movb %bl, 31(%rcx) @@ -1883,7 +1883,7 @@ ; CHECK-SSE1-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %bl # 1-byte Reload ; CHECK-SSE1-NEXT: movb %bl, 12(%rcx) ; CHECK-SSE1-NEXT: movb %r13b, 11(%rcx) -; CHECK-SSE1-NEXT: movb %r12b, 10(%rcx) +; CHECK-SSE1-NEXT: movb %r14b, 10(%rcx) ; CHECK-SSE1-NEXT: movb %bpl, 9(%rcx) ; CHECK-SSE1-NEXT: movb %r11b, 8(%rcx) ; CHECK-SSE1-NEXT: movb %r10b, 7(%rcx) @@ -2751,46 +2751,45 @@ ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movl %ecx, %r10d -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r11b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r10b ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bpl ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r14b ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r15b ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r12b -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r13b -; CHECK-BASELINE-NEXT: xorb %r13b, %sil +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r11b +; CHECK-BASELINE-NEXT: xorb %r11b, %sil ; CHECK-BASELINE-NEXT: xorb %r12b, %dl -; CHECK-BASELINE-NEXT: xorb %r15b, %r10b +; CHECK-BASELINE-NEXT: xorb %r15b, %cl ; CHECK-BASELINE-NEXT: xorb %r14b, %r8b ; CHECK-BASELINE-NEXT: xorb %bpl, %r9b +; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %r13b +; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r13b ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %bl ; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %bl -; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %cl ; CHECK-BASELINE-NEXT: movb {{[0-9]+}}(%rsp), %al -; CHECK-BASELINE-NEXT: xorb %r11b, %al +; CHECK-BASELINE-NEXT: xorb %r10b, %al ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r9b ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r8b -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r10b +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %dl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %al -; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %bl -; CHECK-BASELINE-NEXT: xorb 
%r13b, %sil +; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %r13b +; CHECK-BASELINE-NEXT: xorb %r11b, %sil ; CHECK-BASELINE-NEXT: xorb %r12b, %dl -; CHECK-BASELINE-NEXT: xorb %r15b, %r10b +; CHECK-BASELINE-NEXT: xorb %r15b, %cl ; CHECK-BASELINE-NEXT: xorb %r14b, %r8b ; CHECK-BASELINE-NEXT: xorb %bpl, %r9b +; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %r13b ; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %bl -; CHECK-BASELINE-NEXT: xorb {{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: xorb %r11b, %al +; CHECK-BASELINE-NEXT: xorb %r10b, %al ; CHECK-BASELINE-NEXT: movb %al, 7(%rdi) -; CHECK-BASELINE-NEXT: movb %cl, 6(%rdi) -; CHECK-BASELINE-NEXT: movb %bl, 5(%rdi) +; CHECK-BASELINE-NEXT: movb %bl, 6(%rdi) +; CHECK-BASELINE-NEXT: movb %r13b, 5(%rdi) ; CHECK-BASELINE-NEXT: movb %r9b, 4(%rdi) ; CHECK-BASELINE-NEXT: movb %r8b, 3(%rdi) -; CHECK-BASELINE-NEXT: movb %r10b, 2(%rdi) +; CHECK-BASELINE-NEXT: movb %cl, 2(%rdi) ; CHECK-BASELINE-NEXT: movb %dl, 1(%rdi) ; CHECK-BASELINE-NEXT: movb %sil, (%rdi) ; CHECK-BASELINE-NEXT: movq %rdi, %rax @@ -2810,46 +2809,45 @@ ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movl %ecx, %r10d -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r11b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r10b ; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bpl ; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r14b ; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r15b ; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r12b -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r13b -; CHECK-SSE1-NEXT: xorb %r13b, %sil +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r11b +; CHECK-SSE1-NEXT: xorb %r11b, %sil ; CHECK-SSE1-NEXT: xorb %r12b, %dl -; CHECK-SSE1-NEXT: xorb %r15b, %r10b +; CHECK-SSE1-NEXT: xorb %r15b, %cl ; CHECK-SSE1-NEXT: xorb %r14b, %r8b ; CHECK-SSE1-NEXT: xorb %bpl, %r9b +; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %r13b +; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r13b ; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %bl ; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %bl -; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %cl ; CHECK-SSE1-NEXT: movb {{[0-9]+}}(%rsp), %al -; CHECK-SSE1-NEXT: xorb %r11b, %al +; CHECK-SSE1-NEXT: xorb %r10b, %al ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r9b ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r8b -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r10b +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %dl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %al -; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %bl -; CHECK-SSE1-NEXT: xorb %r13b, %sil +; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %r13b +; CHECK-SSE1-NEXT: xorb %r11b, %sil ; CHECK-SSE1-NEXT: xorb %r12b, %dl -; CHECK-SSE1-NEXT: xorb %r15b, %r10b +; CHECK-SSE1-NEXT: xorb %r15b, %cl ; CHECK-SSE1-NEXT: xorb %r14b, %r8b ; CHECK-SSE1-NEXT: xorb %bpl, %r9b +; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %r13b ; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %bl -; CHECK-SSE1-NEXT: xorb {{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: xorb %r11b, %al +; CHECK-SSE1-NEXT: xorb %r10b, %al ; CHECK-SSE1-NEXT: movb %al, 7(%rdi) -; CHECK-SSE1-NEXT: movb %cl, 6(%rdi) -; CHECK-SSE1-NEXT: movb %bl, 5(%rdi) +; CHECK-SSE1-NEXT: movb %bl, 6(%rdi) +; CHECK-SSE1-NEXT: movb %r13b, 5(%rdi) ; CHECK-SSE1-NEXT: movb %r9b, 4(%rdi) ; CHECK-SSE1-NEXT: movb %r8b, 3(%rdi) -; CHECK-SSE1-NEXT: movb %r10b, 2(%rdi) +; CHECK-SSE1-NEXT: movb %cl, 2(%rdi) ; CHECK-SSE1-NEXT: movb %dl, 
1(%rdi) ; CHECK-SSE1-NEXT: movb %sil, (%rdi) ; CHECK-SSE1-NEXT: movq %rdi, %rax @@ -4387,7 +4385,6 @@ ; CHECK-BASELINE-NEXT: movl 28(%rdx), %r15d ; CHECK-BASELINE-NEXT: movl 24(%rdx), %r14d ; CHECK-BASELINE-NEXT: movl 20(%rdx), %r10d -; CHECK-BASELINE-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movl 16(%rdx), %eax ; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-BASELINE-NEXT: movl 12(%rdx), %ebp @@ -4425,7 +4422,7 @@ ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload ; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload -; CHECK-BASELINE-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload +; CHECK-BASELINE-NEXT: xorl %r10d, %edx ; CHECK-BASELINE-NEXT: xorl %r14d, %eax ; CHECK-BASELINE-NEXT: xorl %r15d, %esi ; CHECK-BASELINE-NEXT: movl %esi, 28(%rdi) @@ -4456,7 +4453,6 @@ ; CHECK-SSE1-NEXT: movl 28(%rdx), %r15d ; CHECK-SSE1-NEXT: movl 24(%rdx), %r14d ; CHECK-SSE1-NEXT: movl 20(%rdx), %r10d -; CHECK-SSE1-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movl 16(%rdx), %eax ; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE1-NEXT: movl 12(%rdx), %ebp @@ -4494,7 +4490,7 @@ ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload ; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload -; CHECK-SSE1-NEXT: xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload +; CHECK-SSE1-NEXT: xorl %r10d, %edx ; CHECK-SSE1-NEXT: xorl %r14d, %eax ; CHECK-SSE1-NEXT: xorl %r15d, %esi ; CHECK-SSE1-NEXT: movl %esi, 28(%rdi) Index: llvm/test/CodeGen/X86/ushl_sat.ll =================================================================== --- llvm/test/CodeGen/X86/ushl_sat.ll +++ llvm/test/CodeGen/X86/ushl_sat.ll @@ -202,30 +202,30 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edi, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: shldl %cl, %edi, %edx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: shll %cl, %edi +; X86-NEXT: shldl %cl, %esi, %edx ; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: testb $32, %cl -; X86-NEXT: cmovnel %esi, %edx -; X86-NEXT: cmovnel %ebx, %esi +; X86-NEXT: cmovnel %edi, %edx +; X86-NEXT: cmovnel %ebx, %edi ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: shrl %cl, %ebp ; X86-NEXT: testb $32, %cl ; X86-NEXT: cmovel %ebp, %ebx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: shrdl %cl, %edx, %eax ; X86-NEXT: testb $32, %cl ; X86-NEXT: cmovnel %ebp, %eax -; X86-NEXT: xorl %edi, %eax +; X86-NEXT: xorl %esi, %eax ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: orl %eax, %ebx ; X86-NEXT: movl $-1, %eax -; X86-NEXT: cmovnel %eax, %esi +; X86-NEXT: cmovnel %eax, %edi ; X86-NEXT: cmovnel %eax, %edx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx Index: llvm/test/CodeGen/X86/ushl_sat_vec.ll =================================================================== --- llvm/test/CodeGen/X86/ushl_sat_vec.ll +++ llvm/test/CodeGen/X86/ushl_sat_vec.ll @@ -66,45 +66,46 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movb 
{{[0-9]+}}(%esp), %al ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: movb {{[0-9]+}}(%esp), %ah +; X86-NEXT: movb {{[0-9]+}}(%esp), %dl ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: shll %cl, %ebp -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: shrl %cl, %esi -; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: movl $-1, %edx -; X86-NEXT: cmovnel %edx, %ebp -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: movb %ah, %cl -; X86-NEXT: shll %cl, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, %esi -; X86-NEXT: shrl %cl, %esi -; X86-NEXT: cmpl %esi, %edi -; X86-NEXT: cmovnel %edx, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movb %ch, %cl ; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: shrl %cl, %ebp +; X86-NEXT: cmpl %ebp, %ebx +; X86-NEXT: movl $-1, %ebx +; X86-NEXT: cmovnel %ebx, %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movb %dl, %cl +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: shrl %cl, %ebp +; X86-NEXT: cmpl %ebp, %edi +; X86-NEXT: cmovnel %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: movb %ch, %cl +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: shrl %cl, %edi +; X86-NEXT: cmpl %edi, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmovnel %ebx, %edx +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: shll %cl, %ebp +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: shrl %cl, %edi ; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovnel %edx, %esi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: shrl %cl, %eax -; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnel %edx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: movl %ebp, (%eax) +; X86-NEXT: cmovnel %ebx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebp, 12(%ecx) +; X86-NEXT: movl %edx, 8(%ecx) +; X86-NEXT: movl %eax, 4(%ecx) +; X86-NEXT: movl %esi, (%ecx) +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx Index: llvm/test/CodeGen/X86/usub_sat.ll =================================================================== --- llvm/test/CodeGen/X86/usub_sat.ll +++ llvm/test/CodeGen/X86/usub_sat.ll @@ -124,23 +124,23 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovbl %ebx, %edi -; X86-NEXT: subl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovbl %ebx, %esi -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmovbl %ebx, %edx ; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmovbl %ebx, %ecx -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmovbl %ebx, %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovbl %ebx, %esi +; X86-NEXT: subl {{[0-9]+}}(%esp), 
%edi +; X86-NEXT: cmovbl %ebx, %edi +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx Index: llvm/test/CodeGen/X86/vec-strict-cmp-128.ll =================================================================== --- llvm/test/CodeGen/X86/vec-strict-cmp-128.ll +++ llvm/test/CodeGen/X86/vec-strict-cmp-128.ll @@ -118,30 +118,30 @@ ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: movaps %xmm3, %xmm5 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: movaps %xmm3, %xmm4 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-32-NEXT: movaps %xmm2, %xmm6 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-32-NEXT: ucomiss %xmm5, %xmm6 +; SSE-32-NEXT: ucomiss %xmm4, %xmm6 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: movd %edx, %xmm5 ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: cmoval %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-32-NEXT: pand %xmm4, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm4 -; SSE-32-NEXT: por %xmm4, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE-32-NEXT: pand %xmm5, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm5 +; SSE-32-NEXT: por %xmm5, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -272,30 +272,30 @@ ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovael %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: movaps %xmm3, %xmm5 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: movaps %xmm3, %xmm4 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-32-NEXT: movaps %xmm2, %xmm6 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-32-NEXT: ucomiss %xmm5, %xmm6 +; SSE-32-NEXT: ucomiss %xmm4, %xmm6 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovael %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovael %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: movd %edx, %xmm5 ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: cmovael %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-32-NEXT: pand %xmm4, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm4 -; 
SSE-32-NEXT: por %xmm4, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE-32-NEXT: pand %xmm5, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm5 +; SSE-32-NEXT: por %xmm5, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -426,30 +426,30 @@ ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: movaps %xmm2, %xmm5 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] +; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: movaps %xmm2, %xmm4 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] ; SSE-32-NEXT: movaps %xmm3, %xmm6 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-32-NEXT: ucomiss %xmm5, %xmm6 +; SSE-32-NEXT: ucomiss %xmm4, %xmm6 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: movd %edx, %xmm5 ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: cmoval %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-32-NEXT: pand %xmm4, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm4 -; SSE-32-NEXT: por %xmm4, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE-32-NEXT: pand %xmm5, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm5 +; SSE-32-NEXT: por %xmm5, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -578,30 +578,30 @@ ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovael %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: movaps %xmm2, %xmm5 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] +; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: movaps %xmm2, %xmm4 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] ; SSE-32-NEXT: movaps %xmm3, %xmm6 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-32-NEXT: ucomiss %xmm5, %xmm6 +; SSE-32-NEXT: ucomiss %xmm4, %xmm6 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovael %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovael %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: movd %edx, %xmm5 ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: cmovael %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-32-NEXT: pand %xmm4, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm4 -; SSE-32-NEXT: por %xmm4, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; 
SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE-32-NEXT: pand %xmm5, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm5 +; SSE-32-NEXT: por %xmm5, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -1023,30 +1023,30 @@ ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: movaps %xmm2, %xmm5 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] +; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: movaps %xmm2, %xmm4 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] ; SSE-32-NEXT: movaps %xmm3, %xmm6 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-32-NEXT: ucomiss %xmm5, %xmm6 +; SSE-32-NEXT: ucomiss %xmm4, %xmm6 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: movd %edx, %xmm5 ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: cmovbl %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-32-NEXT: pand %xmm4, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm4 -; SSE-32-NEXT: por %xmm4, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE-32-NEXT: pand %xmm5, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm5 +; SSE-32-NEXT: por %xmm5, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -1175,30 +1175,30 @@ ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: movaps %xmm2, %xmm5 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] +; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: movaps %xmm2, %xmm4 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] ; SSE-32-NEXT: movaps %xmm3, %xmm6 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-32-NEXT: ucomiss %xmm5, %xmm6 +; SSE-32-NEXT: ucomiss %xmm4, %xmm6 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: movd %edx, %xmm5 ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm2, %xmm3 ; SSE-32-NEXT: cmovbel %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-32-NEXT: pand %xmm4, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm4 -; SSE-32-NEXT: por %xmm4, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE-32-NEXT: pand %xmm5, %xmm0 +; 
SSE-32-NEXT: pandn %xmm1, %xmm5 +; SSE-32-NEXT: por %xmm5, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -1327,30 +1327,30 @@ ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: movaps %xmm3, %xmm5 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: movaps %xmm3, %xmm4 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-32-NEXT: movaps %xmm2, %xmm6 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-32-NEXT: ucomiss %xmm5, %xmm6 +; SSE-32-NEXT: ucomiss %xmm4, %xmm6 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: movd %edx, %xmm5 ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: cmovbl %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-32-NEXT: pand %xmm4, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm4 -; SSE-32-NEXT: por %xmm4, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE-32-NEXT: pand %xmm5, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm5 +; SSE-32-NEXT: por %xmm5, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -1481,30 +1481,30 @@ ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: movaps %xmm3, %xmm5 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-32-NEXT: movd %edx, %xmm5 +; SSE-32-NEXT: movaps %xmm3, %xmm4 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-32-NEXT: movaps %xmm2, %xmm6 ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-32-NEXT: ucomiss %xmm5, %xmm6 +; SSE-32-NEXT: ucomiss %xmm4, %xmm6 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm5 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: movd %edx, %xmm5 ; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-32-NEXT: ucomiss %xmm3, %xmm2 ; SSE-32-NEXT: cmovbel %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-32-NEXT: pand %xmm4, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm4 -; SSE-32-NEXT: por %xmm4, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE-32-NEXT: pand %xmm5, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm5 +; SSE-32-NEXT: por %xmm5, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; 
SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -1904,24 +1904,24 @@ ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm3 +; SSE-32-NEXT: movapd 8(%ebp), %xmm4 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm3, %xmm2 +; SSE-32-NEXT: ucomisd %xmm4, %xmm2 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE-32-NEXT: movd %edx, %xmm3 +; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: ucomisd %xmm3, %xmm2 +; SSE-32-NEXT: ucomisd %xmm4, %xmm2 ; SSE-32-NEXT: cmoval %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-32-NEXT: pand %xmm4, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm4 -; SSE-32-NEXT: por %xmm4, %xmm0 +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-32-NEXT: pand %xmm3, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm3 +; SSE-32-NEXT: por %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -2024,24 +2024,24 @@ ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm3 +; SSE-32-NEXT: movapd 8(%ebp), %xmm4 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm3, %xmm2 +; SSE-32-NEXT: ucomisd %xmm4, %xmm2 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovael %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE-32-NEXT: movd %edx, %xmm3 +; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: ucomisd %xmm3, %xmm2 +; SSE-32-NEXT: ucomisd %xmm4, %xmm2 ; SSE-32-NEXT: cmovael %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-32-NEXT: pand %xmm4, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm4 -; SSE-32-NEXT: por %xmm4, %xmm0 +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-32-NEXT: pand %xmm3, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm3 +; SSE-32-NEXT: por %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -2144,24 +2144,24 @@ ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm3 +; SSE-32-NEXT: movapd 8(%ebp), %xmm4 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm2, %xmm3 +; SSE-32-NEXT: ucomisd %xmm2, %xmm4 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] +; SSE-32-NEXT: movd %edx, %xmm3 +; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] -; SSE-32-NEXT: ucomisd %xmm2, %xmm3 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] +; SSE-32-NEXT: ucomisd %xmm2, %xmm4 ; SSE-32-NEXT: cmoval %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] 
-; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-32-NEXT: pand %xmm4, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm4 -; SSE-32-NEXT: por %xmm4, %xmm0 +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-32-NEXT: pand %xmm3, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm3 +; SSE-32-NEXT: por %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -2262,24 +2262,24 @@ ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm3 +; SSE-32-NEXT: movapd 8(%ebp), %xmm4 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm2, %xmm3 +; SSE-32-NEXT: ucomisd %xmm2, %xmm4 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovael %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] +; SSE-32-NEXT: movd %edx, %xmm3 +; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] -; SSE-32-NEXT: ucomisd %xmm2, %xmm3 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] +; SSE-32-NEXT: ucomisd %xmm2, %xmm4 ; SSE-32-NEXT: cmovael %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-32-NEXT: pand %xmm4, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm4 -; SSE-32-NEXT: por %xmm4, %xmm0 +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-32-NEXT: pand %xmm3, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm3 +; SSE-32-NEXT: por %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -2673,24 +2673,24 @@ ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm3 +; SSE-32-NEXT: movapd 8(%ebp), %xmm4 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm2, %xmm3 +; SSE-32-NEXT: ucomisd %xmm2, %xmm4 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] +; SSE-32-NEXT: movd %edx, %xmm3 +; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] -; SSE-32-NEXT: ucomisd %xmm2, %xmm3 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] +; SSE-32-NEXT: ucomisd %xmm2, %xmm4 ; SSE-32-NEXT: cmovbl %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-32-NEXT: pand %xmm4, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm4 -; SSE-32-NEXT: por %xmm4, %xmm0 +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-32-NEXT: pand %xmm3, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm3 +; SSE-32-NEXT: por %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -2791,24 +2791,24 @@ ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm3 +; SSE-32-NEXT: movapd 8(%ebp), %xmm4 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm2, %xmm3 +; SSE-32-NEXT: ucomisd %xmm2, %xmm4 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] +; SSE-32-NEXT: movd %edx, %xmm3 +; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = 
xmm3[0,0,1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] -; SSE-32-NEXT: ucomisd %xmm2, %xmm3 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] +; SSE-32-NEXT: ucomisd %xmm2, %xmm4 ; SSE-32-NEXT: cmovbel %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-32-NEXT: pand %xmm4, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm4 -; SSE-32-NEXT: por %xmm4, %xmm0 +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-32-NEXT: pand %xmm3, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm3 +; SSE-32-NEXT: por %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -2909,24 +2909,24 @@ ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm3 +; SSE-32-NEXT: movapd 8(%ebp), %xmm4 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm3, %xmm2 +; SSE-32-NEXT: ucomisd %xmm4, %xmm2 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE-32-NEXT: movd %edx, %xmm3 +; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: ucomisd %xmm3, %xmm2 +; SSE-32-NEXT: ucomisd %xmm4, %xmm2 ; SSE-32-NEXT: cmovbl %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-32-NEXT: pand %xmm4, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm4 -; SSE-32-NEXT: por %xmm4, %xmm0 +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-32-NEXT: pand %xmm3, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm3 +; SSE-32-NEXT: por %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -3029,24 +3029,24 @@ ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm3 +; SSE-32-NEXT: movapd 8(%ebp), %xmm4 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm3, %xmm2 +; SSE-32-NEXT: ucomisd %xmm4, %xmm2 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE-32-NEXT: movd %edx, %xmm3 +; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] ; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: ucomisd %xmm3, %xmm2 +; SSE-32-NEXT: ucomisd %xmm4, %xmm2 ; SSE-32-NEXT: cmovbel %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-32-NEXT: pand %xmm4, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm4 -; SSE-32-NEXT: por %xmm4, %xmm0 +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-32-NEXT: pand %xmm3, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm3 +; SSE-32-NEXT: por %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl Index: llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll =================================================================== --- llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll +++ 
llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll @@ -15,22 +15,22 @@ ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movaps 8(%ebp), %xmm3 +; SSE-32-NEXT: movaps 8(%ebp), %xmm4 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: comiss %xmm3, %xmm2 +; SSE-32-NEXT: comiss %xmm4, %xmm2 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmoval %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] +; SSE-32-NEXT: movd %edx, %xmm3 +; SSE-32-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-32-NEXT: comiss %xmm3, %xmm2 +; SSE-32-NEXT: comiss %xmm4, %xmm2 ; SSE-32-NEXT: cmoval %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-32-NEXT: pand %xmm4, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm4 -; SSE-32-NEXT: por %xmm4, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-32-NEXT: pand %xmm3, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm3 +; SSE-32-NEXT: por %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl @@ -198,24 +198,24 @@ ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movaps 8(%ebp), %xmm3 +; SSE-32-NEXT: movaps 8(%ebp), %xmm4 ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomiss %xmm3, %xmm2 +; SSE-32-NEXT: ucomiss %xmm4, %xmm2 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $-1, %edx ; SSE-32-NEXT: cmovnel %eax, %edx ; SSE-32-NEXT: cmovpl %eax, %edx -; SSE-32-NEXT: movd %edx, %xmm4 -; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1] +; SSE-32-NEXT: movd %edx, %xmm3 +; SSE-32-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1,1,1] ; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-32-NEXT: ucomiss %xmm3, %xmm2 +; SSE-32-NEXT: ucomiss %xmm4, %xmm2 ; SSE-32-NEXT: cmovnel %eax, %ecx ; SSE-32-NEXT: cmovpl %eax, %ecx ; SSE-32-NEXT: movd %ecx, %xmm2 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-32-NEXT: pand %xmm4, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm4 -; SSE-32-NEXT: por %xmm4, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-32-NEXT: pand %xmm3, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm3 +; SSE-32-NEXT: por %xmm3, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl Index: llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll =================================================================== --- llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll +++ llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll @@ -377,36 +377,36 @@ ; AVX512F-32-NEXT: andl $-8, %esp ; AVX512F-32-NEXT: subl $40, %esp ; AVX512F-32-NEXT: .cfi_offset %ebx, -12 -; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512F-32-NEXT: xorl %eax, %eax -; AVX512F-32-NEXT: vcomisd %xmm3, %xmm2 +; AVX512F-32-NEXT: vcomisd %xmm1, %xmm3 ; AVX512F-32-NEXT: setae %al ; AVX512F-32-NEXT: kmovw %eax, %k1 -; AVX512F-32-NEXT: vmovsd %xmm3, %xmm3, %xmm4 {%k1} {z} -; AVX512F-32-NEXT: vsubsd %xmm4, %xmm2, %xmm2 -; AVX512F-32-NEXT: vmovsd %xmm2, (%esp) +; AVX512F-32-NEXT: vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z} +; AVX512F-32-NEXT: 
vsubsd %xmm4, %xmm3, %xmm3 +; AVX512F-32-NEXT: vmovsd %xmm3, (%esp) ; AVX512F-32-NEXT: xorl %edx, %edx -; AVX512F-32-NEXT: vcomisd %xmm3, %xmm1 +; AVX512F-32-NEXT: vcomisd %xmm1, %xmm2 ; AVX512F-32-NEXT: setae %dl ; AVX512F-32-NEXT: kmovw %edx, %k1 -; AVX512F-32-NEXT: vmovsd %xmm3, %xmm3, %xmm2 {%k1} {z} -; AVX512F-32-NEXT: vsubsd %xmm2, %xmm1, %xmm1 -; AVX512F-32-NEXT: vmovsd %xmm1, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512F-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z} +; AVX512F-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 +; AVX512F-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512F-32-NEXT: xorl %ecx, %ecx -; AVX512F-32-NEXT: vcomisd %xmm3, %xmm1 +; AVX512F-32-NEXT: vcomisd %xmm1, %xmm2 ; AVX512F-32-NEXT: setae %cl ; AVX512F-32-NEXT: kmovw %ecx, %k1 -; AVX512F-32-NEXT: vmovsd %xmm3, %xmm3, %xmm2 {%k1} {z} -; AVX512F-32-NEXT: vsubsd %xmm2, %xmm1, %xmm1 -; AVX512F-32-NEXT: vmovsd %xmm1, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z} +; AVX512F-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 +; AVX512F-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: xorl %ebx, %ebx -; AVX512F-32-NEXT: vcomisd %xmm3, %xmm0 +; AVX512F-32-NEXT: vcomisd %xmm1, %xmm0 ; AVX512F-32-NEXT: setae %bl ; AVX512F-32-NEXT: kmovw %ebx, %k1 -; AVX512F-32-NEXT: vmovsd %xmm3, %xmm3, %xmm1 {%k1} {z} +; AVX512F-32-NEXT: vmovsd %xmm1, %xmm1, %xmm1 {%k1} {z} ; AVX512F-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX512F-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: fldl (%esp) @@ -470,36 +470,36 @@ ; AVX512VL-32-NEXT: andl $-8, %esp ; AVX512VL-32-NEXT: subl $40, %esp ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 -; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512VL-32-NEXT: xorl %eax, %eax -; AVX512VL-32-NEXT: vcomisd %xmm3, %xmm2 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 ; AVX512VL-32-NEXT: setae %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm3, %xmm4 {%k1} {z} -; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm2, %xmm2 -; AVX512VL-32-NEXT: vmovsd %xmm2, (%esp) +; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z} +; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vmovsd %xmm3, (%esp) ; AVX512VL-32-NEXT: xorl %edx, %edx -; AVX512VL-32-NEXT: vcomisd %xmm3, %xmm1 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 ; AVX512VL-32-NEXT: setae %dl ; AVX512VL-32-NEXT: kmovw %edx, %k1 -; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm3, %xmm2 {%k1} {z} -; AVX512VL-32-NEXT: vsubsd %xmm2, %xmm1, %xmm1 -; AVX512VL-32-NEXT: vmovsd %xmm1, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z} +; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512VL-32-NEXT: xorl %ecx, %ecx -; AVX512VL-32-NEXT: vcomisd %xmm3, %xmm1 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 ; AVX512VL-32-NEXT: setae %cl ; AVX512VL-32-NEXT: kmovw %ecx, %k1 -; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm3, %xmm2 {%k1} {z} -; AVX512VL-32-NEXT: vsubsd %xmm2, %xmm1, %xmm1 -; AVX512VL-32-NEXT: vmovsd %xmm1, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z} +; 
AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: xorl %ebx, %ebx -; AVX512VL-32-NEXT: vcomisd %xmm3, %xmm0 +; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm0 ; AVX512VL-32-NEXT: setae %bl ; AVX512VL-32-NEXT: kmovw %ebx, %k1 -; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm3, %xmm1 {%k1} {z} +; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm1 {%k1} {z} ; AVX512VL-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fldl (%esp) @@ -908,36 +908,36 @@ ; AVX512F-32-NEXT: andl $-8, %esp ; AVX512F-32-NEXT: subl $40, %esp ; AVX512F-32-NEXT: .cfi_offset %ebx, -12 -; AVX512F-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] -; AVX512F-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX512F-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512F-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX512F-32-NEXT: xorl %eax, %eax -; AVX512F-32-NEXT: vcomiss %xmm2, %xmm1 +; AVX512F-32-NEXT: vcomiss %xmm1, %xmm2 ; AVX512F-32-NEXT: setae %al ; AVX512F-32-NEXT: kmovw %eax, %k1 -; AVX512F-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z} -; AVX512F-32-NEXT: vsubss %xmm3, %xmm1, %xmm1 -; AVX512F-32-NEXT: vmovss %xmm1, (%esp) -; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512F-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z} +; AVX512F-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX512F-32-NEXT: vmovss %xmm2, (%esp) +; AVX512F-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512F-32-NEXT: xorl %edx, %edx -; AVX512F-32-NEXT: vcomiss %xmm2, %xmm1 +; AVX512F-32-NEXT: vcomiss %xmm1, %xmm2 ; AVX512F-32-NEXT: setae %dl ; AVX512F-32-NEXT: kmovw %edx, %k1 -; AVX512F-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z} -; AVX512F-32-NEXT: vsubss %xmm3, %xmm1, %xmm1 -; AVX512F-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) -; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512F-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z} +; AVX512F-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX512F-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX512F-32-NEXT: xorl %ecx, %ecx -; AVX512F-32-NEXT: vcomiss %xmm2, %xmm1 +; AVX512F-32-NEXT: vcomiss %xmm1, %xmm2 ; AVX512F-32-NEXT: setae %cl ; AVX512F-32-NEXT: kmovw %ecx, %k1 -; AVX512F-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z} -; AVX512F-32-NEXT: vsubss %xmm3, %xmm1, %xmm1 -; AVX512F-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z} +; AVX512F-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX512F-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: xorl %ebx, %ebx -; AVX512F-32-NEXT: vcomiss %xmm2, %xmm0 +; AVX512F-32-NEXT: vcomiss %xmm1, %xmm0 ; AVX512F-32-NEXT: setae %bl ; AVX512F-32-NEXT: kmovw %ebx, %k1 -; AVX512F-32-NEXT: vmovss %xmm2, %xmm2, %xmm1 {%k1} {z} +; AVX512F-32-NEXT: vmovss %xmm1, %xmm1, %xmm1 {%k1} {z} ; AVX512F-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX512F-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: flds (%esp) @@ -1001,36 +1001,36 @@ ; AVX512VL-32-NEXT: andl $-8, %esp ; AVX512VL-32-NEXT: subl $40, %esp ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 -; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] -; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: xorl %eax, %eax -; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm1 +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 ; AVX512VL-32-NEXT: setae 
%al ; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z} -; AVX512VL-32-NEXT: vsubss %xmm3, %xmm1, %xmm1 -; AVX512VL-32-NEXT: vmovss %xmm1, (%esp) -; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z} +; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovss %xmm2, (%esp) +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512VL-32-NEXT: xorl %edx, %edx -; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm1 +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 ; AVX512VL-32-NEXT: setae %dl ; AVX512VL-32-NEXT: kmovw %edx, %k1 -; AVX512VL-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z} -; AVX512VL-32-NEXT: vsubss %xmm3, %xmm1, %xmm1 -; AVX512VL-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z} +; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX512VL-32-NEXT: xorl %ecx, %ecx -; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm1 +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 ; AVX512VL-32-NEXT: setae %cl ; AVX512VL-32-NEXT: kmovw %ecx, %k1 -; AVX512VL-32-NEXT: vmovss %xmm2, %xmm2, %xmm3 {%k1} {z} -; AVX512VL-32-NEXT: vsubss %xmm3, %xmm1, %xmm1 -; AVX512VL-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z} +; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: xorl %ebx, %ebx -; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm0 +; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm0 ; AVX512VL-32-NEXT: setae %bl ; AVX512VL-32-NEXT: kmovw %ebx, %k1 -; AVX512VL-32-NEXT: vmovss %xmm2, %xmm2, %xmm1 {%k1} {z} +; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm1 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX512VL-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: flds (%esp) Index: llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll =================================================================== --- llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll +++ llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll @@ -369,14 +369,14 @@ ; NODQ-32-NEXT: .cfi_def_cfa_register %ebp ; NODQ-32-NEXT: andl $-8, %esp ; NODQ-32-NEXT: subl $128, %esp -; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; NODQ-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3] -; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm3 +; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm3 ; NODQ-32-NEXT: vmovlps %xmm3, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm2 +; NODQ-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) +; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3] +; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) @@ -384,25 +384,25 @@ ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm4, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vextractps $1, %xmm2, %eax +; NODQ-32-NEXT: vextractps $1, %xmm3, %eax ; NODQ-32-NEXT: shrl $31, %eax ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fadds 
{{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4) ; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp) ; NODQ-32-NEXT: wait -; NODQ-32-NEXT: vextractps $3, %xmm2, %eax +; NODQ-32-NEXT: vextractps $3, %xmm3, %eax ; NODQ-32-NEXT: shrl $31, %eax ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4) ; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp) ; NODQ-32-NEXT: wait -; NODQ-32-NEXT: vextractps $1, %xmm3, %eax +; NODQ-32-NEXT: vextractps $1, %xmm2, %eax ; NODQ-32-NEXT: shrl $31, %eax ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4) ; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp) ; NODQ-32-NEXT: wait -; NODQ-32-NEXT: vextractps $3, %xmm3, %eax +; NODQ-32-NEXT: vextractps $3, %xmm2, %eax ; NODQ-32-NEXT: shrl $31, %eax ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4) Index: llvm/test/CodeGen/X86/vec_shift4.ll =================================================================== --- llvm/test/CodeGen/X86/vec_shift4.ll +++ llvm/test/CodeGen/X86/vec_shift4.ll @@ -27,25 +27,26 @@ define <2 x i64> @shl2(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp { ; X86-LABEL: shl2: ; X86: # %bb.0: # %entry -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: psllw $5, %xmm1 +; X86-NEXT: movdqa %xmm1, %xmm2 +; X86-NEXT: movdqa %xmm0, %xmm1 +; X86-NEXT: psllw $5, %xmm2 ; X86-NEXT: movdqa %xmm0, %xmm3 ; X86-NEXT: psllw $4, %xmm3 ; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 -; X86-NEXT: movdqa %xmm1, %xmm0 -; X86-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; X86-NEXT: movdqa %xmm2, %xmm3 +; X86-NEXT: movdqa %xmm2, %xmm0 +; X86-NEXT: pblendvb %xmm0, %xmm3, %xmm1 +; X86-NEXT: movdqa %xmm1, %xmm3 ; X86-NEXT: psllw $2, %xmm3 ; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 -; X86-NEXT: paddb %xmm1, %xmm1 -; X86-NEXT: movdqa %xmm1, %xmm0 -; X86-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; X86-NEXT: movdqa %xmm2, %xmm3 -; X86-NEXT: paddb %xmm2, %xmm3 -; X86-NEXT: paddb %xmm1, %xmm1 -; X86-NEXT: movdqa %xmm1, %xmm0 -; X86-NEXT: pblendvb %xmm0, %xmm3, %xmm2 +; X86-NEXT: paddb %xmm2, %xmm2 +; X86-NEXT: movdqa %xmm2, %xmm0 +; X86-NEXT: pblendvb %xmm0, %xmm3, %xmm1 +; X86-NEXT: movdqa %xmm1, %xmm3 +; X86-NEXT: paddb %xmm1, %xmm3 +; X86-NEXT: paddb %xmm2, %xmm2 ; X86-NEXT: movdqa %xmm2, %xmm0 +; X86-NEXT: pblendvb %xmm0, %xmm3, %xmm1 +; X86-NEXT: movdqa %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: shl2: Index: llvm/test/CodeGen/X86/vec_smulo.ll =================================================================== --- llvm/test/CodeGen/X86/vec_smulo.ll +++ llvm/test/CodeGen/X86/vec_smulo.ll @@ -3517,23 +3517,23 @@ ; AVX512F-NEXT: pushq %rbx ; AVX512F-NEXT: subq $24, %rsp ; AVX512F-NEXT: movq %r8, %rax -; AVX512F-NEXT: movq %rcx, %r14 +; AVX512F-NEXT: movq %rcx, %r12 ; AVX512F-NEXT: movq %rdx, %rbx -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbp ; AVX512F-NEXT: movq $0, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: leaq {{[0-9]+}}(%rsp), %r8 ; AVX512F-NEXT: movq %rax, %rdx ; AVX512F-NEXT: movq %r9, %rcx ; AVX512F-NEXT: callq __muloti4@PLT -; AVX512F-NEXT: movq %rax, %r13 -; AVX512F-NEXT: movq %rdx, %rbp +; AVX512F-NEXT: movq %rax, %r14 +; AVX512F-NEXT: movq %rdx, %r15 ; AVX512F-NEXT: movq $0, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: leaq {{[0-9]+}}(%rsp), %r8 ; AVX512F-NEXT: movq %rbx, %rdi -; AVX512F-NEXT: movq %r14, %rsi +; AVX512F-NEXT: movq %r12, %rsi ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX512F-NEXT: movq %r12, %rcx +; AVX512F-NEXT: movq 
%rbp, %rcx ; AVX512F-NEXT: callq __muloti4@PLT ; AVX512F-NEXT: cmpq $0, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: setne %cl @@ -3546,10 +3546,10 @@ ; AVX512F-NEXT: korw %k0, %k1, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512F-NEXT: movq %rdx, 24(%r15) -; AVX512F-NEXT: movq %rax, 16(%r15) -; AVX512F-NEXT: movq %rbp, 8(%r15) -; AVX512F-NEXT: movq %r13, (%r15) +; AVX512F-NEXT: movq %rdx, 24(%r13) +; AVX512F-NEXT: movq %rax, 16(%r13) +; AVX512F-NEXT: movq %r15, 8(%r13) +; AVX512F-NEXT: movq %r14, (%r13) ; AVX512F-NEXT: addq $24, %rsp ; AVX512F-NEXT: popq %rbx ; AVX512F-NEXT: popq %r12 @@ -3569,23 +3569,23 @@ ; AVX512BW-NEXT: pushq %rbx ; AVX512BW-NEXT: subq $24, %rsp ; AVX512BW-NEXT: movq %r8, %rax -; AVX512BW-NEXT: movq %rcx, %r14 +; AVX512BW-NEXT: movq %rcx, %r12 ; AVX512BW-NEXT: movq %rdx, %rbx -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rbp ; AVX512BW-NEXT: movq $0, {{[0-9]+}}(%rsp) ; AVX512BW-NEXT: leaq {{[0-9]+}}(%rsp), %r8 ; AVX512BW-NEXT: movq %rax, %rdx ; AVX512BW-NEXT: movq %r9, %rcx ; AVX512BW-NEXT: callq __muloti4@PLT -; AVX512BW-NEXT: movq %rax, %r13 -; AVX512BW-NEXT: movq %rdx, %rbp +; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: movq %rdx, %r15 ; AVX512BW-NEXT: movq $0, {{[0-9]+}}(%rsp) ; AVX512BW-NEXT: leaq {{[0-9]+}}(%rsp), %r8 ; AVX512BW-NEXT: movq %rbx, %rdi -; AVX512BW-NEXT: movq %r14, %rsi +; AVX512BW-NEXT: movq %r12, %rsi ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX512BW-NEXT: movq %r12, %rcx +; AVX512BW-NEXT: movq %rbp, %rcx ; AVX512BW-NEXT: callq __muloti4@PLT ; AVX512BW-NEXT: cmpq $0, {{[0-9]+}}(%rsp) ; AVX512BW-NEXT: setne %cl @@ -3598,10 +3598,10 @@ ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512BW-NEXT: movq %rdx, 24(%r15) -; AVX512BW-NEXT: movq %rax, 16(%r15) -; AVX512BW-NEXT: movq %rbp, 8(%r15) -; AVX512BW-NEXT: movq %r13, (%r15) +; AVX512BW-NEXT: movq %rdx, 24(%r13) +; AVX512BW-NEXT: movq %rax, 16(%r13) +; AVX512BW-NEXT: movq %r15, 8(%r13) +; AVX512BW-NEXT: movq %r14, (%r13) ; AVX512BW-NEXT: addq $24, %rsp ; AVX512BW-NEXT: popq %rbx ; AVX512BW-NEXT: popq %r12 Index: llvm/test/CodeGen/X86/vec_umulo.ll =================================================================== --- llvm/test/CodeGen/X86/vec_umulo.ll +++ llvm/test/CodeGen/X86/vec_umulo.ll @@ -2971,7 +2971,7 @@ ; SSE2-NEXT: addq %rsi, %rcx ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: mulq %r8 -; SSE2-NEXT: movq %rax, %rdi +; SSE2-NEXT: movq %rax, %r8 ; SSE2-NEXT: movq %rdx, %rsi ; SSE2-NEXT: addq %rcx, %rsi ; SSE2-NEXT: setb %cl @@ -2980,33 +2980,33 @@ ; SSE2-NEXT: testq %r9, %r9 ; SSE2-NEXT: setne %al ; SSE2-NEXT: testq %r12, %r12 -; SSE2-NEXT: setne %r8b -; SSE2-NEXT: andb %al, %r8b +; SSE2-NEXT: setne %r10b +; SSE2-NEXT: andb %al, %r10b ; SSE2-NEXT: movq %r12, %rax ; SSE2-NEXT: mulq %r15 -; SSE2-NEXT: movq %rax, %rbp -; SSE2-NEXT: seto %r10b +; SSE2-NEXT: movq %rax, %rdi +; SSE2-NEXT: seto %bpl ; SSE2-NEXT: movq %r9, %rax ; SSE2-NEXT: mulq %r11 ; SSE2-NEXT: movq %rax, %rbx ; SSE2-NEXT: seto %r9b -; SSE2-NEXT: orb %r10b, %r9b -; SSE2-NEXT: addq %rbp, %rbx +; SSE2-NEXT: orb %bpl, %r9b +; SSE2-NEXT: addq %rdi, %rbx ; SSE2-NEXT: movq %r11, %rax ; SSE2-NEXT: mulq %r15 ; SSE2-NEXT: addq %rbx, %rdx ; SSE2-NEXT: setb %bl ; SSE2-NEXT: orb %r9b, %bl -; SSE2-NEXT: orb %r8b, %bl -; SSE2-NEXT: movzbl %bl, %ebp -; SSE2-NEXT: negl %ebp 
-; SSE2-NEXT: movd %ebp, %xmm1 +; SSE2-NEXT: orb %r10b, %bl +; SSE2-NEXT: movzbl %bl, %edi +; SSE2-NEXT: negl %edi +; SSE2-NEXT: movd %edi, %xmm1 ; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: negl %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movq %rax, 16(%r14) -; SSE2-NEXT: movq %rdi, (%r14) +; SSE2-NEXT: movq %r8, (%r14) ; SSE2-NEXT: movq %rdx, 24(%r14) ; SSE2-NEXT: movq %rsi, 8(%r14) ; SSE2-NEXT: popq %rbx @@ -3048,7 +3048,7 @@ ; SSSE3-NEXT: addq %rsi, %rcx ; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: mulq %r8 -; SSSE3-NEXT: movq %rax, %rdi +; SSSE3-NEXT: movq %rax, %r8 ; SSSE3-NEXT: movq %rdx, %rsi ; SSSE3-NEXT: addq %rcx, %rsi ; SSSE3-NEXT: setb %cl @@ -3057,33 +3057,33 @@ ; SSSE3-NEXT: testq %r9, %r9 ; SSSE3-NEXT: setne %al ; SSSE3-NEXT: testq %r12, %r12 -; SSSE3-NEXT: setne %r8b -; SSSE3-NEXT: andb %al, %r8b +; SSSE3-NEXT: setne %r10b +; SSSE3-NEXT: andb %al, %r10b ; SSSE3-NEXT: movq %r12, %rax ; SSSE3-NEXT: mulq %r15 -; SSSE3-NEXT: movq %rax, %rbp -; SSSE3-NEXT: seto %r10b +; SSSE3-NEXT: movq %rax, %rdi +; SSSE3-NEXT: seto %bpl ; SSSE3-NEXT: movq %r9, %rax ; SSSE3-NEXT: mulq %r11 ; SSSE3-NEXT: movq %rax, %rbx ; SSSE3-NEXT: seto %r9b -; SSSE3-NEXT: orb %r10b, %r9b -; SSSE3-NEXT: addq %rbp, %rbx +; SSSE3-NEXT: orb %bpl, %r9b +; SSSE3-NEXT: addq %rdi, %rbx ; SSSE3-NEXT: movq %r11, %rax ; SSSE3-NEXT: mulq %r15 ; SSSE3-NEXT: addq %rbx, %rdx ; SSSE3-NEXT: setb %bl ; SSSE3-NEXT: orb %r9b, %bl -; SSSE3-NEXT: orb %r8b, %bl -; SSSE3-NEXT: movzbl %bl, %ebp -; SSSE3-NEXT: negl %ebp -; SSSE3-NEXT: movd %ebp, %xmm1 +; SSSE3-NEXT: orb %r10b, %bl +; SSSE3-NEXT: movzbl %bl, %edi +; SSSE3-NEXT: negl %edi +; SSSE3-NEXT: movd %edi, %xmm1 ; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: negl %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: movq %rax, 16(%r14) -; SSSE3-NEXT: movq %rdi, (%r14) +; SSSE3-NEXT: movq %r8, (%r14) ; SSSE3-NEXT: movq %rdx, 24(%r14) ; SSSE3-NEXT: movq %rsi, 8(%r14) ; SSSE3-NEXT: popq %rbx @@ -3125,7 +3125,7 @@ ; SSE41-NEXT: addq %rsi, %rcx ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: mulq %r8 -; SSE41-NEXT: movq %rax, %rdi +; SSE41-NEXT: movq %rax, %r8 ; SSE41-NEXT: movq %rdx, %rsi ; SSE41-NEXT: addq %rcx, %rsi ; SSE41-NEXT: setb %cl @@ -3134,32 +3134,32 @@ ; SSE41-NEXT: testq %r9, %r9 ; SSE41-NEXT: setne %al ; SSE41-NEXT: testq %r12, %r12 -; SSE41-NEXT: setne %r8b -; SSE41-NEXT: andb %al, %r8b +; SSE41-NEXT: setne %r10b +; SSE41-NEXT: andb %al, %r10b ; SSE41-NEXT: movq %r12, %rax ; SSE41-NEXT: mulq %r15 -; SSE41-NEXT: movq %rax, %rbp -; SSE41-NEXT: seto %r10b +; SSE41-NEXT: movq %rax, %rdi +; SSE41-NEXT: seto %bpl ; SSE41-NEXT: movq %r9, %rax ; SSE41-NEXT: mulq %r11 ; SSE41-NEXT: movq %rax, %rbx ; SSE41-NEXT: seto %r9b -; SSE41-NEXT: orb %r10b, %r9b -; SSE41-NEXT: addq %rbp, %rbx +; SSE41-NEXT: orb %bpl, %r9b +; SSE41-NEXT: addq %rdi, %rbx ; SSE41-NEXT: movq %r11, %rax ; SSE41-NEXT: mulq %r15 ; SSE41-NEXT: addq %rbx, %rdx ; SSE41-NEXT: setb %bl ; SSE41-NEXT: orb %r9b, %bl -; SSE41-NEXT: orb %r8b, %bl -; SSE41-NEXT: movzbl %bl, %ebp -; SSE41-NEXT: negl %ebp +; SSE41-NEXT: orb %r10b, %bl +; SSE41-NEXT: movzbl %bl, %edi +; SSE41-NEXT: negl %edi ; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: negl %ecx ; SSE41-NEXT: movd %ecx, %xmm0 -; SSE41-NEXT: pinsrd $1, %ebp, %xmm0 +; SSE41-NEXT: pinsrd $1, %edi, %xmm0 ; SSE41-NEXT: movq %rax, 16(%r14) -; SSE41-NEXT: movq %rdi, (%r14) +; SSE41-NEXT: movq %r8, (%r14) ; SSE41-NEXT: movq %rdx, 24(%r14) ; 
SSE41-NEXT: movq %rsi, 8(%r14) ; SSE41-NEXT: popq %rbx @@ -3201,7 +3201,7 @@ ; AVX-NEXT: addq %rsi, %rcx ; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: mulq %r8 -; AVX-NEXT: movq %rax, %rdi +; AVX-NEXT: movq %rax, %r8 ; AVX-NEXT: movq %rdx, %rsi ; AVX-NEXT: addq %rcx, %rsi ; AVX-NEXT: setb %cl @@ -3210,32 +3210,32 @@ ; AVX-NEXT: testq %r9, %r9 ; AVX-NEXT: setne %al ; AVX-NEXT: testq %r12, %r12 -; AVX-NEXT: setne %r8b -; AVX-NEXT: andb %al, %r8b +; AVX-NEXT: setne %r10b +; AVX-NEXT: andb %al, %r10b ; AVX-NEXT: movq %r12, %rax ; AVX-NEXT: mulq %r15 -; AVX-NEXT: movq %rax, %rbp -; AVX-NEXT: seto %r10b +; AVX-NEXT: movq %rax, %rdi +; AVX-NEXT: seto %bpl ; AVX-NEXT: movq %r9, %rax ; AVX-NEXT: mulq %r11 ; AVX-NEXT: movq %rax, %rbx ; AVX-NEXT: seto %r9b -; AVX-NEXT: orb %r10b, %r9b -; AVX-NEXT: addq %rbp, %rbx +; AVX-NEXT: orb %bpl, %r9b +; AVX-NEXT: addq %rdi, %rbx ; AVX-NEXT: movq %r11, %rax ; AVX-NEXT: mulq %r15 ; AVX-NEXT: addq %rbx, %rdx ; AVX-NEXT: setb %bl ; AVX-NEXT: orb %r9b, %bl -; AVX-NEXT: orb %r8b, %bl -; AVX-NEXT: movzbl %bl, %ebp -; AVX-NEXT: negl %ebp +; AVX-NEXT: orb %r10b, %bl +; AVX-NEXT: movzbl %bl, %edi +; AVX-NEXT: negl %edi ; AVX-NEXT: movzbl %cl, %ecx ; AVX-NEXT: negl %ecx ; AVX-NEXT: vmovd %ecx, %xmm0 -; AVX-NEXT: vpinsrd $1, %ebp, %xmm0, %xmm0 +; AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 ; AVX-NEXT: movq %rax, 16(%r14) -; AVX-NEXT: movq %rdi, (%r14) +; AVX-NEXT: movq %r8, (%r14) ; AVX-NEXT: movq %rdx, 24(%r14) ; AVX-NEXT: movq %rsi, 8(%r14) ; AVX-NEXT: popq %rbx Index: llvm/test/CodeGen/X86/vector-fshl-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-128.ll +++ llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -1749,29 +1749,29 @@ ; ; X86-SSE2-LABEL: splatvar_funnnel_v16i8: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: pand %xmm3, %xmm4 -; X86-SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0] -; X86-SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X86-SSE2-NEXT: psllw %xmm4, %xmm0 -; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm5 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pand %xmm4, %xmm5 +; X86-SSE2-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0] +; X86-SSE2-NEXT: psrldq {{.*#+}} xmm5 = xmm5[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE2-NEXT: psllw %xmm5, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; X86-SSE2-NEXT: psllw %xmm4, %xmm6 +; X86-SSE2-NEXT: psllw %xmm5, %xmm6 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[0,0,0,0,4,5,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: pandn %xmm3, %xmm2 +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,0,0,0,4,5,6,7] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: pandn %xmm4, %xmm2 ; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] ; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = 
xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-SSE2-NEXT: psrlw $1, %xmm1 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: psrlw %xmm2, %xmm1 -; X86-SSE2-NEXT: psrlw %xmm2, %xmm5 -; X86-SSE2-NEXT: psrlw $8, %xmm5 -; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,0,0,0,4,5,6,7] +; X86-SSE2-NEXT: psrlw %xmm2, %xmm3 +; X86-SSE2-NEXT: psrlw $8, %xmm3 +; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,0,0,0,4,5,6,7] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; X86-SSE2-NEXT: pand %xmm1, %xmm2 ; X86-SSE2-NEXT: por %xmm2, %xmm0 Index: llvm/test/CodeGen/X86/vector-fshl-rot-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -1845,26 +1845,26 @@ ; ; X86-SSE2-LABEL: constant_funnnel_v16i8: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pxor %xmm1, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: psrlw $8, %xmm2 +; X86-SSE2-NEXT: pxor %xmm2, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 ; X86-SSE2-NEXT: psrlw $8, %xmm3 -; X86-SSE2-NEXT: packuswb %xmm2, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: psrlw $8, %xmm1 +; X86-SSE2-NEXT: packuswb %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; X86-SSE2-NEXT: pand %xmm3, %xmm2 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 -; X86-SSE2-NEXT: por %xmm3, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: packuswb %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> ) ret <16 x i8> %res Index: llvm/test/CodeGen/X86/vector-fshr-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshr-128.ll +++ llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -1727,28 +1727,28 @@ ; ; X86-SSE2-LABEL: 
splatvar_funnnel_v16i8:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
-; X86-SSE2-NEXT: pandn %xmm3, %xmm4
-; X86-SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
-; X86-SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pandn %xmm4, %xmm5
+; X86-SSE2-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0]
+; X86-SSE2-NEXT: psrldq {{.*#+}} xmm5 = xmm5[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-SSE2-NEXT: paddb %xmm0, %xmm0
-; X86-SSE2-NEXT: psllw %xmm4, %xmm0
-; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm5
+; X86-SSE2-NEXT: psllw %xmm5, %xmm0
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm6
-; X86-SSE2-NEXT: psllw %xmm4, %xmm6
+; X86-SSE2-NEXT: psllw %xmm5, %xmm6
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[0,0,0,0,4,5,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
-; X86-SSE2-NEXT: pand %xmm4, %xmm0
-; X86-SSE2-NEXT: pand %xmm3, %xmm2
+; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,0,0,0,4,5,6,7]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pand %xmm4, %xmm2
; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-SSE2-NEXT: psrlw %xmm2, %xmm1
-; X86-SSE2-NEXT: psrlw %xmm2, %xmm5
-; X86-SSE2-NEXT: psrlw $8, %xmm5
-; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,0,0,0,4,5,6,7]
+; X86-SSE2-NEXT: psrlw %xmm2, %xmm3
+; X86-SSE2-NEXT: psrlw $8, %xmm3
+; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,0,0,0,4,5,6,7]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; X86-SSE2-NEXT: pand %xmm1, %xmm2
; X86-SSE2-NEXT: por %xmm2, %xmm0
Index: llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -1931,26 +1931,26 @@
;
; X86-SSE2-LABEL: constant_funnnel_v16i8:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pxor %xmm1, %xmm1
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT: psrlw $8, %xmm2
+; X86-SSE2-NEXT: pxor %xmm2, %xmm2
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
; X86-SSE2-NEXT: psrlw $8, %xmm3
-; X86-SSE2-NEXT: packuswb %xmm2, %xmm3
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: psrlw $8, %xmm1
+; X86-SSE2-NEXT: packuswb %xmm3, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X86-SSE2-NEXT: pand %xmm3, %xmm2
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT: pand %xmm2, %xmm0
-; X86-SSE2-NEXT: packuswb %xmm1, %xmm0
-; X86-SSE2-NEXT: por %xmm3, %xmm0
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
%res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> )
ret <16 x i8> %res
Index: llvm/test/CodeGen/X86/vector-gep.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-gep.ll
+++ llvm/test/CodeGen/X86/vector-gep.ll
@@ -152,39 +152,39 @@
; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0
; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovdqa 72(%ebp), %xmm3
-; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm0
+; CHECK-NEXT: vmovdqa 72(%ebp), %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0
; CHECK-NEXT: vmovdqa %xmm0, (%esp) # 16-byte Spill
-; CHECK-NEXT: vmovdqa 88(%ebp), %xmm4
-; CHECK-NEXT: vpaddd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpaddd %xmm4, %xmm5, %xmm4
-; CHECK-NEXT: vmovdqa 104(%ebp), %xmm1
-; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm1, %xmm5, %xmm1
-; CHECK-NEXT: vmovdqa 120(%ebp), %xmm6
+; CHECK-NEXT: vmovdqa 88(%ebp), %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm2
+; CHECK-NEXT: vmovdqa 104(%ebp), %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm1
+; CHECK-NEXT: vmovdqa 120(%ebp), %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0
+; CHECK-NEXT: vmovdqa 136(%ebp), %xmm6
; CHECK-NEXT: vpaddd %xmm6, %xmm6, %xmm6
; CHECK-NEXT: vpaddd %xmm6, %xmm5, %xmm6
-; CHECK-NEXT: vmovdqa 136(%ebp), %xmm2
-; CHECK-NEXT: vpaddd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpaddd %xmm2, %xmm5, %xmm2
; CHECK-NEXT: vmovdqa 152(%ebp), %xmm7
; CHECK-NEXT: vpaddd %xmm7, %xmm7, %xmm7
; CHECK-NEXT: vpaddd %xmm7, %xmm5, %xmm7
-; CHECK-NEXT: vmovdqa 168(%ebp), %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0
+; CHECK-NEXT: vmovdqa 168(%ebp), %xmm4
+; CHECK-NEXT: vpaddd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpaddd %xmm4, %xmm5, %xmm4
; CHECK-NEXT: vmovdqa 184(%ebp), %xmm3
; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3
; CHECK-NEXT: movl 8(%ebp), %eax
; CHECK-NEXT: vmovdqa %xmm3, 240(%eax)
-; CHECK-NEXT: vmovdqa %xmm0, 224(%eax)
+; CHECK-NEXT: vmovdqa %xmm4, 224(%eax)
; CHECK-NEXT: vmovdqa %xmm7, 208(%eax)
-; CHECK-NEXT: vmovdqa %xmm2, 192(%eax)
-; CHECK-NEXT: vmovdqa %xmm6, 176(%eax)
+; CHECK-NEXT: vmovdqa %xmm6, 192(%eax)
+; CHECK-NEXT: vmovdqa %xmm0, 176(%eax)
; CHECK-NEXT: vmovdqa %xmm1, 160(%eax)
-; CHECK-NEXT: vmovdqa %xmm4, 144(%eax)
+; CHECK-NEXT: vmovdqa %xmm2, 144(%eax)
; CHECK-NEXT: vmovaps (%esp), %xmm0 # 16-byte Reload
; CHECK-NEXT: vmovaps %xmm0, 128(%eax)
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
Index: llvm/test/CodeGen/X86/vector-idiv-v2i32.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-idiv-v2i32.ll
+++ llvm/test/CodeGen/X86/vector-idiv-v2i32.ll
@@ -131,27 +131,27 @@
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86-NEXT: movdqa %xmm0, %xmm3
+; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; X86-NEXT: movdqa %xmm1, %xmm0
+; X86-NEXT: pmuludq %xmm2, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT: movdqa %xmm1, %xmm3
; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
-; X86-NEXT: pmuludq %xmm1, %xmm3
+; X86-NEXT: pmuludq %xmm2, %xmm3
; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; X86-NEXT: pxor %xmm3, %xmm3
-; X86-NEXT: pcmpgtd %xmm0, %xmm3
-; X86-NEXT: pand %xmm1, %xmm3
-; X86-NEXT: paddd %xmm0, %xmm3
-; X86-NEXT: psubd %xmm3, %xmm2
-; X86-NEXT: paddd %xmm0, %xmm2
-; X86-NEXT: movdqa %xmm2, %xmm0
-; X86-NEXT: psrld $31, %xmm0
-; X86-NEXT: psrad $2, %xmm2
-; X86-NEXT: paddd %xmm0, %xmm2
-; X86-NEXT: movq %xmm2, (%eax)
+; X86-NEXT: pcmpgtd %xmm1, %xmm3
+; X86-NEXT: pand %xmm2, %xmm3
+; X86-NEXT: paddd %xmm1, %xmm3
+; X86-NEXT: psubd %xmm3, %xmm0
+; X86-NEXT: paddd %xmm1, %xmm0
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psrld $31, %xmm1
+; X86-NEXT: psrad $2, %xmm0
+; X86-NEXT: paddd %xmm1, %xmm0
+; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: retl
%a = load <2 x i32>, <2 x i32>* %x
%b = sdiv <2 x i32> %a,
@@ -193,30 +193,30 @@
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pmuludq %xmm1, %xmm2
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: pmuludq %xmm2, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; X86-NEXT: movdqa %xmm0, %xmm3
; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
-; X86-NEXT: pmuludq %xmm1, %xmm3
+; X86-NEXT: pmuludq %xmm2, %xmm3
; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; X86-NEXT: pxor %xmm3, %xmm3
; X86-NEXT: pcmpgtd %xmm0, %xmm3
-; X86-NEXT: pand %xmm1, %xmm3
+; X86-NEXT: pand %xmm2, %xmm3
; X86-NEXT: paddd %xmm0, %xmm3
-; X86-NEXT: psubd %xmm3, %xmm2
-; X86-NEXT: paddd %xmm0, %xmm2
-; X86-NEXT: movdqa %xmm2, %xmm1
-; X86-NEXT: psrld $31, %xmm1
-; X86-NEXT: psrad $2, %xmm2
-; X86-NEXT: paddd %xmm1, %xmm2
-; X86-NEXT: movdqa %xmm2, %xmm1
-; X86-NEXT: pslld $3, %xmm1
-; X86-NEXT: psubd %xmm1, %xmm2
-; X86-NEXT: paddd %xmm0, %xmm2
-; X86-NEXT: movq %xmm2, (%eax)
+; X86-NEXT: psubd %xmm3, %xmm1
+; X86-NEXT: paddd %xmm0, %xmm1
+; X86-NEXT: movdqa %xmm1, %xmm2
+; X86-NEXT: psrld $31, %xmm2
+; X86-NEXT: psrad $2, %xmm1
+; X86-NEXT: paddd %xmm2, %xmm1
+; X86-NEXT: movdqa %xmm1, %xmm2
+; X86-NEXT: pslld $3, %xmm2
+; X86-NEXT: psubd %xmm2, %xmm1
+; X86-NEXT: paddd %xmm0, %xmm1
+; X86-NEXT: movq %xmm1, (%eax)
; X86-NEXT: retl
%a = load <2 x i32>, <2 x i32>* %x
%b = srem <2 x i32> %a,
Index: llvm/test/CodeGen/X86/vector-lzcnt-128.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-lzcnt-128.ll
+++ llvm/test/CodeGen/X86/vector-lzcnt-128.ll
@@ -231,30 +231,30 @@
;
; X32-SSE-LABEL: testv2i64:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pshufb %xmm0, %xmm3
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pshufb %xmm0, %xmm4
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlw $4, %xmm1
; X32-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X32-SSE-NEXT: pxor %xmm4, %xmm4
-; X32-SSE-NEXT: pshufb %xmm1, %xmm2
-; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm1
-; X32-SSE-NEXT: pand %xmm3, %xmm1
-; X32-SSE-NEXT: paddb %xmm2, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm2
-; X32-SSE-NEXT: psrlw $8, %xmm2
-; X32-SSE-NEXT: pand %xmm1, %xmm2
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pshufb %xmm1, %xmm3
+; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1
+; X32-SSE-NEXT: pand %xmm4, %xmm1
+; X32-SSE-NEXT: paddb %xmm3, %xmm1
+; X32-SSE-NEXT: movdqa %xmm0, %xmm3
+; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm3
+; X32-SSE-NEXT: psrlw $8, %xmm3
+; X32-SSE-NEXT: pand %xmm1, %xmm3
; X32-SSE-NEXT: psrlw $8, %xmm1
-; X32-SSE-NEXT: paddw %xmm2, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: pcmpeqw %xmm4, %xmm2
-; X32-SSE-NEXT: psrld $16, %xmm2
-; X32-SSE-NEXT: pand %xmm1, %xmm2
+; X32-SSE-NEXT: paddw %xmm3, %xmm1
+; X32-SSE-NEXT: movdqa %xmm0, %xmm3
+; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm3
+; X32-SSE-NEXT: psrld $16, %xmm3
+; X32-SSE-NEXT: pand %xmm1, %xmm3
; X32-SSE-NEXT: psrld $16, %xmm1
-; X32-SSE-NEXT: paddd %xmm2, %xmm1
-; X32-SSE-NEXT: pcmpeqd %xmm4, %xmm0
+; X32-SSE-NEXT: paddd %xmm3, %xmm1
+; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm0
; X32-SSE-NEXT: psrlq $32, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: psrlq $32, %xmm1
@@ -484,30 +484,30 @@
;
; X32-SSE-LABEL: testv2i64u:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pshufb %xmm0, %xmm3
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pshufb %xmm0, %xmm4
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlw $4, %xmm1
; X32-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X32-SSE-NEXT: pxor %xmm4, %xmm4
-; X32-SSE-NEXT: pshufb %xmm1, %xmm2
-; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm1
-; X32-SSE-NEXT: pand %xmm3, %xmm1
-; X32-SSE-NEXT: paddb %xmm2, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm2
-; X32-SSE-NEXT: psrlw $8, %xmm2
-; X32-SSE-NEXT: pand %xmm1, %xmm2
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pshufb %xmm1, %xmm3
+; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1
+; X32-SSE-NEXT: pand %xmm4, %xmm1
+; X32-SSE-NEXT: paddb %xmm3, %xmm1
+; X32-SSE-NEXT: movdqa %xmm0, %xmm3
+; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm3
+; X32-SSE-NEXT: psrlw $8, %xmm3
+; X32-SSE-NEXT: pand %xmm1, %xmm3
; X32-SSE-NEXT: psrlw $8, %xmm1
-; X32-SSE-NEXT: paddw %xmm2, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: pcmpeqw %xmm4, %xmm2
-; X32-SSE-NEXT: psrld $16, %xmm2
-; X32-SSE-NEXT: pand %xmm1, %xmm2
+; X32-SSE-NEXT: paddw %xmm3, %xmm1
+; X32-SSE-NEXT: movdqa %xmm0, %xmm3
+; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm3
+; X32-SSE-NEXT: psrld $16, %xmm3
+; X32-SSE-NEXT: pand %xmm1, %xmm3
; X32-SSE-NEXT: psrld $16, %xmm1
-; X32-SSE-NEXT: paddd %xmm2, %xmm1
-; X32-SSE-NEXT: pcmpeqd %xmm4, %xmm0
+; X32-SSE-NEXT: paddd %xmm3, %xmm1
+; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm0
; X32-SSE-NEXT: psrlq $32, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: psrlq $32, %xmm1
@@ -719,24 +719,24 @@
;
; X32-SSE-LABEL: testv4i32:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pshufb %xmm0, %xmm3
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pshufb %xmm0, %xmm4
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlw $4, %xmm1
; X32-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X32-SSE-NEXT: pxor %xmm4, %xmm4
-; X32-SSE-NEXT: pshufb %xmm1, %xmm2
-; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm1
-; X32-SSE-NEXT: pand %xmm3, %xmm1
-; X32-SSE-NEXT: paddb %xmm2, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm2
-; X32-SSE-NEXT: psrlw $8, %xmm2
-; X32-SSE-NEXT: pand %xmm1, %xmm2
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pshufb %xmm1, %xmm3
+; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1
+; X32-SSE-NEXT: pand %xmm4, %xmm1
+; X32-SSE-NEXT: paddb %xmm3, %xmm1
+; X32-SSE-NEXT: movdqa %xmm0, %xmm3
+; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm3
+; X32-SSE-NEXT: psrlw $8, %xmm3
+; X32-SSE-NEXT: pand %xmm1, %xmm3
; X32-SSE-NEXT: psrlw $8, %xmm1
-; X32-SSE-NEXT: paddw %xmm2, %xmm1
-; X32-SSE-NEXT: pcmpeqw %xmm4, %xmm0
+; X32-SSE-NEXT: paddw %xmm3, %xmm1
+; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm0
; X32-SSE-NEXT: psrld $16, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: psrld $16, %xmm1
@@ -948,24 +948,24 @@
;
; X32-SSE-LABEL: testv4i32u:
; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pshufb %xmm0, %xmm3
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pshufb %xmm0, %xmm4
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlw $4, %xmm1
; X32-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X32-SSE-NEXT: pxor %xmm4, %xmm4
-; X32-SSE-NEXT: pshufb %xmm1, %xmm2
-; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm1
-; X32-SSE-NEXT: pand %xmm3, %xmm1
-; X32-SSE-NEXT: paddb %xmm2, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: pcmpeqb %xmm4, %xmm2
-; X32-SSE-NEXT: psrlw $8, %xmm2
-; X32-SSE-NEXT: pand %xmm1, %xmm2
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pshufb %xmm1, %xmm3
+; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1
+; X32-SSE-NEXT: pand %xmm4, %xmm1
+; X32-SSE-NEXT: paddb %xmm3, %xmm1
+; X32-SSE-NEXT: movdqa %xmm0, %xmm3
+; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm3
+; X32-SSE-NEXT: psrlw $8, %xmm3
+; X32-SSE-NEXT: pand %xmm1, %xmm3
; X32-SSE-NEXT: psrlw $8, %xmm1
-; X32-SSE-NEXT: paddw %xmm2, %xmm1
-; X32-SSE-NEXT: pcmpeqw %xmm4, %xmm0
+; X32-SSE-NEXT: paddw %xmm3, %xmm1
+; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm0
; X32-SSE-NEXT: psrld $16, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: psrld $16, %xmm1
Index: llvm/test/CodeGen/X86/vector-rotate-128.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -1798,26 +1798,26 @@
;
; X86-SSE2-LABEL: constant_rotate_v16i8:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pxor %xmm1, %xmm1
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT: psrlw $8, %xmm2
+; X86-SSE2-NEXT: pxor %xmm2, %xmm2
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
; X86-SSE2-NEXT: psrlw $8, %xmm3
-; X86-SSE2-NEXT: packuswb %xmm2, %xmm3
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: psrlw $8, %xmm1
+; X86-SSE2-NEXT: packuswb %xmm3, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X86-SSE2-NEXT: pand %xmm3, %xmm2
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT: pand %xmm2, %xmm0
-; X86-SSE2-NEXT: packuswb %xmm1, %xmm0
-; X86-SSE2-NEXT: por %xmm3, %xmm0
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
%shl = shl <16 x i8> %a,
%lshr = lshr <16 x i8> %a,
Index: llvm/test/CodeGen/X86/vector-sext.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-sext.ll
+++ llvm/test/CodeGen/X86/vector-sext.ll
@@ -3617,26 +3617,26 @@
;
; X86-SSE2-LABEL: sext_4i17_to_4i32:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl (%eax), %ecx
-; X86-SSE2-NEXT: movl 4(%eax), %edx
-; X86-SSE2-NEXT: movl 8(%eax), %eax
-; X86-SSE2-NEXT: shldl $13, %edx, %eax
-; X86-SSE2-NEXT: shll $15, %eax
-; X86-SSE2-NEXT: sarl $15, %eax
-; X86-SSE2-NEXT: movd %eax, %xmm0
-; X86-SSE2-NEXT: movl %edx, %eax
-; X86-SSE2-NEXT: shll $13, %eax
-; X86-SSE2-NEXT: sarl $15, %eax
-; X86-SSE2-NEXT: movd %eax, %xmm1
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT: movl (%edx), %ecx
+; X86-SSE2-NEXT: movl 4(%edx), %eax
+; X86-SSE2-NEXT: movl 8(%edx), %edx
+; X86-SSE2-NEXT: shldl $13, %eax, %edx
+; X86-SSE2-NEXT: shll $15, %edx
+; X86-SSE2-NEXT: sarl $15, %edx
+; X86-SSE2-NEXT: movd %edx, %xmm0
+; X86-SSE2-NEXT: movl %eax, %edx
+; X86-SSE2-NEXT: shll $13, %edx
+; X86-SSE2-NEXT: sarl $15, %edx
+; X86-SSE2-NEXT: movd %edx, %xmm1
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE2-NEXT: shldl $15, %ecx, %edx
+; X86-SSE2-NEXT: shldl $15, %ecx, %eax
; X86-SSE2-NEXT: shll $15, %ecx
; X86-SSE2-NEXT: sarl $15, %ecx
; X86-SSE2-NEXT: movd %ecx, %xmm0
-; X86-SSE2-NEXT: shll $15, %edx
-; X86-SSE2-NEXT: sarl $15, %edx
-; X86-SSE2-NEXT: movd %edx, %xmm2
+; X86-SSE2-NEXT: shll $15, %eax
+; X86-SSE2-NEXT: sarl $15, %eax
+; X86-SSE2-NEXT: movd %eax, %xmm2
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-SSE2-NEXT: retl
@@ -3646,25 +3646,25 @@
; X86-SSE41-NEXT: pushl %esi
; X86-SSE41-NEXT: .cfi_def_cfa_offset 8
; X86-SSE41-NEXT: .cfi_offset %esi, -8
-; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT: movl (%eax), %ecx
-; X86-SSE41-NEXT: movl 4(%eax), %edx
-; X86-SSE41-NEXT: movl %edx, %esi
-; X86-SSE41-NEXT: movl 8(%eax), %eax
-; X86-SSE41-NEXT: shldl $13, %edx, %eax
-; X86-SSE41-NEXT: shldl $15, %ecx, %edx
-; X86-SSE41-NEXT: shll $15, %edx
-; X86-SSE41-NEXT: sarl $15, %edx
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE41-NEXT: movl (%edx), %eax
+; X86-SSE41-NEXT: movl 4(%edx), %ecx
+; X86-SSE41-NEXT: movl %ecx, %esi
+; X86-SSE41-NEXT: movl 8(%edx), %edx
+; X86-SSE41-NEXT: shldl $13, %ecx, %edx
+; X86-SSE41-NEXT: shldl $15, %eax, %ecx
; X86-SSE41-NEXT: shll $15, %ecx
; X86-SSE41-NEXT: sarl $15, %ecx
-; X86-SSE41-NEXT: movd %ecx, %xmm0
-; X86-SSE41-NEXT: pinsrd $1, %edx, %xmm0
+; X86-SSE41-NEXT: shll $15, %eax
+; X86-SSE41-NEXT: sarl $15, %eax
+; X86-SSE41-NEXT: movd %eax, %xmm0
+; X86-SSE41-NEXT: pinsrd $1, %ecx, %xmm0
; X86-SSE41-NEXT: shll $13, %esi
; X86-SSE41-NEXT: sarl $15, %esi
; X86-SSE41-NEXT: pinsrd $2, %esi, %xmm0
-; X86-SSE41-NEXT: shll $15, %eax
-; X86-SSE41-NEXT: sarl $15, %eax
-; X86-SSE41-NEXT: pinsrd $3, %eax, %xmm0
+; X86-SSE41-NEXT: shll $15, %edx
+; X86-SSE41-NEXT: sarl $15, %edx
+; X86-SSE41-NEXT: pinsrd $3, %edx, %xmm0
; X86-SSE41-NEXT: popl %esi
; X86-SSE41-NEXT: .cfi_def_cfa_offset 4
; X86-SSE41-NEXT: retl
Index: llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -468,36 +468,36 @@
;
; X86-AVX1-LABEL: var_shift_v32i8:
; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X86-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X86-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpsrlw $4, %xmm3, %xmm4
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X86-AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; X86-AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
-; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
-; X86-AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
+; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT: vpsrlw $2, %xmm3, %xmm4
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; X86-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; X86-AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
; X86-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
-; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
-; X86-AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3
+; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT: vpsrlw $1, %xmm3, %xmm4
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; X86-AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; X86-AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4
; X86-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
-; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
-; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
-; X86-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
+; X86-AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2
; X86-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
-; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3
-; X86-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
; X86-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3
-; X86-AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
; X86-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: var_shift_v32i8:
Index: llvm/test/CodeGen/X86/vector-shift-shl-256.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -246,30 +246,30 @@
;
; X86-AVX1-LABEL: var_shift_v16i16:
; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7]
-; X86-AVX1-NEXT: vpslld $23, %xmm3, %xmm3
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
-; X86-AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; X86-AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
-; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; X86-AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; X86-AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
-; X86-AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
-; X86-AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X86-AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4,4,5,5,6,6,7,7]
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4,4,5,5,6,6,7,7]
+; X86-AVX1-NEXT: vpslld $23, %xmm2, %xmm4
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
+; X86-AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm4
+; X86-AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; X86-AVX1-NEXT: vpslld $23, %xmm3, %xmm3
-; X86-AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm3
; X86-AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
+; X86-AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4,4,5,5,6,6,7,7]
+; X86-AVX1-NEXT: vpslld $23, %xmm4, %xmm4
+; X86-AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm4
+; X86-AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; X86-AVX1-NEXT: vpslld $23, %xmm1, %xmm1
-; X86-AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
+; X86-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; X86-AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; X86-AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
; X86-AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: var_shift_v16i16:
Index: llvm/test/CodeGen/X86/vector-trunc-ssat.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -6186,10 +6186,10 @@
; AVX512-NEXT: shrl $16, %r14d
; AVX512-NEXT: movb %r14b, 26(%rdi)
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vpextrd $3, %xmm0, %edx
-; AVX512-NEXT: movw %dx, 21(%rdi)
-; AVX512-NEXT: vpextrd $2, %xmm0, %esi
-; AVX512-NEXT: movw %si, 18(%rdi)
+; AVX512-NEXT: vpextrd $3, %xmm0, %esi
+; AVX512-NEXT: movw %si, 21(%rdi)
+; AVX512-NEXT: vpextrd $2, %xmm0, %edx
+; AVX512-NEXT: movw %dx, 18(%rdi)
; AVX512-NEXT: vpextrd $1, %xmm0, %ecx
; AVX512-NEXT: movw %cx, 15(%rdi)
; AVX512-NEXT: vmovd %xmm0, %eax
@@ -6202,10 +6202,10 @@
; AVX512-NEXT: movb %r9b, 5(%rdi)
; AVX512-NEXT: shrl $16, %r8d
; AVX512-NEXT: movb %r8b, 2(%rdi)
-; AVX512-NEXT: shrl $16, %edx
-; AVX512-NEXT: movb %dl, 23(%rdi)
; AVX512-NEXT: shrl $16, %esi
-; AVX512-NEXT: movb %sil, 20(%rdi)
+; AVX512-NEXT: movb %sil, 23(%rdi)
+; AVX512-NEXT: shrl $16, %edx
+; AVX512-NEXT: movb %dl, 20(%rdi)
; AVX512-NEXT: shrl $16, %ecx
; AVX512-NEXT: movb %cl, 17(%rdi)
; AVX512-NEXT: shrl $16, %eax
@@ -6268,10 +6268,10 @@
; SKX-NEXT: shrl $16, %r14d
; SKX-NEXT: movb %r14b, 26(%rdi)
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpextrd $3, %xmm0, %edx
-; SKX-NEXT: movw %dx, 21(%rdi)
-; SKX-NEXT: vpextrd $2, %xmm0, %esi
-; SKX-NEXT: movw %si, 18(%rdi)
+; SKX-NEXT: vpextrd $3, %xmm0, %esi
+; SKX-NEXT: movw %si, 21(%rdi)
+; SKX-NEXT: vpextrd $2, %xmm0, %edx
+; SKX-NEXT: movw %dx, 18(%rdi)
; SKX-NEXT: vpextrd $1, %xmm0, %ecx
; SKX-NEXT: movw %cx, 15(%rdi)
; SKX-NEXT: vmovd %xmm0, %eax
@@ -6284,10 +6284,10 @@
; SKX-NEXT: movb %r9b, 5(%rdi)
; SKX-NEXT: shrl $16, %r8d
; SKX-NEXT: movb %r8b, 2(%rdi)
-; SKX-NEXT: shrl $16, %edx
-; SKX-NEXT: movb %dl, 23(%rdi)
; SKX-NEXT: shrl $16, %esi
-; SKX-NEXT: movb %sil, 20(%rdi)
+; SKX-NEXT: movb %sil, 23(%rdi)
+; SKX-NEXT: shrl $16, %edx
+; SKX-NEXT: movb %dl, 20(%rdi)
; SKX-NEXT: shrl $16, %ecx
; SKX-NEXT: movb %cl, 17(%rdi)
; SKX-NEXT: shrl $16, %eax
Index: llvm/test/CodeGen/X86/vector-tzcnt-128.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-tzcnt-128.ll
+++ llvm/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -643,22 +643,22 @@
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE-NEXT: paddd %xmm0, %xmm1
; X32-SSE-NEXT: pandn %xmm1, %xmm0
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: pand %xmm1, %xmm2
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-SSE-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE-NEXT: pshufb %xmm2, %xmm4
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm3
+; X32-SSE-NEXT: pand %xmm2, %xmm3
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm4
+; X32-SSE-NEXT: pshufb %xmm3, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: pshufb %xmm0, %xmm3
-; X32-SSE-NEXT: paddb %xmm4, %xmm3
-; X32-SSE-NEXT: pxor %xmm1, %xmm1
-; X32-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
-; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; X32-SSE-NEXT: psadbw %xmm1, %xmm3
-; X32-SSE-NEXT: psadbw %xmm1, %xmm0
-; X32-SSE-NEXT: packuswb %xmm3, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: pshufb %xmm0, %xmm1
+; X32-SSE-NEXT: paddb %xmm4, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X32-SSE-NEXT: psadbw %xmm2, %xmm1
+; X32-SSE-NEXT: psadbw %xmm2, %xmm0
+; X32-SSE-NEXT: packuswb %xmm1, %xmm0
; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0)
ret <4 x i32> %out
@@ -886,22 +886,22 @@
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE-NEXT: paddd %xmm0, %xmm1
; X32-SSE-NEXT: pandn %xmm1, %xmm0
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: pand %xmm1, %xmm2
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-SSE-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE-NEXT: pshufb %xmm2, %xmm4
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm3
+; X32-SSE-NEXT: pand %xmm2, %xmm3
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm4
+; X32-SSE-NEXT: pshufb %xmm3, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: pshufb %xmm0, %xmm3
-; X32-SSE-NEXT: paddb %xmm4, %xmm3
-; X32-SSE-NEXT: pxor %xmm1, %xmm1
-; X32-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
-; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; X32-SSE-NEXT: psadbw %xmm1, %xmm3
-; X32-SSE-NEXT: psadbw %xmm1, %xmm0
-; X32-SSE-NEXT: packuswb %xmm3, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: pshufb %xmm0, %xmm1
+; X32-SSE-NEXT: paddb %xmm4, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X32-SSE-NEXT: psadbw %xmm2, %xmm1
+; X32-SSE-NEXT: psadbw %xmm2, %xmm0
+; X32-SSE-NEXT: packuswb %xmm1, %xmm0
; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1)
ret <4 x i32> %out
Index: llvm/test/CodeGen/X86/vshift-6.ll
===================================================================
--- llvm/test/CodeGen/X86/vshift-6.ll
+++ llvm/test/CodeGen/X86/vshift-6.ll
@@ -36,29 +36,29 @@
; X86-NEXT: movdqa %xmm2, %xmm1
; X86-NEXT: pandn %xmm0, %xmm1
; X86-NEXT: por %xmm2, %xmm1
-; X86-NEXT: pcmpeqd %xmm2, %xmm2
+; X86-NEXT: pcmpeqd %xmm3, %xmm3
; X86-NEXT: psllw $5, %xmm1
-; X86-NEXT: pxor %xmm3, %xmm3
+; X86-NEXT: pxor %xmm2, %xmm2
; X86-NEXT: pxor %xmm0, %xmm0
; X86-NEXT: pcmpgtb %xmm1, %xmm0
-; X86-NEXT: pxor %xmm0, %xmm2
+; X86-NEXT: pxor %xmm0, %xmm3
; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT: por %xmm2, %xmm0
+; X86-NEXT: por %xmm3, %xmm0
; X86-NEXT: paddb %xmm1, %xmm1
-; X86-NEXT: pxor %xmm2, %xmm2
-; X86-NEXT: pcmpgtb %xmm1, %xmm2
-; X86-NEXT: movdqa %xmm2, %xmm4
+; X86-NEXT: pxor %xmm3, %xmm3
+; X86-NEXT: pcmpgtb %xmm1, %xmm3
+; X86-NEXT: movdqa %xmm3, %xmm4
; X86-NEXT: pandn %xmm0, %xmm4
; X86-NEXT: psllw $2, %xmm0
-; X86-NEXT: pand %xmm2, %xmm0
+; X86-NEXT: pand %xmm3, %xmm0
; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: por %xmm4, %xmm0
; X86-NEXT: paddb %xmm1, %xmm1
-; X86-NEXT: pcmpgtb %xmm1, %xmm3
-; X86-NEXT: movdqa %xmm3, %xmm1
+; X86-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-NEXT: movdqa %xmm2, %xmm1
; X86-NEXT: pandn %xmm0, %xmm1
; X86-NEXT: paddb %xmm0, %xmm0
-; X86-NEXT: pand %xmm3, %xmm0
+; X86-NEXT: pand %xmm2, %xmm0
; X86-NEXT: por %xmm1, %xmm0
; X86-NEXT: retl
;
Index: llvm/test/CodeGen/X86/widen_cast-4.ll
===================================================================
--- llvm/test/CodeGen/X86/widen_cast-4.ll
+++ llvm/test/CodeGen/X86/widen_cast-4.ll
@@ -20,19 +20,19 @@
; WIDE-NEXT: # %bb.2: # %forbody
; WIDE-NEXT: # in Loop: Header=BB0_1 Depth=1
; WIDE-NEXT: movl (%esp), %eax
-; WIDE-NEXT: leal (,%eax,8), %ecx
-; WIDE-NEXT: movl {{[0-9]+}}(%esp), %edx
-; WIDE-NEXT: addl %ecx, %edx
-; WIDE-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; WIDE-NEXT: addl {{[0-9]+}}(%esp), %ecx
+; WIDE-NEXT: leal (,%eax,8), %edx
+; WIDE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIDE-NEXT: addl %edx, %ecx
; WIDE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; WIDE-NEXT: addl {{[0-9]+}}(%esp), %edx
+; WIDE-NEXT: movl %edx, {{[0-9]+}}(%esp)
; WIDE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; WIDE-NEXT: psubb %xmm0, %xmm3
; WIDE-NEXT: psrlw $2, %xmm3
; WIDE-NEXT: pand %xmm1, %xmm3
; WIDE-NEXT: pxor %xmm2, %xmm3
; WIDE-NEXT: psubb %xmm2, %xmm3
-; WIDE-NEXT: movq %xmm3, (%edx,%eax,8)
+; WIDE-NEXT: movq %xmm3, (%ecx,%eax,8)
; WIDE-NEXT: incl (%esp)
; WIDE-NEXT: jmp .LBB0_1
; WIDE-NEXT: .LBB0_3: # %afterfor
Index: llvm/test/CodeGen/X86/x86-fpclass.ll
===================================================================
--- llvm/test/CodeGen/X86/x86-fpclass.ll
+++ llvm/test/CodeGen/X86/x86-fpclass.ll
@@ -434,27 +434,27 @@
; CHECK-32-NEXT: pushl %esi
; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %esi
+; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %edi
; CHECK-32-NEXT: andl $32767, %edi # imm = 0x7FFF
; CHECK-32-NEXT: cmpl $31745, %edi # imm = 0x7C01
; CHECK-32-NEXT: setge %bl
-; CHECK-32-NEXT: andl $32767, %esi # imm = 0x7FFF
-; CHECK-32-NEXT: cmpl $31745, %esi # imm = 0x7C01
-; CHECK-32-NEXT: setge %bh
-; CHECK-32-NEXT: addb %bh, %bh
-; CHECK-32-NEXT: orb %bl, %bh
; CHECK-32-NEXT: andl $32767, %edx # imm = 0x7FFF
; CHECK-32-NEXT: cmpl $31745, %edx # imm = 0x7C01
; CHECK-32-NEXT: setge %dl
+; CHECK-32-NEXT: addb %dl, %dl
+; CHECK-32-NEXT: orb %bl, %dl
+; CHECK-32-NEXT: andl $32767, %esi # imm = 0x7FFF
+; CHECK-32-NEXT: cmpl $31745, %esi # imm = 0x7C01
+; CHECK-32-NEXT: setge %dh
; CHECK-32-NEXT: andl $32767, %ecx # imm = 0x7FFF
; CHECK-32-NEXT: cmpl $31745, %ecx # imm = 0x7C01
; CHECK-32-NEXT: setge %cl
; CHECK-32-NEXT: addb %cl, %cl
-; CHECK-32-NEXT: orb %dl, %cl
+; CHECK-32-NEXT: orb %dh, %cl
; CHECK-32-NEXT: shlb $2, %cl
-; CHECK-32-NEXT: orb %bh, %cl
+; CHECK-32-NEXT: orb %dl, %cl
; CHECK-32-NEXT: movb %cl, (%eax)
; CHECK-32-NEXT: popl %esi
; CHECK-32-NEXT: popl %edi
@@ -485,27 +485,27 @@
; CHECK-32-NEXT: pushl %esi
; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %esi
+; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %edi
; CHECK-32-NEXT: andl $32767, %edi # imm = 0x7FFF
; CHECK-32-NEXT: cmpl $32641, %edi # imm = 0x7F81
; CHECK-32-NEXT: setge %bl
-; CHECK-32-NEXT: andl $32767, %esi # imm = 0x7FFF
-; CHECK-32-NEXT: cmpl $32641, %esi # imm = 0x7F81
-; CHECK-32-NEXT: setge %bh
-; CHECK-32-NEXT: addb %bh, %bh
-; CHECK-32-NEXT: orb %bl, %bh
; CHECK-32-NEXT: andl $32767, %edx # imm = 0x7FFF
; CHECK-32-NEXT: cmpl $32641, %edx # imm = 0x7F81
; CHECK-32-NEXT: setge %dl
+; CHECK-32-NEXT: addb %dl, %dl
+; CHECK-32-NEXT: orb %bl, %dl
+; CHECK-32-NEXT: andl $32767, %esi # imm = 0x7FFF
+; CHECK-32-NEXT: cmpl $32641, %esi # imm = 0x7F81
+; CHECK-32-NEXT: setge %dh
; CHECK-32-NEXT: andl $32767, %ecx # imm = 0x7FFF
; CHECK-32-NEXT: cmpl $32641, %ecx # imm = 0x7F81
; CHECK-32-NEXT: setge %cl
; CHECK-32-NEXT: addb %cl, %cl
-; CHECK-32-NEXT: orb %dl, %cl
+; CHECK-32-NEXT: orb %dh, %cl
; CHECK-32-NEXT: shlb $2, %cl
-; CHECK-32-NEXT: orb %bh, %cl
+; CHECK-32-NEXT: orb %dl, %cl
; CHECK-32-NEXT: movb %cl, (%eax)
; CHECK-32-NEXT: popl %esi
; CHECK-32-NEXT: popl %edi
@@ -536,28 +536,28 @@
; CHECK-32-NEXT: fnstsw %ax
; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax
; CHECK-32-NEXT: sahf
-; CHECK-32-NEXT: setp %dl
+; CHECK-32-NEXT: setp %dh
; CHECK-32-NEXT: fucomp %st(0)
; CHECK-32-NEXT: fnstsw %ax
; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax
; CHECK-32-NEXT: sahf
-; CHECK-32-NEXT: setp %dh
-; CHECK-32-NEXT: addb %dh, %dh
-; CHECK-32-NEXT: orb %dl, %dh
+; CHECK-32-NEXT: setp %dl
+; CHECK-32-NEXT: addb %dl, %dl
+; CHECK-32-NEXT: orb %dh, %dl
; CHECK-32-NEXT: fucomp %st(0)
; CHECK-32-NEXT: fnstsw %ax
; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax
; CHECK-32-NEXT: sahf
-; CHECK-32-NEXT: setp %dl
+; CHECK-32-NEXT: setp %dh
; CHECK-32-NEXT: fucomp %st(0)
; CHECK-32-NEXT: fnstsw %ax
; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax
; CHECK-32-NEXT: sahf
; CHECK-32-NEXT: setp %al
; CHECK-32-NEXT: addb %al, %al
-; CHECK-32-NEXT: orb %dl, %al
-; CHECK-32-NEXT: shlb $2, %al
; CHECK-32-NEXT: orb %dh, %al
+; CHECK-32-NEXT: shlb $2, %al
+; CHECK-32-NEXT: orb %dl, %al
; CHECK-32-NEXT: movb %al, (%ecx)
; CHECK-32-NEXT: movl %ecx, %eax
; CHECK-32-NEXT: retl $4
@@ -584,28 +584,28 @@
; CHECK-32-NEXT: fnstsw %ax
; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax
; CHECK-32-NEXT: sahf
-; CHECK-32-NEXT: setp %dl
+; CHECK-32-NEXT: setp %dh
; CHECK-32-NEXT: fucomp %st(0)
; CHECK-32-NEXT: fnstsw %ax
; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax
; CHECK-32-NEXT: sahf
-; CHECK-32-NEXT: setp %dh
-; CHECK-32-NEXT: addb %dh, %dh
-; CHECK-32-NEXT: orb %dl, %dh
+; CHECK-32-NEXT: setp %dl
+; CHECK-32-NEXT: addb %dl, %dl
+; CHECK-32-NEXT: orb %dh, %dl
; CHECK-32-NEXT: fucomp %st(0)
; CHECK-32-NEXT: fnstsw %ax
; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax
; CHECK-32-NEXT: sahf
-; CHECK-32-NEXT: setp %dl
+; CHECK-32-NEXT: setp %dh
; CHECK-32-NEXT: fucomp %st(0)
; CHECK-32-NEXT: fnstsw %ax
; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax
; CHECK-32-NEXT: sahf
; CHECK-32-NEXT: setp %al
; CHECK-32-NEXT: addb %al, %al
-; CHECK-32-NEXT: orb %dl, %al
-; CHECK-32-NEXT: shlb $2, %al
; CHECK-32-NEXT: orb %dh, %al
+; CHECK-32-NEXT: shlb $2, %al
+; CHECK-32-NEXT: orb %dl, %al
; CHECK-32-NEXT: movb %al, (%ecx)
; CHECK-32-NEXT: movl %ecx, %eax
; CHECK-32-NEXT: retl $4
@@ -858,27 +858,27 @@
; CHECK-32-NEXT: pushl %esi
; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %esi
+; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %edi
; CHECK-32-NEXT: andl $32767, %edi # imm = 0x7FFF
; CHECK-32-NEXT: cmpl $31745, %edi # imm = 0x7C01
; CHECK-32-NEXT: setge %bl
-; CHECK-32-NEXT: andl $32767, %esi # imm = 0x7FFF
-; CHECK-32-NEXT: cmpl $31745, %esi # imm = 0x7C01
-; CHECK-32-NEXT: setge %bh
-; CHECK-32-NEXT: addb %bh, %bh
-; CHECK-32-NEXT: orb %bl, %bh
; CHECK-32-NEXT: andl $32767, %edx # imm = 0x7FFF
; CHECK-32-NEXT: cmpl $31745, %edx # imm = 0x7C01
; CHECK-32-NEXT: setge %dl
+; CHECK-32-NEXT: addb %dl, %dl
+; CHECK-32-NEXT: orb %bl, %dl
+; CHECK-32-NEXT: andl $32767, %esi # imm = 0x7FFF
+; CHECK-32-NEXT: cmpl $31745, %esi # imm = 0x7C01
+; CHECK-32-NEXT: setge %dh
; CHECK-32-NEXT: andl $32767, %ecx # imm = 0x7FFF
; CHECK-32-NEXT: cmpl $31745, %ecx # imm = 0x7C01
; CHECK-32-NEXT: setge %cl
; CHECK-32-NEXT: addb %cl, %cl
-; CHECK-32-NEXT: orb %dl, %cl
+; CHECK-32-NEXT: orb %dh, %cl
; CHECK-32-NEXT: shlb $2, %cl
-; CHECK-32-NEXT: orb %bh, %cl
+; CHECK-32-NEXT: orb %dl, %cl
; CHECK-32-NEXT: movb %cl, (%eax)
; CHECK-32-NEXT: popl %esi
; CHECK-32-NEXT: popl %edi
@@ -909,27 +909,27 @@
; CHECK-32-NEXT: pushl %esi
; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %esi
+; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %edi
; CHECK-32-NEXT: andl $32767, %edi # imm = 0x7FFF
; CHECK-32-NEXT: cmpl $32641, %edi # imm = 0x7F81
; CHECK-32-NEXT: setge %bl
-; CHECK-32-NEXT: andl $32767, %esi # imm = 0x7FFF
-; CHECK-32-NEXT: cmpl $32641, %esi # imm = 0x7F81
-; CHECK-32-NEXT: setge %bh
-; CHECK-32-NEXT: addb %bh, %bh
-; CHECK-32-NEXT: orb %bl, %bh
; CHECK-32-NEXT: andl $32767, %edx # imm = 0x7FFF
; CHECK-32-NEXT: cmpl $32641, %edx # imm = 0x7F81
; CHECK-32-NEXT: setge %dl
+; CHECK-32-NEXT: addb %dl, %dl
+; CHECK-32-NEXT: orb %bl, %dl
+; CHECK-32-NEXT: andl $32767, %esi # imm = 0x7FFF
+; CHECK-32-NEXT: cmpl $32641, %esi # imm = 0x7F81
+; CHECK-32-NEXT: setge %dh
; CHECK-32-NEXT: andl $32767, %ecx # imm = 0x7FFF
; CHECK-32-NEXT: cmpl $32641, %ecx # imm = 0x7F81
; CHECK-32-NEXT: setge %cl
; CHECK-32-NEXT: addb %cl, %cl
-; CHECK-32-NEXT: orb %dl, %cl
+; CHECK-32-NEXT: orb %dh, %cl
; CHECK-32-NEXT: shlb $2, %cl
-; CHECK-32-NEXT: orb %bh, %cl
+; CHECK-32-NEXT: orb %dl, %cl
; CHECK-32-NEXT: movb %cl, (%eax)
; CHECK-32-NEXT: popl %esi
; CHECK-32-NEXT: popl %edi
@@ -957,24 +957,24 @@
; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-32-NEXT: andl %ecx, %edx
; CHECK-32-NEXT: cmpl $2139095041, %edx # imm = 0x7F800001
-; CHECK-32-NEXT: setge %dl
+; CHECK-32-NEXT: setge %dh
; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %esi
; CHECK-32-NEXT: andl %ecx, %esi
; CHECK-32-NEXT: cmpl $2139095041, %esi # imm = 0x7F800001
-; CHECK-32-NEXT: setge %dh
-; CHECK-32-NEXT: addb %dh, %dh
-; CHECK-32-NEXT: orb %dl, %dh
+; CHECK-32-NEXT: setge %dl
+; CHECK-32-NEXT: addb %dl, %dl
+; CHECK-32-NEXT: orb %dh, %dl
; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %esi
; CHECK-32-NEXT: andl %ecx, %esi
; CHECK-32-NEXT: cmpl $2139095041, %esi # imm = 0x7F800001
-; CHECK-32-NEXT: setge %dl
+; CHECK-32-NEXT: setge %dh
; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %ecx
; CHECK-32-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001
; CHECK-32-NEXT: setge %cl
; CHECK-32-NEXT: addb %cl, %cl
-; CHECK-32-NEXT: orb %dl, %cl
-; CHECK-32-NEXT: shlb $2, %cl
; CHECK-32-NEXT: orb %dh, %cl
+; CHECK-32-NEXT: shlb $2, %cl
+; CHECK-32-NEXT: orb %dl, %cl
; CHECK-32-NEXT: movb %cl, (%eax)
; CHECK-32-NEXT: popl %esi
; CHECK-32-NEXT: retl $4
@@ -1008,38 +1008,38 @@
; CHECK-32-NEXT: fstpl {{[0-9]+}}(%esp)
; CHECK-32-NEXT: wait
; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-32-NEXT: andl %eax, %ecx
-; CHECK-32-NEXT: xorl %edx, %edx
-; CHECK-32-NEXT: cmpl (%esp), %edx
+; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-32-NEXT: andl %eax, %edx
+; CHECK-32-NEXT: xorl %ecx, %ecx
+; CHECK-32-NEXT: cmpl (%esp), %ecx
; CHECK-32-NEXT: movl $2146435072, %esi # imm = 0x7FF00000
-; CHECK-32-NEXT: sbbl %ecx, %esi
-; CHECK-32-NEXT: setl %cl
+; CHECK-32-NEXT: sbbl %edx, %esi
+; CHECK-32-NEXT: setl %dh
; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %esi
; CHECK-32-NEXT: andl %eax, %esi
-; CHECK-32-NEXT: cmpl {{[0-9]+}}(%esp), %edx
+; CHECK-32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
; CHECK-32-NEXT: movl $2146435072, %edi # imm = 0x7FF00000
; CHECK-32-NEXT: sbbl %esi, %edi
-; CHECK-32-NEXT: setl %ch
-; CHECK-32-NEXT: addb %ch, %ch
-; CHECK-32-NEXT: orb %cl, %ch
+; CHECK-32-NEXT: setl %dl
+; CHECK-32-NEXT: addb %dl, %dl
+; CHECK-32-NEXT: orb %dh, %dl
; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %esi
; CHECK-32-NEXT: andl %eax, %esi
-; CHECK-32-NEXT: cmpl {{[0-9]+}}(%esp), %edx
+; CHECK-32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
; CHECK-32-NEXT: movl $2146435072, %edi # imm = 0x7FF00000
; CHECK-32-NEXT: sbbl %esi, %edi
-; CHECK-32-NEXT: setl %cl
+; CHECK-32-NEXT: setl %dh
; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT: cmpl {{[0-9]+}}(%esp), %edx
-; CHECK-32-NEXT: movl $2146435072, %edx # imm = 0x7FF00000
-; CHECK-32-NEXT: sbbl %eax, %edx
-; CHECK-32-NEXT: setl %dl
-; CHECK-32-NEXT: addb %dl, %dl
-; CHECK-32-NEXT: orb %cl, %dl
-; CHECK-32-NEXT: shlb $2, %dl
-; CHECK-32-NEXT: orb %ch, %dl
+; CHECK-32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; CHECK-32-NEXT: movl $2146435072, %ecx # imm = 0x7FF00000
+; CHECK-32-NEXT: sbbl %eax, %ecx
+; CHECK-32-NEXT: setl %cl
+; CHECK-32-NEXT: addb %cl, %cl
+; CHECK-32-NEXT: orb %dh, %cl
+; CHECK-32-NEXT: shlb $2, %cl
+; CHECK-32-NEXT: orb %dl, %cl
; CHECK-32-NEXT: movl 8(%ebp), %eax
-; CHECK-32-NEXT: movb %dl, (%eax)
+; CHECK-32-NEXT: movb %cl, (%eax)
; CHECK-32-NEXT: leal -8(%ebp), %esp
; CHECK-32-NEXT: popl %esi
; CHECK-32-NEXT: popl %edi
Index: llvm/test/CodeGen/X86/xmulo.ll
===================================================================
--- llvm/test/CodeGen/X86/xmulo.ll
+++ llvm/test/CodeGen/X86/xmulo.ll
@@ -438,12 +438,12 @@
; WIN32-NEXT: andb %dl, %bl
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: seto %cl
+; WIN32-NEXT: seto %bh
; WIN32-NEXT: movl %esi, %eax
; WIN32-NEXT: mull %ebp
; WIN32-NEXT: movl %eax, %esi
; WIN32-NEXT: seto %ch
-; WIN32-NEXT: orb %cl, %ch
+; WIN32-NEXT: orb %bh, %ch
; WIN32-NEXT: addl %edi, %esi
; WIN32-NEXT: movl %ebp, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
@@ -2010,11 +2010,11 @@
; WIN32-NEXT: pushl %esi
; WIN32-NEXT: pushl %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl (%eax), %esi
+; WIN32-NEXT: movl (%eax), %ecx
; WIN32-NEXT: movl 4(%eax), %eax
-; WIN32-NEXT: testl %ecx, %ecx
+; WIN32-NEXT: testl %esi, %esi
; WIN32-NEXT: setne %dl
; WIN32-NEXT: testl %eax, %eax
; WIN32-NEXT: setne %bl
@@ -2022,15 +2022,15 @@
; WIN32-NEXT: mull %ebp
; WIN32-NEXT: movl %eax, %edi
; WIN32-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; WIN32-NEXT: movl %ecx, %eax
-; WIN32-NEXT: mull %esi
-; WIN32-NEXT: movl %eax, %ecx
+; WIN32-NEXT: movl %esi, %eax
+; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl %eax, %esi
; WIN32-NEXT: seto %bh
; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
-; WIN32-NEXT: addl %edi, %ecx
-; WIN32-NEXT: movl %esi, %eax
+; WIN32-NEXT: addl %edi, %esi
+; WIN32-NEXT: movl %ecx, %eax
; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: addl %ecx, %edx
+; WIN32-NEXT: addl %esi, %edx
; WIN32-NEXT: setb %cl
; WIN32-NEXT: orb %bh, %cl
; WIN32-NEXT: orb %bl, %cl
@@ -2089,34 +2089,37 @@
; WIN32-NEXT: pushl %ebx
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
+; WIN32-NEXT: pushl %eax
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT: movl (%edx), %ebp
+; WIN32-NEXT: movl (%edx), %ecx
; WIN32-NEXT: movl 4(%edx), %esi
; WIN32-NEXT: testl %eax, %eax
; WIN32-NEXT: setne %dl
; WIN32-NEXT: testl %esi, %esi
; WIN32-NEXT: setne %bl
; WIN32-NEXT: andb %dl, %bl
-; WIN32-NEXT: mull %ebp
+; WIN32-NEXT: mull %ecx
; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: seto %cl
+; WIN32-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: mull {{[0-9]+}}(%esp)
+; WIN32-NEXT: mull %ebp
; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: seto %ch
-; WIN32-NEXT: orb %cl, %ch
+; WIN32-NEXT: seto %bh
+; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
; WIN32-NEXT: addl %edi, %esi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull %ebp
+; WIN32-NEXT: movl %ebp, %eax
+; WIN32-NEXT: mull %ecx
; WIN32-NEXT: addl %esi, %edx
; WIN32-NEXT: setb %cl
-; WIN32-NEXT: orb %ch, %cl
+; WIN32-NEXT: orb %bh, %cl
; WIN32-NEXT: orb %bl, %cl
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
; WIN32-NEXT: movl %eax, (%esi)
; WIN32-NEXT: movl %edx, 4(%esi)
; WIN32-NEXT: movl %ecx, %eax
+; WIN32-NEXT: addl $4, %esp
; WIN32-NEXT: popl %esi
; WIN32-NEXT: popl %edi
; WIN32-NEXT: popl %ebx