diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -2724,7 +2724,11 @@ GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary &Zone) { if (Zone.isTop()) { - if (Cand.SU->getDepth() > Zone.getScheduledLatency()) { + // Prefer the candidate with the lesser depth, but only if one of them has + // depth greater than the total latency scheduled so far, otherwise either + // of them could be scheduled now with no stall. + if (std::max(TryCand.SU->getDepth(), Cand.SU->getDepth()) > + Zone.getScheduledLatency()) { if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(), TryCand, Cand, GenericSchedulerBase::TopDepthReduce)) return true; @@ -2733,7 +2737,11 @@ TryCand, Cand, GenericSchedulerBase::TopPathReduce)) return true; } else { - if (Cand.SU->getHeight() > Zone.getScheduledLatency()) { + // Prefer the candidate with the lesser height, but only if one of them has + // height greater than the total latency scheduled so far, otherwise either + // of them could be scheduled now with no stall. + if (std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) > + Zone.getScheduledLatency()) { if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(), TryCand, Cand, GenericSchedulerBase::BotHeightReduce)) return true; diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll --- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll +++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll @@ -27,10 +27,10 @@ ; NONE16: fmov s1, wzr ; NONE16: fmov d2, xzr ; NONE16: movi{{(.16b)?}} v3{{(.2d)?}}, #0 -; ZEROFP: ldr h0,{{.*}} -; ZEROFP: movi v{{[0-3]+}}.2d, #0 -; ZEROFP: movi v{{[0-3]+}}.2d, #0 -; ZEROFP: movi v{{[0-3]+}}.2d, #0 +; ZEROFP-DAG: ldr h0,{{.*}} +; ZEROFP-DAG: movi v{{[0-3]+}}.2d, #0 +; ZEROFP-DAG: movi v{{[0-3]+}}.2d, #0 +; ZEROFP-DAG: movi v{{[0-3]+}}.2d, #0 ; ZERO16: movi v{{[0-3]+}}.2d, #0 ; ZERO16: movi v{{[0-3]+}}.2d, #0 ; ZERO16: movi v{{[0-3]+}}.2d, #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -590,21 +590,21 @@ ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3 -; SI-NEXT: s_movk_i32 s6, 0xff +; SI-NEXT: s_movk_i32 s0, 0xff ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, s6, v2 +; SI-NEXT: v_and_b32_e32 v1, s0, v2 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v2, s6, v3 +; SI-NEXT: v_and_b32_e32 v2, s0, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, s6, v4 +; SI-NEXT: v_and_b32_e32 v3, s0, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, s6, v0 +; SI-NEXT: v_and_b32_e32 v4, s0, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v4 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -839,21 +839,21 @@ ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3 -; SI-NEXT: s_movk_i32 s6, 0xff +; SI-NEXT: s_movk_i32 s0, 0xff ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, s6, v2 +; SI-NEXT: v_and_b32_e32 v1, s0, v2 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v2, s6, v3 +; SI-NEXT: v_and_b32_e32 v2, s0, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, s6, v4 +; SI-NEXT: v_and_b32_e32 v3, s0, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, s6, v0 +; SI-NEXT: v_and_b32_e32 v4, s0, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v4 -; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -141,18 +141,18 @@ ; CHECK-NEXT: s_cmp_lg_u32 s4, 0 ; CHECK-NEXT: s_cbranch_scc0 BB4_6 ; CHECK-NEXT: ; %bb.1: ; %bb2 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, const.ptr@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, const.ptr@gotpcrel32@hi+4 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; CHECK-NEXT: s_mov_b32 s4, -1 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, const.ptr@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, const.ptr@gotpcrel32@hi+4 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: flat_load_dword v0, v[0:1] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 +; CHECK-NEXT: s_mov_b32 s4, -1 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, 1.0, v0 ; CHECK-NEXT: s_xor_b64 s[8:9], vcc, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -1555,40 +1555,40 @@ ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v3, s4 -; CI-NEXT: ds_inc_rtn_u32 v4, v3, v2 -; CI-NEXT: ds_inc_rtn_u32 v5, v3, v2 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, s4 +; CI-NEXT: ds_inc_rtn_u32 v4, v1, v0 +; CI-NEXT: ds_inc_rtn_u32 v5, v1, v0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: s_waitcnt lgkmcnt(1) -; CI-NEXT: flat_store_dword v[2:3], v4 +; CI-NEXT: flat_store_dword v[0:1], v4 ; CI-NEXT: s_waitcnt lgkmcnt(1) -; CI-NEXT: flat_store_dword v[0:1], v5 +; CI-NEXT: flat_store_dword v[2:3], v5 ; CI-NEXT: s_endpgm ; ; VI-LABEL: nocse_lds_atomic_inc_ret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: ds_inc_rtn_u32 v4, v3, v2 -; VI-NEXT: ds_inc_rtn_u32 v5, v3, v2 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: ds_inc_rtn_u32 v4, v1, v0 +; VI-NEXT: ds_inc_rtn_u32 v5, v1, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_waitcnt lgkmcnt(1) -; VI-NEXT: flat_store_dword v[2:3], v4 +; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: s_waitcnt lgkmcnt(1) -; VI-NEXT: flat_store_dword v[0:1], v5 +; VI-NEXT: flat_store_dword v[2:3], v5 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: nocse_lds_atomic_inc_ret_i32: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -848,17 +848,17 @@ ; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX7-NEXT: buffer_load_dword v3, v[1:2], s[0:3], 0 addr64 ; GFX7-NEXT: buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64 offset:8 +; GFX7-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 offset:8 ; GFX7-NEXT: s_cmp_lg_u32 s8, 0 -; GFX7-NEXT: s_cselect_b32 s6, 1, 0 -; GFX7-NEXT: s_and_b32 s0, 1, s6 +; GFX7-NEXT: s_cselect_b32 s0, 1, 0 +; GFX7-NEXT: s_and_b32 s0, 1, s0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_and_b64 vcc, vcc, s[0:1] ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_fmas_f32 v0, v3, v4, v0 +; GFX7-NEXT: v_div_fmas_f32 v0, v3, v4, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll @@ -950,21 +950,21 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, ; GFX6-LABEL: simplify_bfe_u32_multi_use_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s8, s[2:3], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s0, s0, 63 -; GFX6-NEXT: s_bfe_u32 s1, s0, 0x20002 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_and_b32 s8, s8, 63 +; GFX6-NEXT: s_bfe_u32 s9, s8, 0x20002 +; GFX6-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -20,53 +20,53 @@ ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 11, v0 ; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[2:3], off offset:-6 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[2:3], off offset:-5 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[2:3], off offset:-4 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[2:3], off offset:-3 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[2:3], off offset:-2 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[2:3], off offset:-1 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[2:3], off offset:-6 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v[2:3], off offset:-5 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[2:3], off offset:-4 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[2:3], off offset:-3 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[2:3], off offset:-2 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[2:3], off offset:-1 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[2:3], off ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v[2:3], off offset:-10 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v13, v[2:3], off offset:-9 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v14, v[2:3], off offset:-8 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[2:3], off offset:-9 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[2:3], off offset:-8 ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[2:3], off offset:-7 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, 0xff +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 0xff ; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, 8 ; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s5, 8 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v4 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v3 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v4 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v3 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v5, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v8, v13, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v10, v11, v4 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v3 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v12, v4 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v10, v10, v3 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v13 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v11, s4, v11 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v13, s4, v14 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v12, s4, v12 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, v4, v6 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, v3, v4 ; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v13 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v8 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v4, v9, v4, v5 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v11 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v3 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v2, v6, v7 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v4, v5, v8 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v3, v7, v3, v8 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v10 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v11 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v2, v4, v5 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v3, v6, v7 ; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align1: @@ -85,61 +85,62 @@ ; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s6, 0 ; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NOUNALIGNED-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:5 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:6 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:7 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:8 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:9 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:10 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, v[0:1], s[4:7], 0 addr64 offset:11 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, v[0:1], s[4:7], 0 addr64 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, v[0:1], s[4:7], 0 addr64 offset:1 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v12, v[0:1], s[4:7], 0 addr64 offset:2 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v13, v[0:1], s[4:7], 0 addr64 offset:3 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:4 -; GFX7-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, 0xff -; GFX7-NOUNALIGNED-NEXT: s_movk_i32 s8, 0xff +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:1 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:3 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:4 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:5 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:6 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, v[0:1], s[4:7], 0 addr64 offset:7 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, v[0:1], s[4:7], 0 addr64 offset:8 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, v[0:1], s[4:7], 0 addr64 offset:9 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v12, v[0:1], s[4:7], 0 addr64 offset:10 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:11 +; GFX7-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, 0xff +; GFX7-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, v3, v2 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, v4, v2 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v2 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v4 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v2 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v5 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v2 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, s4, v6 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v2 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v1 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v9, v2 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v1 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s8, v10 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v1 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v10, s8, v11 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v10, v10, v1 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v11, s8, v12 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 8, v10 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v11, v1 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v12, v12, v1 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s8, v0 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v9 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 8, v11 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v12, 24, v0 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v2, v1 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v6, v5 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v10, v9 ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v12, s8, v13 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v9 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v3, v6, v7 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 24, v12 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v3, v3, v8 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v10 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v4, v0, v4 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v1, v11 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v4, v5 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v2, v11 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v2, v12 ; GFX7-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 1 ret <3 x i32> %load @@ -158,27 +159,27 @@ ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 10, v0 ; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[2:3], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[2:3], off ; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v[2:3], off offset:-8 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[2:3], off offset:-6 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v7, v[2:3], off offset:-4 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[2:3], off offset:-6 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[2:3], off offset:-4 ; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[2:3], off offset:-2 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v4 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) ; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, v7, v4 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v3 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v6, v4, v3 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v5, v3, v6 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, v4, v5 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, v3, v4 ; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align2: @@ -203,18 +204,18 @@ ; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:4 ; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:6 ; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:8 -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s8, 0xffff +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s8, v3 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v3 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s8, v4 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v4 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s8, v5 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v5 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s8, v6 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v6 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, s8, v0 -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s8, v2 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, s4, v0 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v2 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v0 @@ -432,58 +433,58 @@ ; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v16, s2 ; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 9 ; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v20, v[10:11], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[12:13], off -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s3 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s2 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v19, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v18, s2 ; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 10 ; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0 ; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 11 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v13, v[14:15], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v14, v[16:17], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v15, v[10:11], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v20, v[10:11], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v21, v[12:13], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v14, v[14:15], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v15, v[16:17], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v16, v[18:19], off +; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s3 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, s1 ; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s2 -; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v16, v[10:11], off -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s1 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s0 +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, s0 ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[10:11], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[12:13], off ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v[2:3], off ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[4:5], off ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v[6:7], off ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[8:9], off -; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s5, 8 -; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v18, 0xff -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v19, 8 +; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s1, 8 +; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, 0xff +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, 8 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s0, v1 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v2 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v3 ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v2 -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v12, v18 -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v13, v18 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v21, v5 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v14, v5 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v4, v18, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v4, v5, v0 ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v0, v1, v2 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v19, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v10, v18 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v11, v5 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v16, v18 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v14, v18, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v10, v5 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v15, v5, v0 ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v0, v1, v2 @@ -508,60 +509,59 @@ ; GFX7-NOUNALIGNED: ; %bb.0: ; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s2, -1 ; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:5 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:6 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 offset:7 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:8 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:9 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:10 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:11 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, off, s[0:3], 0 offset:1 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, off, s[0:3], 0 offset:2 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, off, s[0:3], 0 offset:3 -; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v12, off, s[0:3], 0 offset:4 -; GFX7-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0xff -; GFX7-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:1 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:2 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 offset:3 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:4 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:5 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:6 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:7 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 offset:8 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, off, s[0:3], 0 offset:9 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, off, s[0:3], 0 offset:10 +; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, off, s[0:3], 0 offset:11 +; GFX7-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, 0xff +; GFX7-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v1, v0 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v2, v0 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, v3, v0 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, v4, v0 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v0 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v0 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v12 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v12 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v8, s4, v8 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v9, s4, v9 -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v12 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v10, s4, v10 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 8, v9 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v12 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v11, s4, v11 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v10, v10, v12 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v12 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v12, s4, v12 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 24, v11 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 24, v0 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v8, v7 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v11, v12 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v4, v5 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v9 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v4, v6 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v4, v8, v9 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v10 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v4, v10 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v7 ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v2, v11 ; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 @@ -613,21 +613,21 @@ ; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[4:5], off ; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v[6:7], off ; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[8:9], off -; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, 0xffff +; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s0, v1 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v3, v12 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v3, v5 ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v2, v12, v0 -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v10, v12 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v2, v5, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v10, v5 ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v4, v12, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v4, v5, v0 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog @@ -656,19 +656,19 @@ ; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4 ; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:6 ; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:8 -; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s0, v5 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v0 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -96,8 +96,8 @@ ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, gv3@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, gv3@gotpcrel32@hi+4 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -116,10 +116,10 @@ ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, gv0@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, gv0@gotpcrel32@hi+4 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, gv1@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, gv1@gotpcrel32@hi+4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll @@ -162,9 +162,9 @@ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: s_mov_b32 s8, 0 +; GFX6-NEXT: s_mov_b32 s4, 0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load i32, i32 addrspace(1)* %ptr @@ -180,8 +180,8 @@ ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -192,8 +192,8 @@ ; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: s_mov_b32 s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -204,11 +204,11 @@ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: s_mov_b32 s8, 0 ; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: s_mov_b32 s5, 0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s8 +; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load i32, i32 addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -105,8 +105,8 @@ ; GCN-LABEL: {{^}}v_test_add_v2i16_inline_neg1: ; GFX9: v_pk_sub_u16 v{{[0-9]+}}, v{{[0-9]+}}, 1 op_sel_hi:[1,0]{{$}} -; VI: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1 -; VI: flat_load_dword [[LOAD:v[0-9]+]] +; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1 +; VI-DAG: flat_load_dword [[LOAD:v[0-9]+]] ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD]], v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD]] ; VI: v_or_b32_e32 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -5059,16 +5059,16 @@ ; ; GCN-LABEL: udiv_i64_pow2k_denom: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_lshr_b64 s[4:5], s[6:7], 12 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_lshr_b64 s[0:1], s[2:3], 12 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %r = udiv i64 %x, 4096 store i64 %r, i64 addrspace(1)* %out @@ -5703,20 +5703,20 @@ ; ; GCN-LABEL: sdiv_i64_pow2k_denom: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i32 s4, s7, 31 -; GCN-NEXT: s_lshr_b32 s4, s4, 20 -; GCN-NEXT: s_add_u32 s4, s6, s4 -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_addc_u32 s5, s7, 0 -; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_ashr_i32 s0, s3, 31 +; GCN-NEXT: s_lshr_b32 s0, s0, 20 +; GCN-NEXT: s_add_u32 s0, s2, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %r = sdiv i64 %x, 4096 store i64 %r, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -339,35 +339,34 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, 0xff00ff -; SI-NEXT: s_mov_b32 s8, 0xf0f0f0f -; SI-NEXT: s_mov_b32 s9, 0xf0f0f0f0 -; SI-NEXT: s_mov_b32 s10, 0x33333333 -; SI-NEXT: s_mov_b32 s11, 0xcccccccc -; SI-NEXT: s_mov_b32 s0, 0x55555555 -; SI-NEXT: s_mov_b32 s1, 0xaaaaaaaa +; SI-NEXT: s_mov_b32 s0, 0xff00ff +; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f +; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0 +; SI-NEXT: s_mov_b32 s3, 0x33333333 +; SI-NEXT: s_mov_b32 s6, 0xcccccccc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v2, v0, v0, 8 ; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 ; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8 ; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 -; SI-NEXT: v_bfi_b32 v2, s6, v0, v2 -; SI-NEXT: v_bfi_b32 v4, s6, v1, v3 -; SI-NEXT: v_and_b32_e32 v1, s8, v2 -; SI-NEXT: v_and_b32_e32 v0, s8, v4 -; SI-NEXT: v_and_b32_e32 v3, s9, v2 -; SI-NEXT: v_and_b32_e32 v2, s9, v4 +; SI-NEXT: v_bfi_b32 v2, s0, v0, v2 +; SI-NEXT: v_bfi_b32 v4, s0, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, s1, v2 +; SI-NEXT: v_and_b32_e32 v0, s1, v4 +; SI-NEXT: v_and_b32_e32 v3, s2, v2 +; SI-NEXT: v_and_b32_e32 v2, s2, v4 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s0, 0x55555555 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, s10, v3 -; SI-NEXT: v_and_b32_e32 v0, s10, v2 -; SI-NEXT: v_and_b32_e32 v3, s11, v3 -; SI-NEXT: v_and_b32_e32 v2, s11, v2 +; SI-NEXT: v_and_b32_e32 v1, s3, v3 +; SI-NEXT: v_and_b32_e32 v0, s3, v2 +; SI-NEXT: v_and_b32_e32 v3, s6, v3 +; SI-NEXT: v_and_b32_e32 v2, s6, v2 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2 +; SI-NEXT: s_mov_b32 s1, 0xaaaaaaaa ; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 ; SI-NEXT: v_and_b32_e32 v1, s0, v3 @@ -376,6 +375,7 @@ ; SI-NEXT: v_and_b32_e32 v2, s1, v2 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -386,33 +386,33 @@ ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; FLAT-NEXT: s_mov_b32 s6, 0x10203 -; FLAT-NEXT: s_mov_b32 s2, 0x33333333 -; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc +; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0 +; FLAT-NEXT: s_mov_b32 s3, 0x33333333 +; FLAT-NEXT: s_mov_b32 s6, 0xcccccccc ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s1 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f -; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f0 +; FLAT-NEXT: s_mov_b32 s0, 0x10203 +; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; FLAT-NEXT: v_perm_b32 v2, 0, v0, s6 -; FLAT-NEXT: v_perm_b32 v4, 0, v1, s6 -; FLAT-NEXT: v_and_b32_e32 v1, s0, v2 -; FLAT-NEXT: v_and_b32_e32 v0, s0, v4 -; FLAT-NEXT: v_and_b32_e32 v3, s1, v2 -; FLAT-NEXT: v_and_b32_e32 v2, s1, v4 +; FLAT-NEXT: v_perm_b32 v2, 0, v0, s0 +; FLAT-NEXT: v_perm_b32 v4, 0, v1, s0 +; FLAT-NEXT: v_and_b32_e32 v1, s1, v2 +; FLAT-NEXT: v_and_b32_e32 v0, s1, v4 +; FLAT-NEXT: v_and_b32_e32 v3, s2, v2 +; FLAT-NEXT: v_and_b32_e32 v2, s2, v4 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] ; FLAT-NEXT: s_mov_b32 s0, 0x55555555 ; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_and_b32_e32 v1, s2, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s2, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s3, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s3, v2 +; FLAT-NEXT: v_and_b32_e32 v1, s3, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s3, v2 +; FLAT-NEXT: v_and_b32_e32 v3, s6, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s6, v2 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3] ; FLAT-NEXT: s_mov_b32 s1, 0xaaaaaaaa @@ -600,13 +600,13 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s8, 0xff00ff -; SI-NEXT: s_mov_b32 s9, 0xf0f0f0f -; SI-NEXT: s_mov_b32 s10, 0xf0f0f0f0 -; SI-NEXT: s_mov_b32 s11, 0x33333333 -; SI-NEXT: s_mov_b32 s12, 0xcccccccc -; SI-NEXT: s_mov_b32 s13, 0x55555555 -; SI-NEXT: s_mov_b32 s14, 0xaaaaaaaa +; SI-NEXT: s_mov_b32 s0, 0xff00ff +; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f +; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0 +; SI-NEXT: s_mov_b32 s3, 0x33333333 +; SI-NEXT: s_mov_b32 s8, 0xcccccccc +; SI-NEXT: s_mov_b32 s9, 0x55555555 +; SI-NEXT: s_mov_b32 s10, 0xaaaaaaaa ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v4, v2, v2, 8 @@ -617,18 +617,18 @@ ; SI-NEXT: v_alignbit_b32 v7, v1, v1, 8 ; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 ; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24 -; SI-NEXT: v_bfi_b32 v2, s8, v2, v4 -; SI-NEXT: v_bfi_b32 v4, s8, v3, v5 -; SI-NEXT: v_bfi_b32 v6, s8, v0, v6 -; SI-NEXT: v_bfi_b32 v8, s8, v1, v7 -; SI-NEXT: v_and_b32_e32 v1, s9, v2 -; SI-NEXT: v_and_b32_e32 v0, s9, v4 -; SI-NEXT: v_and_b32_e32 v3, s10, v2 -; SI-NEXT: v_and_b32_e32 v2, s10, v4 -; SI-NEXT: v_and_b32_e32 v5, s9, v6 -; SI-NEXT: v_and_b32_e32 v4, s9, v8 -; SI-NEXT: v_and_b32_e32 v7, s10, v6 -; SI-NEXT: v_and_b32_e32 v6, s10, v8 +; SI-NEXT: v_bfi_b32 v2, s0, v2, v4 +; SI-NEXT: v_bfi_b32 v4, s0, v3, v5 +; SI-NEXT: v_bfi_b32 v6, s0, v0, v6 +; SI-NEXT: v_bfi_b32 v8, s0, v1, v7 +; SI-NEXT: v_and_b32_e32 v1, s1, v2 +; SI-NEXT: v_and_b32_e32 v0, s1, v4 +; SI-NEXT: v_and_b32_e32 v3, s2, v2 +; SI-NEXT: v_and_b32_e32 v2, s2, v4 +; SI-NEXT: v_and_b32_e32 v5, s1, v6 +; SI-NEXT: v_and_b32_e32 v4, s1, v8 +; SI-NEXT: v_and_b32_e32 v7, s2, v6 +; SI-NEXT: v_and_b32_e32 v6, s2, v8 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4 ; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 4 @@ -637,14 +637,14 @@ ; SI-NEXT: v_or_b32_e32 v2, v2, v0 ; SI-NEXT: v_or_b32_e32 v7, v7, v5 ; SI-NEXT: v_or_b32_e32 v6, v6, v4 -; SI-NEXT: v_and_b32_e32 v1, s11, v3 -; SI-NEXT: v_and_b32_e32 v0, s11, v2 -; SI-NEXT: v_and_b32_e32 v5, s11, v7 -; SI-NEXT: v_and_b32_e32 v4, s11, v6 -; SI-NEXT: v_and_b32_e32 v3, s12, v3 -; SI-NEXT: v_and_b32_e32 v2, s12, v2 -; SI-NEXT: v_and_b32_e32 v7, s12, v7 -; SI-NEXT: v_and_b32_e32 v6, s12, v6 +; SI-NEXT: v_and_b32_e32 v1, s3, v3 +; SI-NEXT: v_and_b32_e32 v0, s3, v2 +; SI-NEXT: v_and_b32_e32 v5, s3, v7 +; SI-NEXT: v_and_b32_e32 v4, s3, v6 +; SI-NEXT: v_and_b32_e32 v3, s8, v3 +; SI-NEXT: v_and_b32_e32 v2, s8, v2 +; SI-NEXT: v_and_b32_e32 v7, s8, v7 +; SI-NEXT: v_and_b32_e32 v6, s8, v6 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2 ; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 2 @@ -653,14 +653,14 @@ ; SI-NEXT: v_or_b32_e32 v2, v2, v0 ; SI-NEXT: v_or_b32_e32 v7, v7, v5 ; SI-NEXT: v_or_b32_e32 v6, v6, v4 -; SI-NEXT: v_and_b32_e32 v1, s13, v3 -; SI-NEXT: v_and_b32_e32 v0, s13, v2 -; SI-NEXT: v_and_b32_e32 v5, s13, v7 -; SI-NEXT: v_and_b32_e32 v4, s13, v6 -; SI-NEXT: v_and_b32_e32 v3, s14, v3 -; SI-NEXT: v_and_b32_e32 v2, s14, v2 -; SI-NEXT: v_and_b32_e32 v7, s14, v7 -; SI-NEXT: v_and_b32_e32 v6, s14, v6 +; SI-NEXT: v_and_b32_e32 v1, s9, v3 +; SI-NEXT: v_and_b32_e32 v0, s9, v2 +; SI-NEXT: v_and_b32_e32 v5, s9, v7 +; SI-NEXT: v_and_b32_e32 v4, s9, v6 +; SI-NEXT: v_and_b32_e32 v3, s10, v3 +; SI-NEXT: v_and_b32_e32 v2, s10, v2 +; SI-NEXT: v_and_b32_e32 v7, s10, v7 +; SI-NEXT: v_and_b32_e32 v6, s10, v6 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 ; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 @@ -677,33 +677,33 @@ ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; FLAT-NEXT: s_mov_b32 s10, 0x10203 -; FLAT-NEXT: s_mov_b32 s2, 0x33333333 -; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc +; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0 +; FLAT-NEXT: s_mov_b32 s3, 0x33333333 +; FLAT-NEXT: s_mov_b32 s8, 0xcccccccc ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s1 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f -; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f0 -; FLAT-NEXT: s_mov_b32 s8, 0x55555555 -; FLAT-NEXT: s_mov_b32 s9, 0xaaaaaaaa +; FLAT-NEXT: s_mov_b32 s0, 0x10203 +; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f +; FLAT-NEXT: s_mov_b32 s9, 0x55555555 +; FLAT-NEXT: s_mov_b32 s10, 0xaaaaaaaa ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; FLAT-NEXT: v_perm_b32 v6, 0, v0, s10 -; FLAT-NEXT: v_perm_b32 v4, 0, v3, s10 -; FLAT-NEXT: v_perm_b32 v2, 0, v2, s10 -; FLAT-NEXT: v_perm_b32 v8, 0, v1, s10 -; FLAT-NEXT: v_and_b32_e32 v1, s0, v2 -; FLAT-NEXT: v_and_b32_e32 v0, s0, v4 -; FLAT-NEXT: v_and_b32_e32 v3, s1, v2 -; FLAT-NEXT: v_and_b32_e32 v2, s1, v4 -; FLAT-NEXT: v_and_b32_e32 v5, s0, v6 -; FLAT-NEXT: v_and_b32_e32 v4, s0, v8 -; FLAT-NEXT: v_and_b32_e32 v7, s1, v6 -; FLAT-NEXT: v_and_b32_e32 v6, s1, v8 +; FLAT-NEXT: v_perm_b32 v6, 0, v0, s0 +; FLAT-NEXT: v_perm_b32 v4, 0, v3, s0 +; FLAT-NEXT: v_perm_b32 v2, 0, v2, s0 +; FLAT-NEXT: v_perm_b32 v8, 0, v1, s0 +; FLAT-NEXT: v_and_b32_e32 v1, s1, v2 +; FLAT-NEXT: v_and_b32_e32 v0, s1, v4 +; FLAT-NEXT: v_and_b32_e32 v3, s2, v2 +; FLAT-NEXT: v_and_b32_e32 v2, s2, v4 +; FLAT-NEXT: v_and_b32_e32 v5, s1, v6 +; FLAT-NEXT: v_and_b32_e32 v4, s1, v8 +; FLAT-NEXT: v_and_b32_e32 v7, s2, v6 +; FLAT-NEXT: v_and_b32_e32 v6, s2, v8 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] ; FLAT-NEXT: v_lshlrev_b64 v[4:5], 4, v[4:5] @@ -712,14 +712,14 @@ ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 ; FLAT-NEXT: v_or_b32_e32 v7, v7, v5 ; FLAT-NEXT: v_or_b32_e32 v6, v6, v4 -; FLAT-NEXT: v_and_b32_e32 v1, s2, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s2, v2 -; FLAT-NEXT: v_and_b32_e32 v5, s2, v7 -; FLAT-NEXT: v_and_b32_e32 v4, s2, v6 -; FLAT-NEXT: v_and_b32_e32 v3, s3, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s3, v2 -; FLAT-NEXT: v_and_b32_e32 v7, s3, v7 -; FLAT-NEXT: v_and_b32_e32 v6, s3, v6 +; FLAT-NEXT: v_and_b32_e32 v1, s3, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s3, v2 +; FLAT-NEXT: v_and_b32_e32 v5, s3, v7 +; FLAT-NEXT: v_and_b32_e32 v4, s3, v6 +; FLAT-NEXT: v_and_b32_e32 v3, s8, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s8, v2 +; FLAT-NEXT: v_and_b32_e32 v7, s8, v7 +; FLAT-NEXT: v_and_b32_e32 v6, s8, v6 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3] ; FLAT-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] @@ -728,14 +728,14 @@ ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 ; FLAT-NEXT: v_or_b32_e32 v7, v7, v5 ; FLAT-NEXT: v_or_b32_e32 v6, v6, v4 -; FLAT-NEXT: v_and_b32_e32 v1, s8, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s8, v2 -; FLAT-NEXT: v_and_b32_e32 v5, s8, v7 -; FLAT-NEXT: v_and_b32_e32 v4, s8, v6 -; FLAT-NEXT: v_and_b32_e32 v3, s9, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s9, v2 -; FLAT-NEXT: v_and_b32_e32 v7, s9, v7 -; FLAT-NEXT: v_and_b32_e32 v6, s9, v6 +; FLAT-NEXT: v_and_b32_e32 v1, s9, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s9, v2 +; FLAT-NEXT: v_and_b32_e32 v5, s9, v7 +; FLAT-NEXT: v_and_b32_e32 v4, s9, v6 +; FLAT-NEXT: v_and_b32_e32 v3, s10, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s10, v2 +; FLAT-NEXT: v_and_b32_e32 v7, s10, v7 +; FLAT-NEXT: v_and_b32_e32 v6, s10, v6 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] ; FLAT-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -33,17 +33,17 @@ ; ; VI-LABEL: test_bswap_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_load_dword s4, s[6:7], 0x0 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_load_dword s0, s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_perm_b32 v0, 0, s4, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_perm_b32 v0, 0, s0, v0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %val = load i32, i32 addrspace(1)* %in, align 4 %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone @@ -72,18 +72,18 @@ ; ; VI-LABEL: test_bswap_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_perm_b32 v1, 0, s5, v0 -; VI-NEXT: v_perm_b32 v0, 0, s4, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: v_perm_b32 v1, 0, s3, v0 +; VI-NEXT: v_perm_b32 v0, 0, s2, v0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone @@ -123,14 +123,14 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_perm_b32 v3, 0, s7, v0 -; VI-NEXT: v_perm_b32 v2, 0, s6, v0 -; VI-NEXT: v_perm_b32 v1, 0, s5, v0 -; VI-NEXT: v_perm_b32 v0, 0, s4, v0 +; VI-NEXT: v_perm_b32 v3, 0, s11, v0 +; VI-NEXT: v_perm_b32 v2, 0, s10, v0 +; VI-NEXT: v_perm_b32 v1, 0, s9, v0 +; VI-NEXT: v_perm_b32 v0, 0, s8, v0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 @@ -226,18 +226,18 @@ ; ; VI-LABEL: test_bswap_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x10203 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 +; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_perm_b32 v1, 0, s4, v0 -; VI-NEXT: v_perm_b32 v0, 0, s5, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: v_perm_b32 v1, 0, s2, v0 +; VI-NEXT: v_perm_b32 v0, 0, s3, v0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %val = load i64, i64 addrspace(1)* %in, align 8 %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone @@ -277,14 +277,14 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_perm_b32 v3, 0, s6, v0 -; VI-NEXT: v_perm_b32 v2, 0, s7, v0 -; VI-NEXT: v_perm_b32 v1, 0, s4, v0 -; VI-NEXT: v_perm_b32 v0, 0, s5, v0 +; VI-NEXT: v_perm_b32 v3, 0, s10, v0 +; VI-NEXT: v_perm_b32 v2, 0, s11, v0 +; VI-NEXT: v_perm_b32 v1, 0, s8, v0 +; VI-NEXT: v_perm_b32 v0, 0, s9, v0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -199,17 +199,17 @@ ; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: global_load_ushort v4, v[2:3], off -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: global_load_ushort v2, v[0:1], off ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v4, off, s[0:3], 0 offset:4 -; GCN-NEXT: global_load_ushort v4, v[2:3], off offset:2 +; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 +; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v4, off, s[0:3], 0 offset:6 -; GCN-NEXT: global_load_ushort v2, v[2:3], off offset:4 +; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:6 +; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:4 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:8 ; GCN-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -51,24 +51,23 @@ ; SI-LABEL: test_copy_v4i8_x2: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x2: @@ -78,17 +77,17 @@ ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s6 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 @@ -106,28 +105,25 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s22, 0 -; SI-NEXT: s_mov_b32 s23, s11 +; SI-NEXT: s_mov_b32 s14, 0 +; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[20:21], s[6:7] +; SI-NEXT: s_mov_b64 s[12:13], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[20:23], 0 addr64 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s16, s4 -; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x3: @@ -144,17 +140,15 @@ ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: s_mov_b32 s12, s2 ; VI-NEXT: s_mov_b32 s13, s3 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s16, s4 -; VI-NEXT: s_mov_b32 s17, s5 -; VI-NEXT: s_mov_b32 s18, s10 -; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0 -; VI-NEXT: buffer_store_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -168,68 +162,70 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8_x4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s14, 0 -; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s20, s8 -; SI-NEXT: s_mov_b32 s21, s9 -; SI-NEXT: s_mov_b32 s8, s10 -; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: s_mov_b32 s18, s2 -; SI-NEXT: s_mov_b32 s19, s3 -; SI-NEXT: s_mov_b32 s22, s2 -; SI-NEXT: s_mov_b32 s23, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s22, s10 +; SI-NEXT: s_mov_b32 s23, s11 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s20, s6 +; SI-NEXT: s_mov_b32 s21, s7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s16, s8 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s17, s9 -; VI-NEXT: s_mov_b32 s8, s10 -; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s18, s2 -; VI-NEXT: s_mov_b32 s19, s3 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s22, s10 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_mov_b32 s23, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s20, s6 +; VI-NEXT: s_mov_b32 s21, s7 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; VI-NEXT: buffer_store_dword v0, off, s[16:19], 0 -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -245,23 +241,22 @@ ; SI-LABEL: test_copy_v4i8_extra_use: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_mov_b32 s12, 0xff00 ; SI-NEXT: s_movk_i32 s13, 0xff -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -277,47 +272,47 @@ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_extra_use: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_movk_i32 s10, 0x900 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_movk_i32 s12, 0xff00 +; VI-NEXT: s_movk_i32 s13, 0xff +; VI-NEXT: s_movk_i32 s14, 0x900 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_movk_i32 s8, 0xff00 -; VI-NEXT: s_movk_i32 s9, 0xff -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_and_b32_e32 v4, s8, v1 +; VI-NEXT: v_and_b32_e32 v4, s12, v1 ; VI-NEXT: v_add_u16_e32 v1, 9, v1 ; VI-NEXT: v_add_u16_e32 v3, 9, v0 -; VI-NEXT: v_and_b32_e32 v1, s9, v1 +; VI-NEXT: v_and_b32_e32 v1, s13, v1 ; VI-NEXT: v_or_b32_e32 v1, v4, v1 -; VI-NEXT: v_and_b32_e32 v2, s8, v0 -; VI-NEXT: v_and_b32_e32 v3, s9, v3 +; VI-NEXT: v_and_b32_e32 v2, s12, v0 +; VI-NEXT: v_and_b32_e32 v3, s13, v3 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v1, s10, v1 -; VI-NEXT: v_add_u16_e32 v2, s10, v2 +; VI-NEXT: v_add_u16_e32 v1, s14, v1 +; VI-NEXT: v_add_u16_e32 v2, s14, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -334,35 +329,32 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s18, 0 -; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s14, 0 +; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: s_mov_b64 s[12:13], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 -; SI-NEXT: s_mov_b32 s12, s4 -; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s4, 0xff00 -; SI-NEXT: s_movk_i32 s5, 0xff +; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 +; SI-NEXT: s_mov_b32 s16, 0xff00 +; SI-NEXT: s_movk_i32 s17, 0xff ; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s0, s2 -; SI-NEXT: s_mov_b32 s1, s3 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v4, s4, v1 +; SI-NEXT: v_and_b32_e32 v4, s16, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1 -; SI-NEXT: v_and_b32_e32 v2, s4, v0 -; SI-NEXT: v_and_b32_e32 v3, s5, v3 +; SI-NEXT: v_and_b32_e32 v2, s16, v0 +; SI-NEXT: v_and_b32_e32 v3, s17, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v1, s5, v1 +; SI-NEXT: v_and_b32_e32 v1, s17, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2 ; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -370,51 +362,49 @@ ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x2_extra_use: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_movk_i32 s16, 0xff00 +; VI-NEXT: s_movk_i32 s17, 0xff +; VI-NEXT: s_movk_i32 s18, 0x900 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_movk_i32 s4, 0xff00 -; VI-NEXT: s_mov_b32 s13, s5 -; VI-NEXT: s_movk_i32 s5, 0xff -; VI-NEXT: s_movk_i32 s6, 0x900 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_and_b32_e32 v4, s4, v1 +; VI-NEXT: v_and_b32_e32 v4, s16, v1 ; VI-NEXT: v_add_u16_e32 v1, 9, v1 ; VI-NEXT: v_add_u16_e32 v3, 9, v0 -; VI-NEXT: v_and_b32_e32 v1, s5, v1 +; VI-NEXT: v_and_b32_e32 v1, s17, v1 ; VI-NEXT: v_or_b32_e32 v1, v4, v1 -; VI-NEXT: v_and_b32_e32 v2, s4, v0 -; VI-NEXT: v_and_b32_e32 v3, s5, v3 +; VI-NEXT: v_and_b32_e32 v2, s16, v0 +; VI-NEXT: v_and_b32_e32 v3, s17, v3 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v1, s6, v1 -; VI-NEXT: v_add_u16_e32 v2, s6, v2 +; VI-NEXT: v_add_u16_e32 v1, s18, v1 +; VI-NEXT: v_add_u16_e32 v2, s18, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; VI-NEXT: buffer_store_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -429,18 +419,18 @@ define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v3i8_align4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b64 s[4:5], s[10:11] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -449,17 +439,15 @@ ; ; VI-LABEL: test_copy_v3i8_align4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -526,27 +526,27 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; VI-NEXT: v_mov_b32_e32 v5, 0 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0 +; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v6, s3 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v3 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v3 +; VI-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v0, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_ffbh_u32_e32 v6, v3 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc -; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: v_ffbh_u32_e32 v5, v0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 32, v5 +; VI-NEXT: v_ffbh_u32_e32 v6, v1 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, 64, v1, vcc +; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i64: @@ -621,18 +621,18 @@ ; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] ; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0 -; VI-NEXT: flat_load_dwordx2 v[0:1], v[1:2] ; VI-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v2, v0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2 -; VI-NEXT: v_ffbh_u32_e32 v5, v1 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, 64, v1, vcc +; VI-NEXT: v_ffbh_u32_e32 v0, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_ffbh_u32_e32 v5, v2 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc ; VI-NEXT: flat_store_dword v[3:4], v0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -177,11 +177,11 @@ ; SI-NOSDWA: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} ; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]] ; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]] -; SI-SDWA: v_or_b32_e32 ; SI-SDWA: v_or_b32_sdwa +; SI-SDWA: v_or_b32_e32 +; SI-SDWA: v_or_b32_e32 ; SI-SDWA: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} ; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]] -; SI-SDWA: v_or_b32_e32 ; SI-SDWA: v_or_b32_sdwa ; SI-SDWA: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} ; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]] diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -636,20 +636,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: s_movk_i32 s12, 0xff +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_movk_i32 s8, 0xff +; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v4 @@ -659,57 +658,58 @@ ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v0, s12, v4 +; SI-NEXT: v_and_b32_e32 v0, s8, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5 ; SI-NEXT: v_or_b32_e32 v0, v7, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; SI-NEXT: v_and_b32_e32 v2, s12, v2 +; SI-NEXT: v_and_b32_e32 v2, s8, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: v_mov_b32_e32 v5, 9 +; VI-NEXT: s_movk_i32 s8, 0x900 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, 9 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: s_movk_i32 s0, 0x900 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v5 -; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v5 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v5 -; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v5 -; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 +; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v4 +; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 +; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v5 -; VI-NEXT: v_add_u16_e32 v8, 9, v5 -; VI-NEXT: v_add_u16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v4 +; VI-NEXT: v_add_u16_e32 v8, 9, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6 ; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u16_e32 v0, s0, v0 +; VI-NEXT: v_mov_b32_e32 v2, s8 +; VI-NEXT: v_add_u16_e32 v0, s8, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -733,29 +733,30 @@ ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:5 -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:6 -; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1 ; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:3 -; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:4 +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:3 +; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:4 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:5 +; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:6 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v3 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5 -; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v2, v7, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4 +; SI-NEXT: v_or_b32_e32 v2, v9, v6 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v7, v8 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 +; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v5 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:24 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:24 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -609,10 +609,10 @@ ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_1]], v[[B_F32_1]] -; VI-DAG: v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]] +; VI-DAG: v_cmp_nlt_f16_e32 vcc, v[[B_V2_F16]], v[[A_V2_F16]] ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]] -; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16_1]], v[[B_F16_1]] +; VI: v_cmp_nlt_f16_e32 vcc, v[[B_F16_1]], v[[A_F16_1]] ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -191,7 +191,7 @@ ; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} ; GCN-SAFE-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] -; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]] +; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]] ; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]] ; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} @@ -1343,9 +1343,9 @@ ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] -; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} -; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]] -; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]] +; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} +; GCN-SAFE-DAG: v_fma_f32 [[FMA:v[0-9]+]] +; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]] ; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]] ; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]] diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -289,35 +289,35 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) { ; GFX9-LABEL: urem16_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_movk_i32 s6, 0x400 +; GFX9-NEXT: s_movk_i32 s8, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s3, s2, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s5, s4, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: BB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v2, s2, v4 +; GFX9-NEXT: v_and_b32_e32 v2, s4, v4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v2 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v7, v8, v1 -; GFX9-NEXT: v_trunc_f32_e32 v7, v7 -; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v7 -; GFX9-NEXT: v_mad_f32 v7, -v7, v0, v8 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, v0 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v9, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v7, v7, s3 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s6, v4 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s8, v4 +; GFX9-NEXT: v_mul_f32_e32 v9, v8, v1 +; GFX9-NEXT: v_trunc_f32_e32 v9, v9 +; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v9 +; GFX9-NEXT: v_mad_f32 v8, -v9, v0, v8 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v8|, v0 +; GFX9-NEXT: v_mov_b32_e32 v7, s7 +; GFX9-NEXT: v_addc_co_u32_e64 v8, s[2:3], 0, v10, s[2:3] +; GFX9-NEXT: v_mul_lo_u32 v8, v8, s5 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s6, v5 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7 +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] +; GFX9-NEXT: v_sub_u32_e32 v2, v2, v8 ; GFX9-NEXT: global_store_short v[5:6], v2, off ; GFX9-NEXT: s_cbranch_vccz BB5_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -398,38 +398,38 @@ ; GFX9-LABEL: srem16_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_movk_i32 s3, 0x400 +; GFX9-NEXT: s_movk_i32 s5, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX9-NEXT: s_sext_i32_i16 s4, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: BB7_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_bfe_i32 v7, v4, 0, 16 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v10, v7 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX9-NEXT: v_xor_b32_e32 v9, s4, v7 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v8, s5 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v8, v10, v1 -; GFX9-NEXT: v_xor_b32_e32 v9, s2, v7 -; GFX9-NEXT: v_trunc_f32_e32 v8, v8 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v9 -; GFX9-NEXT: v_cvt_i32_f32_e32 v9, v8 -; GFX9-NEXT: v_mad_f32 v8, -v8, v0, v10 +; GFX9-NEXT: v_mul_f32_e32 v9, v10, v1 +; GFX9-NEXT: v_trunc_f32_e32 v9, v9 +; GFX9-NEXT: v_cvt_i32_f32_e32 v11, v9 +; GFX9-NEXT: v_mad_f32 v9, -v9, v0, v10 ; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, |v0| -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v2, v9, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v9|, |v0| +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[2:3] +; GFX9-NEXT: v_add_u32_e32 v2, v11, v2 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4 +; GFX9-NEXT: v_mov_b32_e32 v8, s7 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s6, v5 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc ; GFX9-NEXT: v_sub_u32_e32 v2, v7, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1] ; GFX9-NEXT: global_store_short v[5:6], v2, off ; GFX9-NEXT: s_cbranch_vccz BB7_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -2591,20 +2591,20 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_lshr_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s2, s0 +; GFX8-NEXT: s_and_b32 s3, s2, s0 ; GFX8-NEXT: s_lshr_b32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_and_b32 s0, s1, s0 +; GFX8-NEXT: s_lshr_b32 s1, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 @@ -2615,20 +2615,20 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 +; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off @@ -2728,19 +2728,19 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: flat_load_ushort v2, v[2:3] ; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: flat_load_ushort v1, v[2:3] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(0) -; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 8 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 8, v2 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 8 +; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 8 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX8-NEXT: v_mad_i32_i24 v0, v2, v0, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, v3, v1, v0 +; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX8-NEXT: v_mad_i32_i24 v0, v1, v0, s2 +; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2755,20 +2755,20 @@ ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NODL-NEXT: global_load_ushort v2, v[2:3], off ; GFX9-NODL-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NODL-NEXT: global_load_ushort v1, v[2:3], off ; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_bfe_i32 v3, v2, 0, 8 -; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_bfe_i32 v1, v0, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v2, v0, 0, 8 ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 +; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v1, 8, v1 ; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v2, v0, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v1, v0, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v2, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off @@ -2783,20 +2783,20 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-DL-NEXT: global_load_ushort v2, v[2:3], off ; GFX9-DL-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-DL-NEXT: global_load_ushort v1, v[2:3], off ; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_i32 v3, v2, 0, 8 -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_bfe_i32 v1, v0, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v2, v0, 0, 8 ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 8 +; GFX9-DL-NEXT: v_lshrrev_b16_e32 v1, 8, v1 ; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v0, v2, v0, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, v1, v0, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -221,28 +221,28 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i8 s3, s2 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80008 +; GFX8-NEXT: s_sext_i32_i8 s2, s0 +; GFX8-NEXT: s_sext_i32_i8 s3, s1 +; GFX8-NEXT: s_bfe_i32 s5, s1, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80010 -; GFX8-NEXT: s_sext_i32_i8 s1, s0 +; GFX8-NEXT: s_bfe_i32 s7, s1, 0x80010 ; GFX8-NEXT: s_bfe_i32 s4, s0, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: s_bfe_i32 s6, s0, 0x80010 -; GFX8-NEXT: s_ashr_i32 s2, s2, 24 +; GFX8-NEXT: s_ashr_i32 s1, s1, 24 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: s_ashr_i32 s0, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s4, v4, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s6, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -252,28 +252,28 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s2 -; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s0 +; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s5, s1, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s0 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80010 ; GFX9-NODL-NEXT: s_bfe_i32 s4, s0, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80010 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v4, v2 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm @@ -357,28 +357,28 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_movk_i32 s5, 0xff +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s6, s5 -; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008 -; GFX7-NEXT: s_and_b32 s5, s4, s5 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010 +; GFX7-NEXT: s_and_b32 s7, s4, s8 +; GFX7-NEXT: s_and_b32 s6, s5, s8 +; GFX7-NEXT: s_bfe_u32 s8, s5, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80010 ; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_lshr_b32 s6, s6, 24 +; GFX7-NEXT: s_lshr_b32 s5, s5, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: s_lshr_b32 s4, s4, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -388,30 +388,30 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX8-NEXT: s_and_b32 s3, s2, s0 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX8-NEXT: s_and_b32 s0, s1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x80010 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_lshr_b32 s1, s1, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -420,30 +420,30 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80010 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -184,28 +184,28 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_movk_i32 s5, 0xff +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s6, s5 -; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008 -; GFX7-NEXT: s_and_b32 s5, s4, s5 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010 +; GFX7-NEXT: s_and_b32 s7, s4, s8 +; GFX7-NEXT: s_and_b32 s6, s5, s8 +; GFX7-NEXT: s_bfe_u32 s8, s5, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80010 ; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_lshr_b32 s6, s6, 24 +; GFX7-NEXT: s_lshr_b32 s5, s5, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: s_lshr_b32 s4, s4, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -215,20 +215,20 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: s_and_b32 s3, s1, s0 ; GFX8-NEXT: s_and_b32 s0, s2, s0 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 ; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 @@ -247,20 +247,20 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 ; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 ; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7 @@ -354,28 +354,28 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_movk_i32 s5, 0xff +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s6, s5 -; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008 -; GFX7-NEXT: s_and_b32 s5, s4, s5 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010 +; GFX7-NEXT: s_and_b32 s7, s4, s8 +; GFX7-NEXT: s_and_b32 s6, s5, s8 +; GFX7-NEXT: s_bfe_u32 s8, s5, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80010 ; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_lshr_b32 s6, s6, 24 +; GFX7-NEXT: s_lshr_b32 s5, s5, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: s_lshr_b32 s4, s4, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -385,30 +385,30 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX8-NEXT: s_and_b32 s3, s2, s0 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX8-NEXT: s_and_b32 s0, s1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x80010 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_lshr_b32 s1, s1, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -417,30 +417,30 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80010 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -1426,28 +1426,28 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_sext_i32_i8 s7, s6 -; GFX7-NEXT: s_bfe_u32 s9, s6, 0x80008 -; GFX7-NEXT: s_sext_i32_i8 s5, s4 +; GFX7-NEXT: s_sext_i32_i8 s6, s4 +; GFX7-NEXT: s_sext_i32_i8 s7, s5 +; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008 ; GFX7-NEXT: s_and_b32 s7, s7, s8 ; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: s_bfe_u32 s11, s6, 0x80010 -; GFX7-NEXT: s_and_b32 s5, s5, s8 +; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80010 +; GFX7-NEXT: s_and_b32 s6, s6, s8 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-NEXT: s_bfe_u32 s12, s4, 0x80010 -; GFX7-NEXT: s_lshr_b32 s6, s6, 24 +; GFX7-NEXT: s_lshr_b32 s5, s5, 24 ; GFX7-NEXT: v_mov_b32_e32 v2, s11 ; GFX7-NEXT: s_lshr_b32 s4, s4, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v3, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1457,28 +1457,28 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX8-NEXT: s_sext_i32_i8 s3, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 ; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 -; GFX8-NEXT: s_sext_i32_i8 s1, s0 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX8-NEXT: s_sext_i32_i8 s3, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX8-NEXT: s_sext_i32_i8 s2, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, s3 ; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80010 -; GFX8-NEXT: s_lshr_b32 s2, s2, 24 +; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s4, v3, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -1488,28 +1488,28 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 ; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008 -; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v4, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm @@ -1519,28 +1519,28 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX9-DL-NEXT: s_sext_i32_i8 s3, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x80010 ; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x80008 -; GFX9-DL-NEXT: s_sext_i32_i8 s1, s0 +; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-DL-NEXT: s_sext_i32_i8 s3, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-DL-NEXT: s_sext_i32_i8 s2, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80010 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v4, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v4, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm @@ -1809,29 +1809,29 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_movk_i32 s7, 0xff +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80008 -; GFX7-NEXT: s_bfe_u32 s12, s6, 0x80010 -; GFX7-NEXT: s_lshr_b32 s9, s6, 24 -; GFX7-NEXT: s_and_b32 s6, s6, s7 -; GFX7-NEXT: s_lshr_b32 s5, s4, 24 -; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008 +; GFX7-NEXT: s_lshr_b32 s6, s4, 24 +; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80008 +; GFX7-NEXT: s_bfe_u32 s12, s5, 0x80010 +; GFX7-NEXT: s_lshr_b32 s9, s5, 24 +; GFX7-NEXT: s_and_b32 s5, s5, s8 +; GFX7-NEXT: s_bfe_u32 s7, s4, 0x80008 ; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_and_b32 s4, s4, s7 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: s_and_b32 s4, s4, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: v_mov_b32_e32 v3, s12 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -657,43 +657,43 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s6, s3, 0x40000 -; GFX8-NEXT: s_lshr_b32 s4, s3, 12 -; GFX8-NEXT: s_bfe_i32 s8, s3, 0x40004 -; GFX8-NEXT: s_bfe_i32 s10, s3, 0x40008 -; GFX8-NEXT: s_lshr_b32 s1, s0, 12 -; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX8-NEXT: s_lshr_b32 s3, s1, 12 +; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40000 +; GFX8-NEXT: s_lshr_b32 s4, s2, 12 +; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX8-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX8-NEXT: s_bfe_i32 s5, s1, 0x40000 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s3 ; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40004 -; GFX8-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX8-NEXT: s_bfe_i32 s7, s1, 0x40004 +; GFX8-NEXT: s_bfe_i32 s9, s1, 0x40008 ; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_mov_b32_e32 v7, s8 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX8-NEXT: v_mul_i32_i24_e32 v3, s9, v3 -; GFX8-NEXT: s_bfe_i32 s12, s3, 0x40010 -; GFX8-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX8-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX8-NEXT: s_bfe_i32 s14, s3, 0x40014 -; GFX8-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX8-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX8-NEXT: s_bfe_i32 s11, s1, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: s_bfe_i32 s16, s3, 0x40018 -; GFX8-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX8-NEXT: s_bfe_i32 s13, s1, 0x40014 ; GFX8-NEXT: v_mov_b32_e32 v9, s14 -; GFX8-NEXT: s_bfe_i32 s15, s0, 0x40018 -; GFX8-NEXT: s_ashr_i32 s3, s3, 28 +; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40018 +; GFX8-NEXT: s_ashr_i32 s2, s2, 28 ; GFX8-NEXT: v_mov_b32_e32 v10, s16 -; GFX8-NEXT: s_ashr_i32 s0, s0, 28 +; GFX8-NEXT: s_ashr_i32 s1, s1, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_i32_i24 v2, s5, v6, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s7, v7, v2 @@ -702,8 +702,8 @@ ; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -711,43 +711,43 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: s_movk_i32 s2, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s6, s3, 0x40000 -; GFX9-NEXT: s_lshr_b32 s4, s3, 12 -; GFX9-NEXT: s_bfe_i32 s8, s3, 0x40004 -; GFX9-NEXT: s_bfe_i32 s10, s3, 0x40008 -; GFX9-NEXT: s_lshr_b32 s1, s0, 12 -; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX9-NEXT: s_lshr_b32 s3, s1, 12 +; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40000 +; GFX9-NEXT: s_lshr_b32 s4, s2, 12 +; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX9-NEXT: s_bfe_i32 s5, s1, 0x40000 ; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s3 ; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40004 -; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: v_mov_b32_e32 v7, s8 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-NEXT: v_mul_i32_i24_e32 v3, s9, v3 -; GFX9-NEXT: s_bfe_i32 s12, s3, 0x40010 -; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX9-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX9-NEXT: s_bfe_i32 s14, s3, 0x40014 -; GFX9-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX9-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-NEXT: s_bfe_i32 s11, s1, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: s_bfe_i32 s16, s3, 0x40018 -; GFX9-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v9, s14 -; GFX9-NEXT: s_bfe_i32 s15, s0, 0x40018 -; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40018 +; GFX9-NEXT: s_ashr_i32 s2, s2, 28 ; GFX9-NEXT: v_mov_b32_e32 v10, s16 -; GFX9-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_i32_i24 v2, s5, v6, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s7, v7, v2 @@ -756,8 +756,8 @@ ; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mad_i32_i24 v2, s1, v3, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -765,43 +765,43 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s6, s3, 0x40000 -; GFX9-DL-NEXT: s_lshr_b32 s4, s3, 12 -; GFX9-DL-NEXT: s_bfe_i32 s8, s3, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s10, s3, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 12 -; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX9-DL-NEXT: s_lshr_b32 s3, s1, 12 +; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40000 +; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 12 +; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s3 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX9-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s9, v3 -; GFX9-DL-NEXT: s_bfe_i32 s12, s3, 0x40010 -; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX9-DL-NEXT: s_bfe_i32 s14, s3, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-DL-NEXT: s_bfe_i32 s16, s3, 0x40018 -; GFX9-DL-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s14 -; GFX9-DL-NEXT: s_bfe_i32 s15, s0, 0x40018 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40018 +; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 ; GFX9-DL-NEXT: v_mov_b32_e32 v10, s16 -; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v6, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v7, v2 @@ -810,8 +810,8 @@ ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1462,19 +1462,19 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s15, s6, 0x40018 -; GFX7-NEXT: s_bfe_i32 s16, s6, 0x40014 -; GFX7-NEXT: s_bfe_i32 s17, s6, 0x40010 -; GFX7-NEXT: s_bfe_i32 s18, s6, 0x40000 -; GFX7-NEXT: s_bfe_i32 s19, s6, 0x40004 -; GFX7-NEXT: s_bfe_i32 s20, s6, 0x40008 -; GFX7-NEXT: s_ashr_i32 s14, s6, 28 -; GFX7-NEXT: s_bfe_i32 s6, s6, 0x4000c -; GFX7-NEXT: s_ashr_i32 s5, s4, 28 +; GFX7-NEXT: s_ashr_i32 s6, s4, 28 +; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40018 +; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40014 +; GFX7-NEXT: s_bfe_i32 s17, s5, 0x40010 +; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40000 +; GFX7-NEXT: s_bfe_i32 s19, s5, 0x40004 +; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40008 +; GFX7-NEXT: s_ashr_i32 s14, s5, 28 +; GFX7-NEXT: s_bfe_i32 s5, s5, 0x4000c ; GFX7-NEXT: s_bfe_i32 s7, s4, 0x40018 ; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40014 ; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40010 @@ -1485,7 +1485,7 @@ ; GFX7-NEXT: s_bfe_i32 s13, s4, 0x40008 ; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: s_bfe_i32 s4, s4, 0x4000c -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mul_i32_i24_e32 v1, s4, v1 ; GFX7-NEXT: v_mul_i32_i24_e32 v2, s13, v2 ; GFX7-NEXT: v_mul_i32_i24_e32 v3, s12, v3 @@ -1510,7 +1510,7 @@ ; GFX7-NEXT: v_mad_i32_i24 v0, s9, v6, v0 ; GFX7-NEXT: v_mad_i32_i24 v0, s7, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s14 -; GFX7-NEXT: v_mad_i32_i24 v0, s5, v1, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s6, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -1954,24 +1954,24 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s7, s0, 4 -; GFX9-NEXT: s_lshr_b32 s14, s1, 4 -; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX9-NEXT: s_lshr_b32 s7, s1, 4 +; GFX9-NEXT: s_lshr_b32 s14, s2, 4 +; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s1 +; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s2 ; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s7 ; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s14 -; GFX9-NEXT: s_lshr_b32 s8, s0, 12 -; GFX9-NEXT: s_lshr_b32 s9, s0, 8 -; GFX9-NEXT: s_lshr_b32 s15, s1, 12 -; GFX9-NEXT: s_lshr_b32 s16, s1, 8 +; GFX9-NEXT: s_lshr_b32 s8, s1, 12 +; GFX9-NEXT: s_lshr_b32 s9, s1, 8 +; GFX9-NEXT: s_lshr_b32 s15, s2, 12 +; GFX9-NEXT: s_lshr_b32 s16, s2, 8 ; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s9 ; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s8 ; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s16 @@ -1987,21 +1987,21 @@ ; GFX9-NEXT: v_mul_lo_u16_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_lshr_b32 s3, s0, 20 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_lshr_b32 s10, s1, 20 -; GFX9-NEXT: s_lshr_b32 s11, s1, 16 +; GFX9-NEXT: s_lshr_b32 s3, s1, 20 +; GFX9-NEXT: s_lshr_b32 s4, s1, 16 +; GFX9-NEXT: s_lshr_b32 s10, s2, 20 +; GFX9-NEXT: s_lshr_b32 s11, s2, 16 ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12 ; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s4 ; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s3 ; GFX9-NEXT: v_lshlrev_b16_e64 v17, 12, s11 ; GFX9-NEXT: v_lshlrev_b16_e64 v18, 12, s10 -; GFX9-NEXT: s_lshr_b32 s5, s0, 28 -; GFX9-NEXT: s_lshr_b32 s6, s0, 24 -; GFX9-NEXT: s_lshr_b32 s12, s1, 28 -; GFX9-NEXT: s_lshr_b32 s13, s1, 24 -; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-NEXT: s_lshr_b32 s5, s1, 28 +; GFX9-NEXT: s_lshr_b32 s6, s1, 24 +; GFX9-NEXT: s_lshr_b32 s12, s2, 28 +; GFX9-NEXT: s_lshr_b32 s13, s2, 24 +; GFX9-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s6 ; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s5 @@ -2023,7 +2023,7 @@ ; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v8, v8, v15 ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX9-NEXT: v_or_b32_e32 v6, v4, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 @@ -2042,24 +2042,24 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 4 -; GFX9-DL-NEXT: s_lshr_b32 s14, s1, 4 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 4 +; GFX9-DL-NEXT: s_lshr_b32 s14, s2, 4 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s1 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s2 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s14 -; GFX9-DL-NEXT: s_lshr_b32 s8, s0, 12 -; GFX9-DL-NEXT: s_lshr_b32 s9, s0, 8 -; GFX9-DL-NEXT: s_lshr_b32 s15, s1, 12 -; GFX9-DL-NEXT: s_lshr_b32 s16, s1, 8 +; GFX9-DL-NEXT: s_lshr_b32 s8, s1, 12 +; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 8 +; GFX9-DL-NEXT: s_lshr_b32 s15, s2, 12 +; GFX9-DL-NEXT: s_lshr_b32 s16, s2, 8 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s9 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s8 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s16 @@ -2075,21 +2075,21 @@ ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, v3, v4 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: s_lshr_b32 s3, s0, 20 -; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 20 -; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 16 +; GFX9-DL-NEXT: s_lshr_b32 s3, s1, 20 +; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 16 +; GFX9-DL-NEXT: s_lshr_b32 s10, s2, 20 +; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 16 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s3 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s11 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v18, 12, s10 -; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 28 -; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 24 -; GFX9-DL-NEXT: s_lshr_b32 s12, s1, 28 -; GFX9-DL-NEXT: s_lshr_b32 s13, s1, 24 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 28 +; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 24 +; GFX9-DL-NEXT: s_lshr_b32 s12, s2, 28 +; GFX9-DL-NEXT: s_lshr_b32 s13, s2, 24 +; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX9-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s6 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s5 @@ -2111,7 +2111,7 @@ ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v8, v15 ; GFX9-DL-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX9-DL-NEXT: v_or_b32_e32 v6, v4, v8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -2426,38 +2426,38 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40010 -; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014 -; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40018 -; GFX9-NEXT: s_lshr_b32 s13, s1, 28 -; GFX9-NEXT: s_and_b32 s14, s1, 15 -; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40018 +; GFX9-NEXT: s_lshr_b32 s13, s2, 28 +; GFX9-NEXT: s_and_b32 s14, s2, 15 +; GFX9-NEXT: s_bfe_u32 s15, s2, 0x40004 +; GFX9-NEXT: s_bfe_u32 s16, s2, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s4, s1, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s1, 0x40018 ; GFX9-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-NEXT: s_lshr_b32 s6, s0, 28 +; GFX9-NEXT: s_lshr_b32 s6, s1, 28 ; GFX9-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-NEXT: s_and_b32 s7, s0, 15 +; GFX9-NEXT: s_and_b32 s7, s1, 15 ; GFX9-NEXT: v_mov_b32_e32 v7, s14 -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-NEXT: s_bfe_u32 s8, s1, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v8, s15 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s9, s1, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v9, s16 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v10, s1 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v10, s2 ; GFX9-NEXT: v_mul_lo_u16_e32 v3, s3, v3 ; GFX9-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v5, s5, v5 @@ -2468,12 +2468,12 @@ ; GFX9-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v5, v7, v8 ; GFX9-NEXT: v_mul_lo_u16_e32 v9, s9, v9 -; GFX9-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v5, s2, v5 +; GFX9-NEXT: v_mul_lo_u16_sdwa v10, s1, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v5, s0, v5 ; GFX9-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v6, v5, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v6 -; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX9-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 @@ -2492,38 +2492,38 @@ ; GFX9-DL: ; %bb.0: ; %entry ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s13, s1, 28 -; GFX9-DL-NEXT: s_and_b32 s14, s1, 15 -; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s13, s2, 28 +; GFX9-DL-NEXT: s_and_b32 s14, s2, 15 +; GFX9-DL-NEXT: s_bfe_u32 s15, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s16, s2, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x40018 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 28 +; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 28 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-DL-NEXT: s_and_b32 s7, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s7, s1, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s14 -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s8, s1, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s15 -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s16 -; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v10, s1 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v10, s2 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s3, v3 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s5, v5 @@ -2534,12 +2534,12 @@ ; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v5, v7, v8 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, s9, v9 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, s1, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v5 ; GFX9-DL-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v6, v5, v6 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v6 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX9-DL-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v2, v5, v2 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -73,12 +73,12 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_and_b32 s1, s4, 0xffff ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s0, s0, 0xffff0000 +; CI-NEXT: s_and_b32 s0, s2, 0xffff0000 ; CI-NEXT: s_or_b32 s0, s1, s0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -95,11 +95,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NEXT: s_lshr_b32 s0, s2, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -133,18 +133,18 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_load_dword s0, s[2:3], 0x0 +; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_and_b32 s1, s4, 0xffff +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_and_b32 s0, s4, 0xffff ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s0, s0, 16 -; CI-NEXT: s_lshl_b32 s2, s0, 16 -; CI-NEXT: s_or_b32 s1, s1, s2 -; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: s_lshr_b32 s1, s2, 16 +; CI-NEXT: s_lshl_b32 s2, s1, 16 +; CI-NEXT: s_or_b32 s0, s0, s2 +; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s0 +; CI-NEXT: ; use s1 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr @@ -192,12 +192,12 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_lshr_b32 s1, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s0, s0, 0xffff0000 +; CI-NEXT: s_and_b32 s0, s2, 0xffff0000 ; CI-NEXT: s_or_b32 s0, s1, s0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -216,16 +216,16 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_lshr_b32 s0, s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s0, s1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_pack_lh_b32_b16 s1, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s1 +; GFX9-NEXT: ; use s0 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; @@ -234,17 +234,17 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshr_b32 s1, s4, 16 +; VI-NEXT: s_lshr_b32 s0, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 -; VI-NEXT: s_or_b32 s0, s1, s0 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_and_b32 s1, s2, 0xffff0000 +; VI-NEXT: s_or_b32 s1, s0, s1 +; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s1 +; VI-NEXT: ; use s0 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: s_endpgm ; @@ -253,17 +253,17 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_lshr_b32 s1, s4, 16 +; CI-NEXT: s_lshr_b32 s0, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s0, s0, 0xffff0000 -; CI-NEXT: s_or_b32 s0, s1, s0 -; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_and_b32 s1, s2, 0xffff0000 +; CI-NEXT: s_or_b32 s1, s0, s1 +; CI-NEXT: v_mov_b32_e32 v2, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s1 +; CI-NEXT: ; use s0 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr @@ -426,12 +426,12 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_lshl_b32 s1, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s0, s0, 0xffff +; CI-NEXT: s_and_b32 s0, s2, 0xffff ; CI-NEXT: s_or_b32 s0, s0, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -624,15 +624,15 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v0, v[0:1] -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: flat_load_dword v3, v[0:1] +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_lshr_b32 s0, s4, 16 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_or_b32_e32 v0, s0, v0 -; CI-NEXT: flat_store_dword v[2:3], v0 +; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; CI-NEXT: v_or_b32_e32 v2, s0, v2 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -849,15 +849,15 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x4500 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x4500 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_0: @@ -1107,13 +1107,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: s_lshl_b32 s0, s4, 4 ; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -1125,13 +1125,13 @@ ; VI-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x0 +; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_load_dword s0, s[4:5], 0x0 -; VI-NEXT: s_load_dword s1, s[2:3], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s0, s0, 4 +; VI-NEXT: s_lshl_b32 s0, s4, 4 ; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: v_bfi_b32 v2, s0, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -1143,13 +1143,13 @@ ; CI-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_load_dword s0, s[4:5], 0x0 -; CI-NEXT: s_load_dword s1, s[2:3], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b32 s0, s0, 4 +; CI-NEXT: s_lshl_b32 s0, s4, 4 ; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 -; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v3, s2 ; CI-NEXT: v_bfi_b32 v2, s0, v2, v3 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -1240,24 +1240,25 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX9-NEXT: global_load_dword v2, v[2:3], off +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_mov_b32 s0, 0x12341234 -; GFX9-NEXT: v_bfi_b32 v0, v1, s0, v0 -; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_bfi_b32 v2, v2, s0, v3 +; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr: @@ -1266,24 +1267,25 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] -; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 -; VI-NEXT: s_mov_b32 s0, 0xffff -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; VI-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b32 s2, 0xffff +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_mov_b32 s0, 0x12341234 -; VI-NEXT: v_bfi_b32 v0, v1, s0, v0 -; VI-NEXT: flat_store_dword v[4:5], v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_bfi_b32 v2, v2, s0, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr: @@ -1299,17 +1301,17 @@ ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_load_dword v2, v[2:3] -; CI-NEXT: flat_load_dword v0, v[0:1] -; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 -; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: flat_load_dword v3, v[0:1] +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4 +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_mov_b32 s0, 0x12341234 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v2 -; CI-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 +; CI-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; CI-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_bfi_b32 v0, v1, s0, v0 -; CI-NEXT: flat_store_dword v[4:5], v0 +; CI-NEXT: v_bfi_b32 v2, v2, s0, v3 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1684,26 +1686,26 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v2, v[0:1], off ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: s_mov_b32 s1, 0 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v4, s[0:1] -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v5, s0, v1 -; GFX9-NEXT: v_bfi_b32 v0, v4, s0, v0 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: v_bfi_b32 v1, v3, s1, v1 +; GFX9-NEXT: v_bfi_b32 v0, v2, s1, v0 +; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr: @@ -1717,17 +1719,17 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_mov_b32 s0, 0xffff +; VI-NEXT: s_mov_b32 s2, 0xffff ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_and_b32 s2, s4, s0 -; VI-NEXT: s_mov_b32 s1, 0 -; VI-NEXT: s_lshl_b32 s3, s2, 16 +; VI-NEXT: s_mov_b32 s3, 0 +; VI-NEXT: s_and_b32 s1, s4, s2 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: s_lshl_b32 s0, s1, 16 +; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[0:1] -; VI-NEXT: s_or_b32 s0, s2, s3 +; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[2:3] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_bfi_b32 v1, v5, s0, v1 ; VI-NEXT: v_bfi_b32 v0, v4, s0, v0 @@ -1736,26 +1738,26 @@ ; ; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; CI: ; %bb.0: -; CI-NEXT: flat_load_dword v4, v[0:1] ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_mov_b32 s6, 0xffff -; CI-NEXT: s_mov_b32 s7, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v4, v[0:1] ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; CI-NEXT: s_mov_b32 s2, 0xffff +; CI-NEXT: s_mov_b32 s3, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_lshl_b32 s1, s4, 16 -; CI-NEXT: s_and_b32 s3, s4, s6 +; CI-NEXT: s_and_b32 s4, s4, s2 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_or_b32 s0, s3, s1 +; CI-NEXT: s_or_b32 s0, s4, s1 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; CI-NEXT: v_lshl_b64 v[4:5], s[6:7], v4 +; CI-NEXT: v_lshl_b64 v[4:5], s[2:3], v4 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: v_bfi_b32 v1, v5, s0, v1 ; CI-NEXT: v_bfi_b32 v0, v4, s0, v0 @@ -1785,19 +1787,19 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: s_lshl_b32 s1, s5, 4 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: s_mov_b32 s1, 0 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: s_lshl_b32 s3, s5, 4 -; GFX9-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, s1, v5, v1 -; GFX9-NEXT: v_bfi_b32 v0, s0, v4, v0 +; GFX9-NEXT: v_bfi_b32 v1, s1, v4, v1 +; GFX9-NEXT: v_bfi_b32 v0, s0, v5, v0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -1807,19 +1809,19 @@ ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_mov_b32 s0, 0xffff +; VI-NEXT: s_mov_b32 s2, 0xffff ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_mov_b32 s1, 0 -; VI-NEXT: s_lshl_b32 s2, s5, 4 -; VI-NEXT: s_and_b32 s3, s4, s0 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; VI-NEXT: s_lshl_b32 s2, s3, 16 -; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_mov_b32 s3, 0 +; VI-NEXT: s_lshl_b32 s1, s5, 4 +; VI-NEXT: s_and_b32 s4, s4, s2 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1 +; VI-NEXT: s_lshl_b32 s2, s4, 16 +; VI-NEXT: s_or_b32 s2, s4, s2 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v5, s2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1839,15 +1841,15 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_mov_b32 s0, 0xffff -; CI-NEXT: s_and_b32 s2, s4, s0 -; CI-NEXT: s_lshl_b32 s4, s4, 16 +; CI-NEXT: s_mov_b32 s2, 0xffff ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: s_or_b32 s2, s2, s4 -; CI-NEXT: s_mov_b32 s1, 0 -; CI-NEXT: s_lshl_b32 s3, s5, 4 -; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 +; CI-NEXT: s_and_b32 s6, s4, s2 +; CI-NEXT: s_mov_b32 s3, 0 +; CI-NEXT: s_lshl_b32 s1, s5, 4 +; CI-NEXT: s_lshl_b32 s4, s4, 16 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1 +; CI-NEXT: s_or_b32 s2, s6, s4 ; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: v_mov_b32_e32 v5, s2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -11,6 +11,7 @@ ;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, v[1:2], s[0:3], 0 idxen offen glc +;SICI: v_mov_b32_e32 v1, 0x2000 ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen offset:42 glc ;CHECK-DAG: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll @@ -79,7 +79,7 @@ ;CHECK-NOT: s_waitcnt ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 idxen glc ;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 idxen glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen glc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -1559,24 +1559,22 @@ ; VI-LABEL: simplify_bfe_u32_multi_use_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 -; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 63, v0 ; VI-NEXT: v_bfe_u32 v1, v0, 2, 2 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -118,20 +118,20 @@ ; GFX9-LABEL: cos_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3118 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v0 -; GFX9-NEXT: v_cos_f16_e32 v3, v1 -; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cos_f16_e32 v2, v0 +; GFX9-NEXT: v_mul_f16_e32 v2, 0.15915494, v0 +; GFX9-NEXT: v_cos_f16_e32 v2, v2 +; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cos_f16_e32 v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %a.val = load <2 x half>, <2 x half> addrspace(1)* %a diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -173,8 +173,8 @@ ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], s[[A_F16]], v[[B_F16_1]] -; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]] +; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], s[[A_F16]], v[[C_F16_1]] +; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], s[[A_F16]], v[[C_V2_F16]] ; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]] @@ -198,8 +198,9 @@ ; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]] ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; VIGFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]] ; SI: s_mov_b32 s[[B_F32:[0-9]+]], 0x40400000{{$}} ; VIGFX9: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}} @@ -243,8 +244,9 @@ ; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; VIGFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: s_mov_b32 s[[C_F32:[0-9]+]], 0x40400000{{$}} ; VIGFX9: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -43,17 +43,17 @@ ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -68,17 +68,17 @@ ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s14, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s2 -; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s10, s2 ; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -292,17 +292,17 @@ ; GFX9-LABEL: maxnum_v2f16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 -; GFX9-NEXT: v_pk_max_f16 v0, s5, s5 +; GFX9-NEXT: v_pk_max_f16 v1, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 ; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -362,18 +362,18 @@ ; ; GFX9-LABEL: maxnum_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x44004200 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX9-NEXT: s_mov_b32 s4, 0x44004200 -; GFX9-NEXT: v_pk_max_f16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX9-NEXT: v_pk_max_f16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) #0 { @@ -429,18 +429,18 @@ ; ; GFX9-LABEL: maxnum_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x42004400 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX9-NEXT: s_mov_b32 s4, 0x42004400 -; GFX9-NEXT: v_pk_max_f16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX9-NEXT: v_pk_max_f16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) #0 { @@ -735,12 +735,12 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s5, s5 -; GFX9-NEXT: v_pk_max_f16 v2, s4, s4 +; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 ; GFX9-NEXT: v_pk_max_f16 v1, v0, s8 ; GFX9-NEXT: v_pk_max_f16 v0, v2, s9 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -43,17 +43,17 @@ ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -68,17 +68,17 @@ ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s14, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s2 -; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s10, s2 ; GFX9-NEXT: s_mov_b32 s11, s3 -; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -315,17 +315,17 @@ ; GFX9-LABEL: minnum_v2f16_ieee: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 -; GFX9-NEXT: v_pk_max_f16 v0, s5, s5 +; GFX9-NEXT: v_pk_max_f16 v1, s6, s6 +; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 ; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -415,18 +415,18 @@ ; ; GFX9-LABEL: minnum_v2f16_imm_a: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x44004200 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX9-NEXT: s_mov_b32 s4, 0x44004200 -; GFX9-NEXT: v_pk_min_f16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX9-NEXT: v_pk_min_f16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) #0 { @@ -482,18 +482,18 @@ ; ; GFX9-LABEL: minnum_v2f16_imm_b: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x42004400 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX9-NEXT: s_mov_b32 s4, 0x42004400 -; GFX9-NEXT: v_pk_min_f16 v0, v0, s4 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX9-NEXT: v_pk_min_f16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) #0 { @@ -788,12 +788,12 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, s5, s5 -; GFX9-NEXT: v_pk_max_f16 v2, s4, s4 +; GFX9-NEXT: v_pk_max_f16 v0, s7, s7 +; GFX9-NEXT: v_pk_max_f16 v2, s6, s6 ; GFX9-NEXT: v_pk_min_f16 v1, v0, s8 ; GFX9-NEXT: v_pk_min_f16 v0, v2, s9 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -77,15 +77,15 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[0:1], s[6:7] ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_movk_i32 s9, 0xfc01 -; SI-NEXT: s_mov_b32 s7, 0xfffff -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_brev_b32 s8, -2 +; SI-NEXT: s_movk_i32 s7, 0xfc01 +; SI-NEXT: s_mov_b32 s1, 0xfffff +; SI-NEXT: s_mov_b32 s0, -1 +; SI-NEXT: s_brev_b32 s6, -2 ; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v4, v3, 20, 11 -; SI-NEXT: v_add_i32_e32 v6, vcc, s9, v4 -; SI-NEXT: v_lshr_b64 v[4:5], s[6:7], v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v4 +; SI-NEXT: v_lshr_b64 v[4:5], s[0:1], v6 ; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3 ; SI-NEXT: v_not_b32_e32 v4, v4 ; SI-NEXT: v_not_b32_e32 v5, v5 @@ -98,7 +98,7 @@ ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; SI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] -; SI-NEXT: v_bfi_b32 v2, s8, v8, v3 +; SI-NEXT: v_bfi_b32 v2, s6, v8, v3 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 ; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc @@ -117,14 +117,14 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_brev_b32 s6, -2 +; CI-NEXT: s_brev_b32 s0, -2 ; CI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[2:3] ; CI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] -; CI-NEXT: v_bfi_b32 v2, s6, v8, v3 +; CI-NEXT: v_bfi_b32 v2, s0, v8, v3 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc ; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -118,20 +118,20 @@ ; GFX9-LABEL: sin_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3118 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v0 -; GFX9-NEXT: v_sin_f16_e32 v3, v1 -; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_sin_f16_e32 v2, v0 +; GFX9-NEXT: v_mul_f16_e32 v2, 0.15915494, v0 +; GFX9-NEXT: v_sin_f16_e32 v2, v2 +; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_sin_f16_e32 v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %a.val = load <2 x half>, <2 x half> addrspace(1)* %a diff --git a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll --- a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll +++ b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll @@ -119,12 +119,12 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[0:1], s[6:7] +; GCN-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; GCN-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64 ; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v4, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, 0x800000, v0 +; GCN-NEXT: v_or_b32_e32 v0, 0x800000, v1 ; GCN-NEXT: v_mul_i32_i24_e32 v0, -7, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GCN-NEXT: buffer_store_dwordx2 v[1:2], v[3:4], s[4:7], 0 addr64 diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -87,23 +87,23 @@ ; VI-LABEL: v_lshr_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v5, v[0:1] +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e32 v4, v1, v0 -; VI-NEXT: v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v0, v4, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v5 +; VI-NEXT: v_lshrrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_lshr_v2i16: @@ -117,14 +117,14 @@ ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 -; CI-NEXT: s_mov_b32 s8, 0xffff +; CI-NEXT: s_mov_b32 s0, 0xffff ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; CI-NEXT: v_and_b32_e32 v2, s8, v2 -; CI-NEXT: v_and_b32_e32 v3, s8, v3 +; CI-NEXT: v_and_b32_e32 v2, s0, v2 +; CI-NEXT: v_and_b32_e32 v3, s0, v3 ; CI-NEXT: v_lshr_b32_e32 v2, v2, v3 ; CI-NEXT: v_lshr_b32_e32 v3, v4, v5 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -171,39 +171,39 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e32 v1, s0, v0 -; VI-NEXT: v_lshrrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_lshrrev_b16_e32 v4, s0, v3 +; VI-NEXT: v_lshrrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v4, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: lshr_v_s_v2i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s0, s[0:1], 0xd -; CI-NEXT: s_mov_b32 s8, 0xffff +; CI-NEXT: s_load_dword s8, s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s9, s0, 16 -; CI-NEXT: s_and_b32 s10, s0, s8 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_mov_b32 s0, 0xffff +; CI-NEXT: s_lshr_b32 s1, s8, 16 +; CI-NEXT: s_and_b32 s8, s8, s0 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_and_b32_e32 v2, s8, v2 -; CI-NEXT: v_lshrrev_b32_e32 v3, s9, v3 -; CI-NEXT: v_lshrrev_b32_e32 v2, s10, v2 +; CI-NEXT: v_and_b32_e32 v2, s0, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, s1, v3 +; CI-NEXT: v_lshrrev_b32_e32 v2, s8, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -246,39 +246,39 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e64 v1, v0, s0 -; VI-NEXT: v_lshrrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_lshrrev_b16_e64 v4, v3, s0 +; VI-NEXT: v_lshrrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v4, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: lshr_s_v_v2i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s0, s[0:1], 0xd -; CI-NEXT: s_mov_b32 s8, 0xffff +; CI-NEXT: s_load_dword s8, s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s9, s0, 16 -; CI-NEXT: s_and_b32 s10, s0, s8 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_mov_b32 s0, 0xffff +; CI-NEXT: s_lshr_b32 s1, s8, 16 +; CI-NEXT: s_and_b32 s8, s8, s0 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_and_b32_e32 v2, s8, v2 -; CI-NEXT: v_lshr_b32_e32 v3, s9, v3 -; CI-NEXT: v_lshr_b32_e32 v2, s10, v2 +; CI-NEXT: v_and_b32_e32 v2, s0, v2 +; CI-NEXT: v_lshr_b32_e32 v3, s1, v3 +; CI-NEXT: v_lshr_b32_e32 v2, s8, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -320,15 +320,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e64 v1, v0, 8 -; VI-NEXT: v_lshrrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_lshrrev_b16_e64 v2, v3, 8 +; VI-NEXT: v_lshrrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: lshr_imm_v_v2i16: @@ -428,45 +428,45 @@ ; GFX9-LABEL: v_lshr_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, v5 -; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v4 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, v3 +; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v2 +; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_lshr_v4i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e32 v6, v5, v1 -; VI-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_lshrrev_b16_e32 v5, v4, v0 -; VI-NEXT: v_lshrrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshrrev_b16_e32 v6, v3, v1 +; VI-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v0 +; VI-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v1, v6, v1 -; VI-NEXT: v_or_b32_e32 v0, v5, v0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_or_b32_e32 v0, v3, v0 +; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_lshr_v4i16: @@ -480,7 +480,7 @@ ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 -; CI-NEXT: s_mov_b32 s8, 0xffff +; CI-NEXT: s_mov_b32 s0, 0xffff ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 @@ -488,10 +488,10 @@ ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; CI-NEXT: v_and_b32_e32 v2, s8, v2 -; CI-NEXT: v_and_b32_e32 v4, s8, v4 -; CI-NEXT: v_and_b32_e32 v3, s8, v3 -; CI-NEXT: v_and_b32_e32 v5, s8, v5 +; CI-NEXT: v_and_b32_e32 v2, s0, v2 +; CI-NEXT: v_and_b32_e32 v4, s0, v4 +; CI-NEXT: v_and_b32_e32 v3, s0, v3 +; CI-NEXT: v_and_b32_e32 v5, s0, v5 ; CI-NEXT: v_lshr_b32_e32 v3, v3, v5 ; CI-NEXT: v_lshr_b32_e32 v5, v7, v9 ; CI-NEXT: v_lshr_b32_e32 v2, v2, v4 @@ -565,13 +565,13 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_mov_b32 s8, 0xff00ff +; CI-NEXT: s_mov_b32 s0, 0xff00ff ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 8, v3 ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; CI-NEXT: v_and_b32_e32 v3, s8, v3 -; CI-NEXT: v_and_b32_e32 v2, s8, v2 +; CI-NEXT: v_and_b32_e32 v3, s0, v3 +; CI-NEXT: v_and_b32_e32 v2, s0, v2 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -39,7 +39,8 @@ ; it. ; GCN-LABEL: {{^}}madak_2_use_f32: -; GFX8_9_10: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GFX9: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GFX10: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 @@ -47,6 +48,7 @@ ; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}} ; GFX8_9_10: {{flat|global}}_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}} ; GFX6-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GFX8-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GFX6_8_9-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 ; GFX10-MAD-DAG:v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 ; FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -73,16 +73,16 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v5, v[0:1] +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v2, v0, v1 -; VI-NEXT: v_max_i16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: flat_store_dword v[4:5], v0 +; VI-NEXT: v_max_i16_e32 v3, v5, v2 +; VI-NEXT: v_max_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_v2i16: @@ -124,63 +124,64 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v6 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ushort v6, v[6:7] -; VI-NEXT: flat_load_dword v7, v[0:1] +; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v4, v[4:5] +; VI-NEXT: flat_load_dword v5, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: flat_load_dword v8, v[2:3] -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[2:3] +; VI-NEXT: flat_load_ushort v8, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v6 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_max_i16_e32 v0, v6, v0 +; VI-NEXT: v_max_i16_e32 v6, v5, v7 +; VI-NEXT: v_max_i16_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v1, v7, v8 -; VI-NEXT: v_max_i16_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v1, v1, v7 -; VI-NEXT: flat_store_short v[2:3], v0 -; VI-NEXT: flat_store_dword v[4:5], v1 +; VI-NEXT: v_max_i16_e32 v4, v4, v8 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: flat_store_short v[2:3], v4 +; VI-NEXT: flat_store_dword v[0:1], v5 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_short_d16 v7, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_short_d16 v6, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_short_d16 v6, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v7, v[0:1], off +; GFX9-NEXT: global_load_short_d16 v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v2, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_pk_max_i16 v3, v6, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_i16 v0, v0, v1 -; GFX9-NEXT: v_pk_max_i16 v1, v7, v6 -; GFX9-NEXT: global_store_short v[4:5], v1, off offset:4 -; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: v_pk_max_i16 v2, v7, v2 +; GFX9-NEXT: global_store_short v[0:1], v3, off offset:4 +; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid @@ -441,16 +442,16 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[2:3] -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v5, v[0:1] +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_u16_e32 v2, v0, v1 -; VI-NEXT: v_max_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: flat_store_dword v[4:5], v0 +; VI-NEXT: v_max_u16_e32 v3, v5, v2 +; VI-NEXT: v_max_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_umax_ugt_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -108,46 +108,56 @@ ; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GCN-NEXT: v_add_u32_e32 v0, v0, v2 +; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; GCN-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:8 +; GCN-NEXT: buffer_load_dword v6, v0, s[0:3], 0 offen offset:12 +; GCN-NEXT: buffer_load_dword v7, v0, s[0:3], 0 offen offset:16 +; GCN-NEXT: buffer_load_dword v8, v0, s[0:3], 0 offen offset:20 +; GCN-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen offset:24 +; GCN-NEXT: buffer_load_dword v10, v0, s[0:3], 0 offen offset:28 +; GCN-NEXT: buffer_load_dword v11, v0, s[0:3], 0 offen offset:32 +; GCN-NEXT: buffer_load_dword v12, v0, s[0:3], 0 offen offset:36 +; GCN-NEXT: buffer_load_dword v13, v0, s[0:3], 0 offen offset:40 +; GCN-NEXT: buffer_load_dword v14, v0, s[0:3], 0 offen offset:44 +; GCN-NEXT: buffer_load_dword v15, v0, s[0:3], 0 offen offset:48 +; GCN-NEXT: buffer_load_dword v16, v0, s[0:3], 0 offen offset:52 +; GCN-NEXT: buffer_load_dword v17, v0, s[0:3], 0 offen offset:56 ; GCN-NEXT: v_add_u32_e32 v1, v1, v2 -; GCN-NEXT: buffer_load_dword v6, v0, s[0:3], 0 offen offset:20 -; GCN-NEXT: buffer_load_dword v7, v0, s[0:3], 0 offen offset:24 -; GCN-NEXT: buffer_load_dword v8, v0, s[0:3], 0 offen offset:28 -; GCN-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen offset:32 -; GCN-NEXT: buffer_load_dword v10, v0, s[0:3], 0 offen offset:36 -; GCN-NEXT: buffer_load_dword v11, v0, s[0:3], 0 offen offset:40 -; GCN-NEXT: buffer_load_dword v12, v0, s[0:3], 0 offen offset:44 -; GCN-NEXT: buffer_load_dword v13, v0, s[0:3], 0 offen offset:48 -; GCN-NEXT: buffer_load_dword v14, v0, s[0:3], 0 offen offset:52 -; GCN-NEXT: buffer_load_dword v15, v0, s[0:3], 0 offen offset:56 -; GCN-NEXT: buffer_load_dword v16, v0, s[0:3], 0 offen offset:60 -; GCN-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:8 -; GCN-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:12 +; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:60 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen offset:4 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen offset:8 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen offset:12 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 -; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen offset:20 -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen offset:24 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen offset:28 -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen offset:32 -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen offset:36 -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen offset:40 -; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen offset:44 -; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen offset:48 -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen offset:52 -; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen offset:56 -; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen offset:60 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen offset:4 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen offset:8 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen offset:12 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen offset:16 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen offset:20 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen offset:24 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen offset:28 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen offset:32 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen offset:36 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen offset:40 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen offset:44 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen offset:48 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen offset:52 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen offset:56 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:60 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -57,7 +57,7 @@ ; GFX9-NEXT: v_and_b32_e32 v5, 1, v18 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1 -; GFX9-NEXT: s_and_saveexec_b64 s[10:11], s[4:5] +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX9-NEXT: s_cbranch_execz BB1_3 ; GFX9-NEXT: ; %bb.1: ; %bb19 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v6 @@ -67,7 +67,7 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 2, v2 ; GFX9-NEXT: v_add_u32_e32 v7, v17, v12 -; GFX9-NEXT: s_mov_b64 s[12:13], 0 +; GFX9-NEXT: s_mov_b64 s[10:11], 0 ; GFX9-NEXT: BB1_2: ; %bb23 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 @@ -76,32 +76,32 @@ ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_madak_f32 v8, v8, v4, 0x3727c5ac ; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v18, v8, v5 ; GFX9-NEXT: v_add_u32_e32 v8, v8, v16 -; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v8, v13 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v8, v13 ; GFX9-NEXT: v_mul_lo_u32 v8, v8, v15 ; GFX9-NEXT: v_sub_u32_e32 v19, v9, v18 -; GFX9-NEXT: v_cmp_lt_u32_e64 s[8:9], v19, v14 -; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9] +; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v19, v14 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: v_sub_u32_e32 v12, v12, v18 -; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], vcc ; GFX9-NEXT: v_add_u32_e32 v8, v12, v8 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 2, v[8:9] -; GFX9-NEXT: s_or_b64 s[12:13], s[4:5], s[12:13] -; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], v10, v8 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], v11, v9, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v8, s[6:7], v10, v8 +; GFX9-NEXT: v_addc_co_u32_e64 v9, s[6:7], v11, v9, s[6:7] ; GFX9-NEXT: global_load_dword v8, v[8:9], off +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1 +; GFX9-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] ; GFX9-NEXT: ds_write_b32 v3, v8 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GFX9-NEXT: s_cbranch_execnz BB1_2 ; GFX9-NEXT: BB1_3: ; %Flow3 -; GFX9-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -98,17 +98,17 @@ ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off ; -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} entry: %call = tail call i64 @_Z13get_global_idj(i32 0) %conv = and i64 %call, 255 diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -166,20 +166,18 @@ ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s12, s4 +; SI-NEXT: s_mov_b32 s13, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, v1, v0 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 @@ -187,19 +185,19 @@ ; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 -; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_saddo_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v7, s7 -; VI-NEXT: flat_load_dword v4, v[4:5] -; VI-NEXT: flat_load_dword v5, v[6:7] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: flat_load_dword v5, v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -218,12 +216,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: global_load_dword v4, v[4:5], off -; GFX9-NEXT: global_load_dword v5, v[6:7], off +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -335,20 +333,18 @@ ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s12, s4 +; SI-NEXT: s_mov_b32 s13, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 ; SI-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc @@ -357,57 +353,57 @@ ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0 ; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_saddo_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v7, s7 -; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] -; VI-NEXT: flat_load_dwordx2 v[6:7], v[6:7] -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v8, vcc, v4, v6 -; VI-NEXT: v_addc_u32_e32 v9, vcc, v5, v7, vcc -; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7] -; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[4:5] -; VI-NEXT: flat_store_dwordx2 v[0:1], v[8:9] +; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v9, vcc, v1, v3, vcc +; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1] +; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9] ; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; VI-NEXT: flat_store_byte v[2:3], v0 +; VI-NEXT: flat_store_byte v[6:7], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_saddo_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[4:5], off -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[6:7], off -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_mov_b32_e32 v7, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v5, v7, vcc -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[4:5] -; GFX9-NEXT: global_store_dwordx2 v[0:1], v[8:9], off +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v[4:5], v[8:9], off ; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX9-NEXT: global_store_byte v[2:3], v0, off +; GFX9-NEXT: global_store_byte v[6:7], v0, off ; GFX9-NEXT: s_endpgm %a = load i64, i64 addrspace(1)* %aptr, align 4 %b = load i64, i64 addrspace(1)* %bptr, align 4 @@ -428,20 +424,18 @@ ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s12, s4 +; SI-NEXT: s_mov_b32 s13, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v5, vcc, v1, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 @@ -461,58 +455,58 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v7, s7 -; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] -; VI-NEXT: flat_load_dwordx2 v[6:7], v[6:7] -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v9, vcc, v5, v7 -; VI-NEXT: v_add_u32_e32 v8, vcc, v4, v6 -; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v7 -; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v5 -; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v6 -; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v4 +; VI-NEXT: v_add_u32_e32 v9, vcc, v1, v3 +; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2 +; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1 ; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; VI-NEXT: flat_store_dwordx2 v[0:1], v[8:9] +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 +; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9] +; VI-NEXT: flat_store_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_saddo_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[4:5], off -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[6:7], off -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_mov_b32_e32 v7, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v9, v5, v7 -; GFX9-NEXT: v_add_u32_e32 v8, v4, v6 -; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v5 -; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v4 +; GFX9-NEXT: v_add_u32_e32 v9, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v8, v0, v2 +; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1 ; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: global_store_dwordx2 v[0:1], v[8:9], off +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v[4:5], v[8:9], off +; GFX9-NEXT: global_store_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_endpgm %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4 %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll --- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -173,9 +173,9 @@ ; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}} ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}} ; CI-NOHSA-NOT: v_add -; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 ; CI-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}} ; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} +; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} @@ -205,8 +205,8 @@ ; SI: s_mov_b32 {{s[0-9]+}}, 0x13480 ; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 ; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32 -; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48 ; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], {{s[0-9]+}} addr64 +; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48 ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}} ; CI-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir --- a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir @@ -36,10 +36,10 @@ ; CHECK: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_LO16 */, def dead [[COPY1]], 851978 /* regdef:VGPR_LO16 */, def dead [[COPY]].sub1, 2147483657 /* reguse tiedto:$0 */, [[COPY1]], 2147549193 /* reguse tiedto:$1 */, [[COPY]].sub1 ; CHECK: %11.sub0:vreg_512 = COPY [[COPY]].sub0 ; CHECK: %11.sub3:vreg_512 = COPY [[COPY]].sub3 - ; CHECK: dead %10:vgpr_32 = V_ADD_CO_U32_e32 4, [[V_MOV_B32_e32_1]], implicit-def dead $vcc, implicit $exec ; CHECK: %11.sub2:vreg_512 = COPY undef [[V_MOV_B32_e32_]] ; CHECK: %11.sub5:vreg_512 = COPY undef [[V_MOV_B32_e32_]] ; CHECK: [[COPY2:%[0-9]+]]:vreg_512 = COPY %11 + ; CHECK: dead %10:vgpr_32 = V_ADD_CO_U32_e32 4, [[V_MOV_B32_e32_1]], implicit-def dead $vcc, implicit $exec ; CHECK: S_BRANCH %bb.1 bb.0: liveins: $sgpr6_sgpr7 diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir --- a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir @@ -25,6 +25,10 @@ ; CHECK: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, 0, 0, implicit $exec ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, 0, 0, implicit $exec + ; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]] + ; CHECK: undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $mode, implicit $exec + ; CHECK: dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $mode, implicit $exec + ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec ; CHECK: undef %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec ; CHECK: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF @@ -32,11 +36,7 @@ ; CHECK: undef %11.sub1:vreg_64 = IMPLICIT_DEF ; CHECK: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; CHECK: undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $mode, implicit $exec - ; CHECK: dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $mode, implicit $exec ; CHECK: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec ; CHECK: undef %19.sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $mode, implicit $exec ; CHECK: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -203,14 +203,14 @@ ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s7 -; GCN-NEXT: s_mov_b32 s6, s2 -; GCN-NEXT: s_mov_b32 s7, s3 -; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 30, v1 @@ -224,14 +224,14 @@ ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; TONGA-NEXT: s_mov_b32 s0, s4 ; TONGA-NEXT: s_mov_b32 s1, s5 -; TONGA-NEXT: s_mov_b32 s4, s6 -; TONGA-NEXT: s_mov_b32 s5, s7 -; TONGA-NEXT: s_mov_b32 s6, s2 -; TONGA-NEXT: s_mov_b32 s7, s3 -; TONGA-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; TONGA-NEXT: v_lshrrev_b32_e32 v1, 30, v1 @@ -694,14 +694,14 @@ ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s7 -; GCN-NEXT: s_mov_b32 s6, s2 -; GCN-NEXT: s_mov_b32 s7, s3 -; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v1 @@ -719,14 +719,14 @@ ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; TONGA-NEXT: s_mov_b32 s0, s4 ; TONGA-NEXT: s_mov_b32 s1, s5 -; TONGA-NEXT: s_mov_b32 s4, s6 -; TONGA-NEXT: s_mov_b32 s5, s7 -; TONGA-NEXT: s_mov_b32 s6, s2 -; TONGA-NEXT: s_mov_b32 s7, s3 -; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v1 @@ -744,14 +744,14 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s2 -; GFX9-NEXT: s_mov_b32 s7, s3 -; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v1 @@ -1073,16 +1073,16 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s4, 0x4f7ffffe +; GFX9-NEXT: s_mov_b32 s6, s10 +; GFX9-NEXT: s_mov_b32 s7, s11 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s2 +; GFX9-NEXT: s_mov_b32 s5, s3 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GFX9-NEXT: s_mov_b32 s8, s0 ; GFX9-NEXT: s_mov_b32 s9, s1 -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 -; GFX9-NEXT: s_mov_b32 s2, s10 -; GFX9-NEXT: s_mov_b32 s3, s11 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1120,14 +1120,14 @@ ; GFX9-NEXT: v_cvt_f32_u32_e32 v14, v7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v12, v12 -; GFX9-NEXT: v_mul_f32_e32 v8, s4, v8 +; GFX9-NEXT: v_mul_f32_e32 v8, s2, v8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v14 ; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GFX9-NEXT: v_mul_f32_e32 v10, s4, v10 -; GFX9-NEXT: v_mul_f32_e32 v12, s4, v12 +; GFX9-NEXT: v_mul_f32_e32 v10, s2, v10 +; GFX9-NEXT: v_mul_f32_e32 v12, s2, v12 ; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v10 ; GFX9-NEXT: v_sub_u32_e32 v9, 0, v4 -; GFX9-NEXT: v_mul_f32_e32 v14, s4, v14 +; GFX9-NEXT: v_mul_f32_e32 v14, s2, v14 ; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12 ; GFX9-NEXT: v_mul_lo_u32 v9, v9, v8 ; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v14 @@ -1330,14 +1330,14 @@ ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s7 -; GCN-NEXT: s_mov_b32 s6, s2 -; GCN-NEXT: s_mov_b32 s7, s3 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v1 @@ -1363,14 +1363,14 @@ ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; TONGA-NEXT: s_mov_b32 s0, s4 ; TONGA-NEXT: s_mov_b32 s1, s5 -; TONGA-NEXT: s_mov_b32 s4, s6 -; TONGA-NEXT: s_mov_b32 s5, s7 -; TONGA-NEXT: s_mov_b32 s6, s2 -; TONGA-NEXT: s_mov_b32 s7, s3 -; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v1 @@ -1396,14 +1396,14 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s2 -; GFX9-NEXT: s_mov_b32 s7, s3 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v1 @@ -1619,17 +1619,17 @@ ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GCN-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2 +; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 +; GCN-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:6 ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s7 -; GCN-NEXT: s_mov_b32 s6, s2 -; GCN-NEXT: s_mov_b32 s7, s3 -; GCN-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GCN-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:2 -; GCN-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4 -; GCN-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:6 ; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1660,17 +1660,17 @@ ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; TONGA-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2 +; TONGA-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 +; TONGA-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:6 ; TONGA-NEXT: s_mov_b32 s0, s4 ; TONGA-NEXT: s_mov_b32 s1, s5 -; TONGA-NEXT: s_mov_b32 s4, s6 -; TONGA-NEXT: s_mov_b32 s5, s7 -; TONGA-NEXT: s_mov_b32 s6, s2 -; TONGA-NEXT: s_mov_b32 s7, s3 -; TONGA-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; TONGA-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:2 -; TONGA-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4 -; TONGA-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:6 ; TONGA-NEXT: s_waitcnt vmcnt(2) ; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; TONGA-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1701,17 +1701,17 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2 +; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 +; GFX9-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:6 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s2 -; GFX9-NEXT: s_mov_b32 s7, s3 -; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GFX9-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:2 -; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4 -; GFX9-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1802,17 +1802,17 @@ ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GCN-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:2 +; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 +; GCN-NEXT: buffer_load_sbyte v3, off, s[8:11], 0 offset:6 ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s7 -; GCN-NEXT: s_mov_b32 s6, s2 -; GCN-NEXT: s_mov_b32 s7, s3 -; GCN-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GCN-NEXT: buffer_load_sbyte v1, off, s[4:7], 0 offset:2 -; GCN-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4 -; GCN-NEXT: buffer_load_sbyte v3, off, s[4:7], 0 offset:6 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GCN-NEXT: v_or_b32_e32 v2, v2, v4 @@ -1840,17 +1840,17 @@ ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s3, 0xf000 ; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 +; TONGA-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; TONGA-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:2 +; TONGA-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 +; TONGA-NEXT: buffer_load_sbyte v3, off, s[8:11], 0 offset:6 ; TONGA-NEXT: s_mov_b32 s0, s4 ; TONGA-NEXT: s_mov_b32 s1, s5 -; TONGA-NEXT: s_mov_b32 s4, s6 -; TONGA-NEXT: s_mov_b32 s5, s7 -; TONGA-NEXT: s_mov_b32 s6, s2 -; TONGA-NEXT: s_mov_b32 s7, s3 -; TONGA-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; TONGA-NEXT: buffer_load_sbyte v1, off, s[4:7], 0 offset:2 -; TONGA-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4 -; TONGA-NEXT: buffer_load_sbyte v3, off, s[4:7], 0 offset:6 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; TONGA-NEXT: v_or_b32_e32 v2, v2, v4 @@ -2214,16 +2214,14 @@ ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GCN-NEXT: s_mov_b32 s0, 0x1389c755 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s7 -; GCN-NEXT: s_mov_b32 s6, s2 -; GCN-NEXT: s_mov_b32 s7, s3 +; GCN-NEXT: s_mov_b32 s4, 0x1389c755 +; GCN-NEXT: s_mov_b32 s0, s6 +; GCN-NEXT: s_mov_b32 s1, s7 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_hi_i32 v0, v0, s0 -; GCN-NEXT: v_mul_hi_i32 v1, v1, s0 -; GCN-NEXT: v_mul_hi_i32 v2, v2, s0 -; GCN-NEXT: v_mul_hi_i32 v3, v3, s0 +; GCN-NEXT: v_mul_hi_i32 v0, v0, s4 +; GCN-NEXT: v_mul_hi_i32 v1, v1, s4 +; GCN-NEXT: v_mul_hi_i32 v2, v2, s4 +; GCN-NEXT: v_mul_hi_i32 v3, v3, s4 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 31, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 12, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 31, v1 @@ -2236,7 +2234,7 @@ ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: scalarize_mulhs_4xi32: @@ -2248,16 +2246,14 @@ ; TONGA-NEXT: s_mov_b32 s0, s4 ; TONGA-NEXT: s_mov_b32 s1, s5 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; TONGA-NEXT: s_mov_b32 s0, 0x1389c755 -; TONGA-NEXT: s_mov_b32 s4, s6 -; TONGA-NEXT: s_mov_b32 s5, s7 -; TONGA-NEXT: s_mov_b32 s6, s2 -; TONGA-NEXT: s_mov_b32 s7, s3 +; TONGA-NEXT: s_mov_b32 s4, 0x1389c755 +; TONGA-NEXT: s_mov_b32 s0, s6 +; TONGA-NEXT: s_mov_b32 s1, s7 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_mul_hi_i32 v0, v0, s0 -; TONGA-NEXT: v_mul_hi_i32 v1, v1, s0 -; TONGA-NEXT: v_mul_hi_i32 v2, v2, s0 -; TONGA-NEXT: v_mul_hi_i32 v3, v3, s0 +; TONGA-NEXT: v_mul_hi_i32 v0, v0, s4 +; TONGA-NEXT: v_mul_hi_i32 v1, v1, s4 +; TONGA-NEXT: v_mul_hi_i32 v2, v2, s4 +; TONGA-NEXT: v_mul_hi_i32 v3, v3, s4 ; TONGA-NEXT: v_lshrrev_b32_e32 v4, 31, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 12, v0 ; TONGA-NEXT: v_lshrrev_b32_e32 v5, 31, v1 @@ -2270,7 +2266,7 @@ ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5 ; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7 -; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: scalarize_mulhs_4xi32: @@ -2282,16 +2278,14 @@ ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; GFX9-NEXT: s_mov_b32 s0, 0x1389c755 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: s_mov_b32 s6, s2 -; GFX9-NEXT: s_mov_b32 s7, s3 +; GFX9-NEXT: s_mov_b32 s4, 0x1389c755 +; GFX9-NEXT: s_mov_b32 s0, s6 +; GFX9-NEXT: s_mov_b32 s1, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_hi_i32 v0, v0, s0 -; GFX9-NEXT: v_mul_hi_i32 v1, v1, s0 -; GFX9-NEXT: v_mul_hi_i32 v2, v2, s0 -; GFX9-NEXT: v_mul_hi_i32 v3, v3, s0 +; GFX9-NEXT: v_mul_hi_i32 v0, v0, s4 +; GFX9-NEXT: v_mul_hi_i32 v1, v1, s4 +; GFX9-NEXT: v_mul_hi_i32 v2, v2, s4 +; GFX9-NEXT: v_mul_hi_i32 v3, v3, s4 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 31, v1 @@ -2304,7 +2298,7 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v1, v5 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v7 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: scalarize_mulhs_4xi32: diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -1867,56 +1867,56 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(i64 addrspace(1)* %out, i64 %x) { ; GCN-LABEL: s_test_sdiv24_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-NEXT: s_mov_b32 s7, 0x41c00000 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i32 s4, s6, 30 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-NEXT: s_mov_b32 s3, 0x41c00000 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_ashr_i32 s0, s2, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_or_b32 s6, s4, 1 -; GCN-NEXT: v_mul_f32_e32 v1, s7, v1 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_or_b32 s2, s0, 1 +; GCN-NEXT: v_mul_f32_e32 v1, s3, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mad_f32 v2, -v1, v0, s7 +; GCN-NEXT: v_mad_f32 v2, -v1, v0, s3 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| -; GCN-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s4, s6, 0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v1 +; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_cselect_b32 s0, s2, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv24_k_num_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-IR-NEXT: s_mov_b32 s7, 0x41c00000 -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i32 s4, s6, 30 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-IR-NEXT: s_mov_b32 s3, 0x41c00000 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_ashr_i32 s0, s2, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-IR-NEXT: s_mov_b32 s1, s5 -; GCN-IR-NEXT: s_or_b32 s6, s4, 1 -; GCN-IR-NEXT: v_mul_f32_e32 v1, s7, v1 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_or_b32 s2, s0, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v1, s3, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s7 +; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s3 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| -; GCN-IR-NEXT: s_cmp_lg_u32 s4, 0 -; GCN-IR-NEXT: s_cselect_b32 s4, s6, 0 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GCN-IR-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-IR-NEXT: s_cselect_b32 s0, s2, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v1 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %x.shr = ashr i64 %x, 40 %result = sdiv i64 24, %x.shr diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -73,7 +73,7 @@ ; GCN-LABEL: {{^}}mul_v2i16: ; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}} ; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}} -; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]] +; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]] ; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]] ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]] ; NOSDWA-NOT: v_mul_u32_u24_sdwa diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -52,25 +52,25 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: s_mov_b32 s16, s8 -; VI-NEXT: s_mov_b32 s17, s9 +; VI-NEXT: s_mov_b32 s16, s6 +; VI-NEXT: s_mov_b32 s17, s7 +; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_mov_b32 s20, s8 +; VI-NEXT: s_mov_b32 s21, s9 ; VI-NEXT: s_mov_b32 s8, s10 ; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_mov_b32 s22, s2 +; VI-NEXT: s_mov_b32 s23, s3 ; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 ; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 ; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -137,21 +137,21 @@ ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 ; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -216,21 +216,21 @@ ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 ; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -295,26 +295,26 @@ ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_ushort v3, off, s[4:7], 0 -; VI-NEXT: v_mov_b32_e32 v2, 0x3800 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v3, 0x3800 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm half addrspace(1)* %r, @@ -375,26 +375,26 @@ ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_ushort v3, off, s[4:7], 0 -; VI-NEXT: v_mov_b32_e32 v2, 0x3800 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v3, 0x3800 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm half addrspace(1)* %r, @@ -474,25 +474,25 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: s_mov_b32 s16, s8 -; VI-NEXT: s_mov_b32 s17, s9 +; VI-NEXT: s_mov_b32 s16, s6 +; VI-NEXT: s_mov_b32 s17, s7 +; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_mov_b32 s20, s8 +; VI-NEXT: s_mov_b32 s21, s9 ; VI-NEXT: s_mov_b32 s8, s10 ; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_mov_b32 s22, s2 +; VI-NEXT: s_mov_b32 s23, s3 ; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; VI-NEXT: buffer_load_dword v3, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -534,15 +534,15 @@ ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 @@ -580,22 +580,22 @@ ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 -; VI-NEXT: s_movk_i32 s0, 0x3900 +; VI-NEXT: s_movk_i32 s2, 0x3900 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0 @@ -603,7 +603,7 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_cmp_lt_f16_e32 vcc, s0, v3 +; VI-NEXT: v_cmp_lt_f16_e32 vcc, s2, v3 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -633,15 +633,15 @@ ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 @@ -679,22 +679,22 @@ ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 -; VI-NEXT: s_movk_i32 s0, 0x3900 +; VI-NEXT: s_movk_i32 s2, 0x3900 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0 @@ -702,7 +702,7 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_cmp_gt_f16_e32 vcc, s0, v3 +; VI-NEXT: v_cmp_gt_f16_e32 vcc, s2, v3 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -731,38 +731,39 @@ ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s16, s4 -; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 -; SI-NEXT: buffer_load_dword v3, off, s[16:19], 0 -; SI-NEXT: v_mov_b32_e32 v2, 0x3f200000 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v5 -; SI-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc -; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v3 +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cndmask_b32_e32 v1, 0.5, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, 0.5, v2, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -777,32 +778,33 @@ ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0 -; VI-NEXT: buffer_load_dword v4, off, s[12:15], 0 -; VI-NEXT: v_mov_b32_e32 v2, 0x3800 -; VI-NEXT: v_mov_b32_e32 v3, 0x3900 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v3, 0x3800 +; VI-NEXT: v_mov_b32_e32 v4, 0x3900 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v4 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v6, v5 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 @@ -830,41 +832,41 @@ ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s16, s4 -; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 -; SI-NEXT: buffer_load_dword v3, off, s[16:19], 0 -; SI-NEXT: v_mov_b32_e32 v2, 0x3f200000 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5 -; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 -; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -877,32 +879,33 @@ ; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s0, s2 -; VI-NEXT: s_mov_b32 s1, s3 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s5 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 ; VI-NEXT: s_mov_b32 s4, s6 ; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0 -; VI-NEXT: buffer_load_dword v4, off, s[12:15], 0 -; VI-NEXT: v_mov_b32_e32 v2, 0x3800 -; VI-NEXT: v_mov_b32_e32 v3, 0x3900 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v3, 0x3800 +; VI-NEXT: v_mov_b32_e32 v4, 0x3900 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v4 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 diff --git a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll @@ -54,8 +54,8 @@ ; after 64-bit shift is split. ; GCN-LABEL: {{^}}lshr_and_i64_35: -; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_load_dword v[[LO:[0-9]+]] +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN-DAG: buffer_load_dword v[[LO:[0-9]+]] ; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[LO]], 8, 23 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} define amdgpu_kernel void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -13,14 +13,14 @@ ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s7 -; GCN-NEXT: s_mov_b32 s6, s2 -; GCN-NEXT: s_mov_b32 s7, s3 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshl_b32_e32 v1, v1, v3 ; GCN-NEXT: v_lshl_b32_e32 v0, v0, v2 @@ -59,15 +59,15 @@ ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s7 -; GCN-NEXT: s_mov_b32 s6, s2 -; GCN-NEXT: s_mov_b32 s7, s3 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshl_b32_e32 v3, v3, v7 ; GCN-NEXT: v_lshl_b32_e32 v2, v2, v6 @@ -411,23 +411,23 @@ ; GCN-NEXT: s_mov_b32 s8, s6 ; GCN-NEXT: s_mov_b32 s9, s7 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_mov_b64 s[12:13], s[6:7] ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s14, 0 ; GCN-NEXT: s_mov_b32 s15, s3 -; GCN-NEXT: s_mov_b64 s[12:13], s[6:7] ; GCN-NEXT: buffer_load_dword v2, off, s[8:11], 0 ; GCN-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 offset:4 +; GCN-NEXT: s_mov_b32 s6, 0xffff ; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_mov_b32 s4, 0xffff ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GCN-NEXT: v_and_b32_e32 v0, s4, v0 +; GCN-NEXT: v_and_b32_e32 v0, s6, v0 ; GCN-NEXT: v_lshl_b32_e32 v0, v2, v0 ; GCN-NEXT: v_lshl_b32_e32 v1, v1, v3 -; GCN-NEXT: v_and_b32_e32 v0, s4, v0 +; GCN-NEXT: v_and_b32_e32 v0, s6, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -490,14 +490,14 @@ ; GCN-NEXT: s_mov_b64 s[0:1], s[6:7] ; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; GCN-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 -; GCN-NEXT: s_mov_b32 s8, 0xffff +; GCN-NEXT: s_mov_b32 s0, 0xffff ; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v8, s8, v4 +; GCN-NEXT: v_and_b32_e32 v8, s0, v4 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_and_b32_e32 v9, s8, v5 +; GCN-NEXT: v_and_b32_e32 v9, s0, v5 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GCN-NEXT: v_lshl_b32_e32 v5, v7, v5 @@ -505,9 +505,9 @@ ; GCN-NEXT: v_lshl_b32_e32 v4, v6, v4 ; GCN-NEXT: v_lshl_b32_e32 v2, v2, v8 ; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_and_b32_e32 v3, s8, v3 +; GCN-NEXT: v_and_b32_e32 v3, s0, v3 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_and_b32_e32 v2, s8, v2 +; GCN-NEXT: v_and_b32_e32 v2, s0, v2 ; GCN-NEXT: v_or_b32_e32 v3, v3, v5 ; GCN-NEXT: v_or_b32_e32 v2, v2, v4 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 @@ -732,17 +732,17 @@ ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; GCN-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48 ; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s7 -; GCN-NEXT: s_mov_b32 s6, s2 -; GCN-NEXT: s_mov_b32 s7, s3 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GCN-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; GCN-NEXT: buffer_load_dwordx4 v[11:14], off, s[4:7], 0 offset:48 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v10 ; GCN-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -86,23 +86,23 @@ ; VI-LABEL: v_shl_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v5, v[0:1] +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: flat_load_dword v1, v[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v4, v1, v0 -; VI-NEXT: v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v0, v4, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v5 +; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_shl_v2i16: @@ -116,17 +116,17 @@ ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 -; CI-NEXT: s_mov_b32 s8, 0xffff +; CI-NEXT: s_mov_b32 s0, 0xffff ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v5, s8, v3 +; CI-NEXT: v_and_b32_e32 v5, s0, v3 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_lshl_b32_e32 v3, v4, v3 ; CI-NEXT: v_lshl_b32_e32 v2, v2, v5 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_and_b32_e32 v2, s8, v2 +; CI-NEXT: v_and_b32_e32 v2, s0, v2 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm @@ -170,39 +170,39 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, s0, v0 -; VI-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_lshlrev_b16_e32 v4, s0, v3 +; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v4, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: shl_v_s_v2i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s0, s[0:1], 0xd -; CI-NEXT: s_mov_b32 s8, 0xffff +; CI-NEXT: s_load_dword s8, s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s9, s0, 16 -; CI-NEXT: s_and_b32 s10, s0, s8 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_mov_b32 s0, 0xffff +; CI-NEXT: s_lshr_b32 s1, s8, 16 +; CI-NEXT: s_and_b32 s8, s8, s0 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, s10, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, s9, v3 -; CI-NEXT: v_and_b32_e32 v2, s8, v2 +; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, s1, v3 +; CI-NEXT: v_and_b32_e32 v2, s0, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -245,17 +245,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e64 v1, v0, s0 -; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_lshlrev_b16_e64 v4, v3, s0 +; VI-NEXT: v_lshlrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v4, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: shl_s_v_v2i16: @@ -270,12 +270,12 @@ ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_mov_b32 s0, 0xffff -; CI-NEXT: s_lshr_b32 s9, s8, 16 +; CI-NEXT: s_lshr_b32 s1, s8, 16 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_and_b32_e32 v3, s0, v2 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_lshl_b32_e32 v2, s9, v2 +; CI-NEXT: v_lshl_b32_e32 v2, s1, v2 ; CI-NEXT: v_lshl_b32_e32 v3, s8, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_and_b32_e32 v3, s0, v3 @@ -319,15 +319,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e64 v1, v0, 8 -; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_lshlrev_b16_e64 v2, v3, 8 +; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: shl_imm_v_v2i16: @@ -387,16 +387,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; VI-NEXT: v_and_b32_e32 v1, 0xff000000, v1 -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xff000000, v2 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: shl_v_imm_v2i16: @@ -429,45 +429,45 @@ ; GFX9-LABEL: v_shl_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v5 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v4 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v3 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v2 +; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_shl_v4i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v6, v5, v1 -; VI-NEXT: v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b16_e32 v5, v4, v0 -; VI-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1 +; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0 +; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v1, v6, v1 -; VI-NEXT: v_or_b32_e32 v0, v5, v0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_or_b32_e32 v0, v3, v0 +; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_shl_v4i16: @@ -481,14 +481,14 @@ ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 -; CI-NEXT: s_mov_b32 s8, 0xffff +; CI-NEXT: s_mov_b32 s0, 0xffff ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v8, s8, v4 +; CI-NEXT: v_and_b32_e32 v8, s0, v4 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; CI-NEXT: v_and_b32_e32 v9, s8, v5 +; CI-NEXT: v_and_b32_e32 v9, s0, v5 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; CI-NEXT: v_lshl_b32_e32 v5, v7, v5 @@ -496,9 +496,9 @@ ; CI-NEXT: v_lshl_b32_e32 v4, v6, v4 ; CI-NEXT: v_lshl_b32_e32 v2, v2, v8 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; CI-NEXT: v_and_b32_e32 v3, s8, v3 +; CI-NEXT: v_and_b32_e32 v3, s0, v3 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; CI-NEXT: v_and_b32_e32 v2, s8, v2 +; CI-NEXT: v_and_b32_e32 v2, s0, v2 ; CI-NEXT: v_or_b32_e32 v3, v3, v5 ; CI-NEXT: v_or_b32_e32 v2, v2, v4 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 @@ -539,21 +539,21 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_mov_b32 s0, 0xff000000 +; VI-NEXT: s_mov_b32 s2, 0xff000000 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: v_and_b32_e32 v0, s0, v0 +; VI-NEXT: v_and_b32_e32 v0, s2, v0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_and_b32_e32 v4, s0, v4 +; VI-NEXT: v_and_b32_e32 v4, s2, v4 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NEXT: v_or_b32_e32 v0, v5, v0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -569,14 +569,14 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_mov_b32 s8, 0xff00 +; CI-NEXT: s_mov_b32 s0, 0xff00 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3 ; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; CI-NEXT: v_and_b32_e32 v4, s8, v4 +; CI-NEXT: v_and_b32_e32 v4, s0, v4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; CI-NEXT: v_and_b32_e32 v3, s8, v3 +; CI-NEXT: v_and_b32_e32 v3, s0, v3 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; CI-NEXT: v_or_b32_e32 v3, v3, v4 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -112,17 +112,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_subrev_u32_e32 v1, vcc, 64, v4 +; VI-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_subrev_u32_e32 v0, vcc, 64, v0 -; VI-NEXT: flat_store_dword v[2:3], v1 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_subrev_u32_e32 v3, vcc, 64, v4 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_x_sub_64_multi_use: @@ -133,17 +133,17 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v4 +; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u32_e32 v0, 64, v0 -; GFX9-NEXT: global_store_dword v[2:3], v1, off -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_subrev_u32_e32 v3, 64, v4 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_x_sub_64_multi_use: @@ -945,17 +945,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v3, v[0:1] ; VI-NEXT: flat_load_ushort v4, v[0:1] -; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_subrev_u16_e32 v1, 64, v4 +; VI-NEXT: v_subrev_u16_e32 v2, 64, v3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_subrev_u16_e32 v0, 64, v0 -; VI-NEXT: flat_store_short v[2:3], v1 -; VI-NEXT: flat_store_short v[2:3], v0 +; VI-NEXT: v_subrev_u16_e32 v3, 64, v4 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: flat_store_short v[0:1], v3 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i16_x_sub_64_multi_use: @@ -966,17 +966,17 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v3, v[0:1], off ; GFX9-NEXT: global_load_ushort v4, v[0:1], off -; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v4 +; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u16_e32 v0, 64, v0 -; GFX9-NEXT: global_store_short v[2:3], v1, off -; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: v_subrev_u16_e32 v3, 64, v4 +; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: global_store_short v[0:1], v3, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i16_x_sub_64_multi_use: @@ -1037,20 +1037,20 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, 64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, 64 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_subrev_u16_e32 v0, 64, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_subrev_u16_e32 v3, 64, v3 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_64_64: @@ -1125,15 +1125,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v1, -7, v0 -; VI-NEXT: v_sub_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_add_u16_e32 v2, -7, v3 +; VI-NEXT: v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_7_64: @@ -1204,20 +1204,20 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0xffffff85 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, 0xffffff85 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_subrev_u16_e32 v0, 64, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_add_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_subrev_u16_e32 v3, 64, v3 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_64_123: @@ -1292,15 +1292,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; VI-NEXT: v_add_u16_e32 v0, -7, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; VI-NEXT: v_add_u16_e32 v3, -7, v3 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_7_0: @@ -1608,20 +1608,20 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, 32 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, 32 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_subrev_u16_e32 v0, 32, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_subrev_u16_e32 v3, 32, v3 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32: @@ -1772,15 +1772,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; VI-NEXT: v_subrev_u16_e32 v0, 32, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; VI-NEXT: v_subrev_u16_e32 v3, 32, v3 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_0: @@ -1856,15 +1856,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v1, -16, v0 -; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_add_u16_e32 v2, -16, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16: @@ -2015,15 +2015,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; VI-NEXT: v_add_u16_e32 v0, -16, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; VI-NEXT: v_add_u16_e32 v3, -16, v3 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg16_0: @@ -2094,20 +2094,20 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_movk_i32 s2, 0xc400 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_movk_i32 s0, 0xc400 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v1, s0, v0 -; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_add_u16_e32 v2, s2, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_fpone: @@ -2179,20 +2179,20 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_movk_i32 s2, 0x4400 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_movk_i32 s0, 0x4400 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v1, s0, v0 -; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_add_u16_e32 v2, s2, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfpone: @@ -2264,20 +2264,20 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_movk_i32 s2, 0x4000 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_movk_i32 s0, 0x4000 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v1, s0, v0 -; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_add_u16_e32 v2, s2, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo: @@ -2349,20 +2349,20 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_movk_i32 s2, 0xc000 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_movk_i32 s0, 0xc000 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v1, s0, v0 -; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: v_add_u16_e32 v2, s2, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo: diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -399,14 +399,14 @@ ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v1, 24, v0 ; SI-NEXT: v_bfe_i32 v2, v0, 16, 8 @@ -423,14 +423,14 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0 ; VI-NEXT: v_ashrrev_i32_e32 v2, 24, v0 @@ -523,14 +523,14 @@ ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ashr_i64 v[2:3], v[0:1], 48 ; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0 @@ -547,14 +547,14 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ashrrev_i32_e32 v3, 16, v0 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -30,26 +30,24 @@ ; VI-LABEL: v_test_sub_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: flat_load_dword v1, v[2:3] -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_sub_u16_e32 v2, v0, v1 ; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -88,14 +86,14 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_load_dword s4, s[6:7], 0x0 -; VI-NEXT: s_load_dword s5, s[8:9], 0x0 +; VI-NEXT: s_load_dword s6, s[8:9], 0x0 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s6, s4, 16 -; VI-NEXT: s_lshr_b32 s7, s5, 16 -; VI-NEXT: s_sub_i32 s4, s4, s5 -; VI-NEXT: s_sub_i32 s5, s6, s7 +; VI-NEXT: s_lshr_b32 s5, s4, 16 +; VI-NEXT: s_lshr_b32 s7, s6, 16 +; VI-NEXT: s_sub_i32 s4, s4, s6 +; VI-NEXT: s_sub_i32 s5, s5, s7 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -185,22 +183,20 @@ ; ; VI-LABEL: v_test_sub_v2i16_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v2, 0xfffffe38 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_mov_b32_e32 v1, 0xfffffe38 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v1, 0xffffff85, v0 -; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_add_u16_e32 v2, 0xffffff85, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -235,22 +231,20 @@ ; ; VI-LABEL: v_test_sub_v2i16_neg_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v2, 0x3df -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_mov_b32_e32 v1, 0x3df +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v1, 0x34d, v0 -; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_add_u16_e32 v2, 0x34d, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -283,22 +277,20 @@ ; ; VI-LABEL: v_test_sub_v2i16_inline_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v2, 1 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_mov_b32_e32 v1, 1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v1, 1, v0 -; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_add_u16_e32 v2, 1, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -331,17 +323,15 @@ ; ; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; VI-NEXT: v_subrev_u16_e32 v0, 32, v0 @@ -411,50 +401,46 @@ ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: global_load_dword v1, v[2:3], off -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_sub_u16_e32 v0, v1, v2 ; VI-NEXT: v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid @@ -473,54 +459,50 @@ ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s8, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[4:5], off -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: flat_load_dword v2, v[2:3] ; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_sub_u16_e32 v0, v4, v2 ; VI-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid @@ -539,52 +521,48 @@ ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: global_load_dword v1, v[2:3], off -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: flat_load_dword v1, v[2:3] -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_sub_u16_e32 v0, v0, v1 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 ; VI-NEXT: v_bfe_i32 v1, v2, 0, 16 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid @@ -603,21 +581,19 @@ ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: global_load_dword v1, v[2:3], off -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 @@ -625,27 +601,25 @@ ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: flat_load_dword v1, v[2:3] -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_sub_u16_e32 v0, v0, v1 @@ -653,7 +627,7 @@ ; VI-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -106,13 +106,13 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s2, s[4:5], 0x0 +; VI-NEXT: s_load_dword s3, s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_load_dword s0, s[4:5], 0x0 -; VI-NEXT: s_load_dword s1, s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i16 s0, s0 -; VI-NEXT: s_sext_i32_i16 s1, s1 +; VI-NEXT: s_sext_i32_i16 s0, s2 +; VI-NEXT: s_sext_i32_i16 s1, s3 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mul_i32_i24_e32 v2, s1, v2 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -1824,46 +1824,46 @@ define amdgpu_kernel void @s_test_udiv24_k_den_i64(i64 addrspace(1)* %out, i64 %x) { ; GCN-LABEL: s_test_udiv24_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0x46b6fe00 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_lshr_b32 s0, s7, 8 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_lshr_b32 s2, s3, 8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: s_mov_b32 s2, 0x46b6fe00 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 -; GCN-NEXT: v_mad_f32 v0, -v1, s6, v0 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s6 +; GCN-NEXT: v_mad_f32 v0, -v1, s2, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s2 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s6, 0x46b6fe00 -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: s_lshr_b32 s0, s7, 8 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-IR-NEXT: s_mov_b32 s2, 0x46b6fe00 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 -; GCN-IR-NEXT: v_mad_f32 v0, -v1, s6, v0 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s6 +; GCN-IR-NEXT: v_mad_f32 v0, -v1, s2, v0 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s2 ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %x.shr = lshr i64 %x, 40 %result = udiv i64 %x.shr, 23423 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -1479,52 +1479,52 @@ define amdgpu_kernel void @s_test_urem24_k_den_i64(i64 addrspace(1)* %out, i64 %x) { ; GCN-LABEL: s_test_urem24_k_den_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s1, 0x46b6fe00 -; GCN-NEXT: s_movk_i32 s0, 0x5b7f -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s4, 0x46b6fe00 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s6, s7, 8 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GCN-NEXT: s_lshr_b32 s2, s3, 8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: s_movk_i32 s3, 0x5b7f +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 -; GCN-NEXT: v_mad_f32 v0, -v1, s1, v0 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s1 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: v_mad_f32 v0, -v1, s4, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s3 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem24_k_den_i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b32 s1, 0x46b6fe00 -; GCN-IR-NEXT: s_movk_i32 s0, 0x5b7f -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s4, 0x46b6fe00 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s6, s7, 8 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-IR-NEXT: s_movk_i32 s3, 0x5b7f +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 -; GCN-IR-NEXT: v_mad_f32 v0, -v1, s1, v0 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s1 -; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: v_mad_f32 v0, -v1, s4, v0 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 +; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s3 ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %x.shr = lshr i64 %x, 40 %result = urem i64 %x.shr, 23423 diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -80,12 +80,12 @@ ; SI-NEXT: s_mov_b32 s9, s11 ; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s14, s2 ; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 -; SI-NEXT: v_mov_b32_e32 v2, 0x41200000 +; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 +; SI-NEXT: v_mov_b32_e32 v3, 0x41200000 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_mov_b32 s8, s6 @@ -95,11 +95,11 @@ ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_madak_f32 v1, v0, v1, 0x41200000 -; SI-NEXT: v_mac_f32_e32 v2, v0, v3 +; SI-NEXT: v_mac_f32_e32 v3, v0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: buffer_store_short v1, off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -119,24 +119,22 @@ ; VI-NEXT: s_mov_b32 s9, s11 ; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 -; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 ; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 -; VI-NEXT: v_mov_b32_e32 v2, 0x4900 +; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 +; VI-NEXT: v_mov_b32_e32 v3, 0x4900 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_madak_f16 v1, v0, v1, 0x4900 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mac_f16_e32 v2, v0, v3 +; VI-NEXT: v_mac_f16_e32 v3, v0, v2 ; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 -; VI-NEXT: buffer_store_short v2, off, s[4:7], 0 +; VI-NEXT: buffer_store_short v3, off, s[8:11], 0 ; VI-NEXT: s_endpgm half addrspace(1)* %r0, half addrspace(1)* %r1, diff --git a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll --- a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll @@ -36,33 +36,33 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[0:1], s[6:7] -; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; GCN-NEXT: v_mov_b32_e32 v5, v2 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 addr64 -; GCN-NEXT: v_mov_b32_e32 v6, s8 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[0:3], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s9, 3 +; GCN-NEXT: v_mov_b32_e32 v7, v5 ; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc ; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s9, 2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc ; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s9, 1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s10, 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s10, 2 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s10, 3 +; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s10, 2 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GCN-NEXT: buffer_store_dword v0, v[4:5], s[4:7], 0 addr64 +; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s10, 3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN-NEXT: buffer_store_dword v0, v[6:7], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -108,12 +108,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v[2:3], off ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -128,11 +128,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -6,17 +6,17 @@ ; SI-LABEL: widen_i16_constant_load: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s5, s4 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_load_dword s1, s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_addk_i32 s0, 0x3e7 -; SI-NEXT: s_or_b32 s0, s0, 4 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_addk_i32 s1, 0x3e7 +; SI-NEXT: s_or_b32 s4, s1, 4 +; SI-NEXT: s_mov_b32 s1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: widen_i16_constant_load: @@ -43,18 +43,18 @@ ; SI-LABEL: widen_i16_constant_load_zext_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s5, s4 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_load_dword s1, s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s0, s0, 0xffff -; SI-NEXT: s_addk_i32 s0, 0x3e7 -; SI-NEXT: s_or_b32 s0, s0, 4 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_and_b32 s1, s1, 0xffff +; SI-NEXT: s_addk_i32 s1, 0x3e7 +; SI-NEXT: s_or_b32 s4, s1, 4 +; SI-NEXT: s_mov_b32 s1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: widen_i16_constant_load_zext_i32: @@ -83,18 +83,18 @@ ; SI-LABEL: widen_i16_constant_load_sext_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s5, s4 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_load_dword s1, s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sext_i32_i16 s0, s0 -; SI-NEXT: s_addk_i32 s0, 0x3e7 -; SI-NEXT: s_or_b32 s0, s0, 4 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_sext_i32_i16 s1, s1 +; SI-NEXT: s_addk_i32 s1, 0x3e7 +; SI-NEXT: s_or_b32 s4, s1, 4 +; SI-NEXT: s_mov_b32 s1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: widen_i16_constant_load_sext_i32: @@ -122,13 +122,13 @@ define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) { ; SI-LABEL: widen_i17_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s1, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s7, s[8:9], 0x0 +; SI-NEXT: s_load_dword s7, s[6:7], 0x0 ; SI-NEXT: s_mov_b32 s4, 2 ; SI-NEXT: s_mov_b32 s5, s0 ; SI-NEXT: s_mov_b32 s6, s2 @@ -206,23 +206,23 @@ define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg) { ; SI-LABEL: widen_v2i8_constant_load: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s5, s4 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_load_dword s1, s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s1, s0, 0xff00 -; SI-NEXT: s_add_i32 s0, s0, 12 -; SI-NEXT: s_or_b32 s0, s0, 4 -; SI-NEXT: s_and_b32 s0, s0, 0xff -; SI-NEXT: s_or_b32 s0, s1, s0 -; SI-NEXT: s_addk_i32 s0, 0x2c00 -; SI-NEXT: s_or_b32 s0, s0, 0x300 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_and_b32 s4, s1, 0xff00 +; SI-NEXT: s_add_i32 s1, s1, 12 +; SI-NEXT: s_or_b32 s1, s1, 4 +; SI-NEXT: s_and_b32 s1, s1, 0xff +; SI-NEXT: s_or_b32 s1, s4, s1 +; SI-NEXT: s_addk_i32 s1, 0x2c00 +; SI-NEXT: s_or_b32 s4, s1, 0x300 +; SI-NEXT: s_mov_b32 s1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: widen_v2i8_constant_load: @@ -302,16 +302,16 @@ ; SI-LABEL: widen_i1_constant_load: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s5, s4 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_load_dword s1, s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s0, s0, 1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_and_b32 s4, s1, 1 +; SI-NEXT: s_mov_b32 s1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: widen_i1_constant_load: @@ -336,18 +336,18 @@ ; SI-LABEL: widen_i16_zextload_i64_constant_load: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s5, s4 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_load_dword s1, s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s0, s0, 0xffff -; SI-NEXT: s_addk_i32 s0, 0x3e7 -; SI-NEXT: s_or_b32 s0, s0, 4 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_and_b32 s1, s1, 0xffff +; SI-NEXT: s_addk_i32 s1, 0x3e7 +; SI-NEXT: s_or_b32 s4, s1, 4 +; SI-NEXT: s_mov_b32 s1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: widen_i16_zextload_i64_constant_load: @@ -376,19 +376,19 @@ ; SI-LABEL: widen_i1_zext_to_i64_constant_load: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s5, s4 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_load_dword s1, s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s0, s0, 1 -; SI-NEXT: s_add_u32 s0, s0, 0x3e7 -; SI-NEXT: s_addc_u32 s1, 0, 0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_and_b32 s1, s1, 1 +; SI-NEXT: s_add_u32 s4, s1, 0x3e7 +; SI-NEXT: s_addc_u32 s5, 0, 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_mov_b32 s1, s0 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: widen_i1_zext_to_i64_constant_load: @@ -455,17 +455,17 @@ ; SI-LABEL: widen_i16_global_invariant_load: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s4, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s5, s4 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_load_dword s1, s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_addk_i32 s0, 0x3e7 -; SI-NEXT: s_or_b32 s0, s0, 1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_addk_i32 s1, 0x3e7 +; SI-NEXT: s_or_b32 s4, s1, 1 +; SI-NEXT: s_mov_b32 s1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: widen_i16_global_invariant_load: diff --git a/llvm/test/CodeGen/PowerPC/PR33671.ll b/llvm/test/CodeGen/PowerPC/PR33671.ll --- a/llvm/test/CodeGen/PowerPC/PR33671.ll +++ b/llvm/test/CodeGen/PowerPC/PR33671.ll @@ -26,7 +26,7 @@ ret void ; CHECK-LABEL: test2 ; CHECK: addi 3, 3, 8 -; CHECK: lxvx [[LD:[0-9]+]], 0, 3 ; CHECK: addi [[REG:[0-9]+]], 4, 4 +; CHECK: lxvx [[LD:[0-9]+]], 0, 3 ; CHECK: stxvx [[LD]], 0, [[REG]] } diff --git a/llvm/test/CodeGen/PowerPC/botheightreduce.mir b/llvm/test/CodeGen/PowerPC/botheightreduce.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/botheightreduce.mir @@ -0,0 +1,92 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -run-pass=machine-scheduler -o - %s | FileCheck %s +--- +# Check that machine-scheduler's BotHeightReduce heuristic puts the LD 8 in +# between the final run of MULLDs and the LDXs that feed them, to try to hide +# the latency of the LDXs. +name: test +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: test + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $x3, $x4 + ; CHECK: [[COPY:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY $x4 + ; CHECK: [[COPY1:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY $x3 + ; CHECK: [[ADDI8_:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY1]], 1 + ; CHECK: [[CMPLDI:%[0-9]+]]:crrc = CMPLDI [[COPY]], 1 + ; CHECK: [[LI8_:%[0-9]+]]:g8rc_and_g8rc_nox0 = LI8 1 + ; CHECK: [[ISEL8_:%[0-9]+]]:g8rc = ISEL8 [[COPY]], [[LI8_]], [[CMPLDI]].sub_gt + ; CHECK: MTCTR8loop [[ISEL8_]], implicit-def dead $ctr8 + ; CHECK: [[LI8_1:%[0-9]+]]:g8rc = LI8 0 + ; CHECK: [[LI8_2:%[0-9]+]]:g8rc = LI8 2 + ; CHECK: [[LI8_3:%[0-9]+]]:g8rc = LI8 3 + ; CHECK: [[LI8_4:%[0-9]+]]:g8rc = LI8 5 + ; CHECK: [[LI8_5:%[0-9]+]]:g8rc = LI8 6 + ; CHECK: [[LI8_6:%[0-9]+]]:g8rc = LI8 7 + ; CHECK: bb.1: + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: [[ADDI8_1:%[0-9]+]]:g8rc = ADDI8 [[ADDI8_]], 1 + ; CHECK: [[LD:%[0-9]+]]:g8rc = LD 0, [[ADDI8_]] :: (load 8) + ; CHECK: [[LDX:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_]] :: (load 8) + ; CHECK: [[LDX1:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_3]] :: (load 8) + ; CHECK: [[LD1:%[0-9]+]]:g8rc = LD 4, [[ADDI8_]] :: (load 8) + ; CHECK: [[LDX2:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_4]] :: (load 8) + ; CHECK: [[LDX3:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_5]] :: (load 8) + ; CHECK: [[LDX4:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_6]] :: (load 8) + ; CHECK: [[LDX5:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_2]] :: (load 8) + ; CHECK: [[MULLD:%[0-9]+]]:g8rc = MULLD [[LDX]], [[LD]] + ; CHECK: [[LD2:%[0-9]+]]:g8rc = LD 8, [[ADDI8_]] :: (load 8) + ; CHECK: [[MULLD1:%[0-9]+]]:g8rc = MULLD [[MULLD]], [[LDX5]] + ; CHECK: [[MULLD2:%[0-9]+]]:g8rc = MULLD [[MULLD1]], [[LDX1]] + ; CHECK: [[MULLD3:%[0-9]+]]:g8rc = MULLD [[MULLD2]], [[LD1]] + ; CHECK: [[MULLD4:%[0-9]+]]:g8rc = MULLD [[MULLD3]], [[LDX2]] + ; CHECK: [[MULLD5:%[0-9]+]]:g8rc = MULLD [[MULLD4]], [[LDX3]] + ; CHECK: [[MULLD6:%[0-9]+]]:g8rc = MULLD [[MULLD5]], [[LDX4]] + ; CHECK: [[MADDLD8_:%[0-9]+]]:g8rc = MADDLD8 [[MULLD6]], [[LD2]], [[MADDLD8_]] + ; CHECK: [[COPY2:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY [[ADDI8_1]] + ; CHECK: BDNZ8 %bb.1, implicit-def dead $ctr8, implicit $ctr8 + ; CHECK: B %bb.2 + ; CHECK: bb.2: + bb.0: + liveins: $x3, $x4 + + %0:g8rc_and_g8rc_nox0 = COPY $x4 + %1:g8rc_and_g8rc_nox0 = COPY $x3 + %2:g8rc_and_g8rc_nox0 = ADDI8 %1, 1 + %3:crrc = CMPLDI %0, 1 + %4:g8rc_and_g8rc_nox0 = LI8 1 + %5:g8rc = ISEL8 %0, %4, %3.sub_gt + MTCTR8loop %5, implicit-def dead $ctr8 + %6:g8rc = LI8 0 + %7:g8rc = LI8 2 + %8:g8rc = LI8 3 + %9:g8rc = LI8 5 + %10:g8rc = LI8 6 + %11:g8rc = LI8 7 + + bb.1: + %12:g8rc = ADDI8 %2, 1 + %13:g8rc = LD 0, %2 :: (load 8) + %14:g8rc = LDX %2, %4 :: (load 8) + %16:g8rc = LDX %2, %8 :: (load 8) + %17:g8rc = LD 4, %2 :: (load 8) + %18:g8rc = LDX %2, %9 :: (load 8) + %19:g8rc = LDX %2, %10 :: (load 8) + %20:g8rc = LDX %2, %11 :: (load 8) + %21:g8rc = LD 8, %2 :: (load 8) + %22:g8rc = MULLD %14, %13 + %15:g8rc = LDX %2, %7 :: (load 8) + %23:g8rc = MULLD %22, %15 + %24:g8rc = MULLD %23, %16 + %25:g8rc = MULLD %24, %17 + %26:g8rc = MULLD %25, %18 + %27:g8rc = MULLD %26, %19 + %28:g8rc = MULLD %27, %20 + %6:g8rc = MADDLD8 %28, %21, %6 + %2:g8rc_and_g8rc_nox0 = COPY %12 + BDNZ8 %bb.1, implicit-def dead $ctr8, implicit $ctr8 + B %bb.2 + + bb.2: +... diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll --- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll +++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll @@ -417,11 +417,11 @@ ; CHECK-P9-LABEL: no_RAUW_in_combine_during_legalize: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: sldi r4, r4, 2 +; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: lxsiwzx v2, r3, r4 ; CHECK-P9-NEXT: addis r3, r2, .LCPI16_0@toc@ha ; CHECK-P9-NEXT: addi r3, r3, .LCPI16_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r3 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: vperm v2, v4, v2, v3 ; CHECK-P9-NEXT: blr ; diff --git a/llvm/test/CodeGen/PowerPC/dform-adjust.ll b/llvm/test/CodeGen/PowerPC/dform-adjust.ll --- a/llvm/test/CodeGen/PowerPC/dform-adjust.ll +++ b/llvm/test/CodeGen/PowerPC/dform-adjust.ll @@ -5,18 +5,18 @@ ; CHECK-LABEL: test1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li 5, -13 -; CHECK-NEXT: lxvx 0, 3, 5 -; CHECK-NEXT: li 5, 19 -; CHECK-NEXT: lxvx 1, 3, 5 -; CHECK-NEXT: li 5, 3 ; CHECK-NEXT: li 6, 7 ; CHECK-NEXT: li 7, 11 ; CHECK-NEXT: li 8, 15 -; CHECK-NEXT: mfvsrld 9, 0 -; CHECK-NEXT: ldx 5, 3, 5 +; CHECK-NEXT: lxvx 0, 3, 5 +; CHECK-NEXT: li 5, 19 ; CHECK-NEXT: ldx 6, 3, 6 ; CHECK-NEXT: ldx 7, 3, 7 +; CHECK-NEXT: lxvx 1, 3, 5 +; CHECK-NEXT: li 5, 3 +; CHECK-NEXT: ldx 5, 3, 5 ; CHECK-NEXT: ldx 3, 3, 8 +; CHECK-NEXT: mfvsrld 9, 0 ; CHECK-NEXT: mffprd 8, 0 ; CHECK-NEXT: mfvsrld 10, 1 ; CHECK-NEXT: mffprd 11, 1 diff --git a/llvm/test/CodeGen/PowerPC/extract-and-store.ll b/llvm/test/CodeGen/PowerPC/extract-and-store.ll --- a/llvm/test/CodeGen/PowerPC/extract-and-store.ll +++ b/llvm/test/CodeGen/PowerPC/extract-and-store.ll @@ -508,9 +508,9 @@ ; CHECK-P9-BE-LABEL: test_consecutive_i32: ; CHECK-P9-BE: # %bb.0: # %entry ; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 3 +; CHECK-P9-BE-NEXT: li r3, 4 ; CHECK-P9-BE-NEXT: stfiwx f0, 0, r5 ; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1 -; CHECK-P9-BE-NEXT: li r3, 4 ; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3 ; CHECK-P9-BE-NEXT: blr entry: @@ -544,9 +544,9 @@ ; CHECK-P9-LABEL: test_consecutive_float: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 1 +; CHECK-P9-NEXT: li r3, 4 ; CHECK-P9-NEXT: stfiwx f0, 0, r5 ; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 3 -; CHECK-P9-NEXT: li r3, 4 ; CHECK-P9-NEXT: stfiwx f0, r5, r3 ; CHECK-P9-NEXT: blr ; @@ -597,9 +597,9 @@ ; CHECK-P9-LABEL: test_stores_exceed_vec_size: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: addis r3, r2, .LCPI16_0@toc@ha +; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 1 ; CHECK-P9-NEXT: addi r3, r3, .LCPI16_0@toc@l ; CHECK-P9-NEXT: lxvx vs35, 0, r3 -; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 1 ; CHECK-P9-NEXT: li r3, 16 ; CHECK-P9-NEXT: stfiwx f0, r5, r3 ; CHECK-P9-NEXT: li r3, 20 @@ -611,10 +611,10 @@ ; CHECK-P9-BE-LABEL: test_stores_exceed_vec_size: ; CHECK-P9-BE: # %bb.0: # %entry ; CHECK-P9-BE-NEXT: xxspltw vs0, vs34, 0 -; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs0, 2 ; CHECK-P9-BE-NEXT: li r3, 16 ; CHECK-P9-BE-NEXT: stxsiwx vs34, r5, r3 ; CHECK-P9-BE-NEXT: li r3, 20 +; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs0, 2 ; CHECK-P9-BE-NEXT: stxv vs0, 0(r5) ; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1 ; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3 @@ -676,9 +676,9 @@ ; CHECK-P9-LABEL: test_5_consecutive_stores_of_bytes: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: vsldoi v3, v2, v2, 4 +; CHECK-P9-NEXT: li r3, 1 ; CHECK-P9-NEXT: stxsibx vs35, 0, r5 ; CHECK-P9-NEXT: vsldoi v3, v2, v2, 12 -; CHECK-P9-NEXT: li r3, 1 ; CHECK-P9-NEXT: stxsibx vs35, r5, r3 ; CHECK-P9-NEXT: vsldoi v3, v2, v2, 15 ; CHECK-P9-NEXT: li r3, 2 @@ -694,9 +694,9 @@ ; CHECK-P9-BE-LABEL: test_5_consecutive_stores_of_bytes: ; CHECK-P9-BE: # %bb.0: # %entry ; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 13 +; CHECK-P9-BE-NEXT: li r3, 1 ; CHECK-P9-BE-NEXT: stxsibx vs35, 0, r5 ; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 5 -; CHECK-P9-BE-NEXT: li r3, 1 ; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3 ; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 2 ; CHECK-P9-BE-NEXT: li r3, 2 @@ -807,9 +807,9 @@ ; CHECK-P9-NEXT: li r3, 4 ; CHECK-P9-NEXT: stxsibx vs35, r5, r3 ; CHECK-P9-NEXT: vsldoi v3, v2, v2, 4 +; CHECK-P9-NEXT: li r3, 5 ; CHECK-P9-NEXT: stxsibx vs35, 0, r5 ; CHECK-P9-NEXT: vsldoi v3, v2, v2, 8 -; CHECK-P9-NEXT: li r3, 5 ; CHECK-P9-NEXT: stxsibx vs35, r5, r3 ; CHECK-P9-NEXT: vsldoi v3, v2, v2, 13 ; CHECK-P9-NEXT: li r3, 6 @@ -848,9 +848,9 @@ ; CHECK-P9-BE-NEXT: li r3, 4 ; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3 ; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 13 +; CHECK-P9-BE-NEXT: li r3, 5 ; CHECK-P9-BE-NEXT: stxsibx vs35, 0, r5 ; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 9 -; CHECK-P9-BE-NEXT: li r3, 5 ; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3 ; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 4 ; CHECK-P9-BE-NEXT: li r3, 6 @@ -947,8 +947,8 @@ ; CHECK-P9-BE: # %bb.0: # %entry ; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 3 ; CHECK-P9-BE-NEXT: li r3, 4 -; CHECK-P9-BE-NEXT: stfiwx f0, r7, r3 ; CHECK-P9-BE-NEXT: stxsiwx vs35, 0, r7 +; CHECK-P9-BE-NEXT: stfiwx f0, r7, r3 ; CHECK-P9-BE-NEXT: blr entry: %vecext = extractelement <4 x i32> %a, i32 0 @@ -996,9 +996,9 @@ ; CHECK-P9-BE-LABEL: test_elements_from_three_vec: ; CHECK-P9-BE: # %bb.0: # %entry ; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 2 +; CHECK-P9-BE-NEXT: li r3, 4 ; CHECK-P9-BE-NEXT: stfiwx f0, 0, r9 ; CHECK-P9-BE-NEXT: xxsldwi vs0, vs35, vs35, 1 -; CHECK-P9-BE-NEXT: li r3, 4 ; CHECK-P9-BE-NEXT: stfiwx f0, r9, r3 ; CHECK-P9-BE-NEXT: li r3, 8 ; CHECK-P9-BE-NEXT: stxsiwx vs36, r9, r3 diff --git a/llvm/test/CodeGen/PowerPC/f128-aggregates.ll b/llvm/test/CodeGen/PowerPC/f128-aggregates.ll --- a/llvm/test/CodeGen/PowerPC/f128-aggregates.ll +++ b/llvm/test/CodeGen/PowerPC/f128-aggregates.ll @@ -228,8 +228,8 @@ ; CHECK-LABEL: testMixedAggregate_03: ; CHECK: # %bb.0: # %entry ; CHECK: mtvsrwa v2, r3 -; CHECK: xscvsdqp v2, v2 -; CHECK: mtvsrdd v3, r6, r5 +; CHECK-DAG: xscvsdqp v2, v2 +; CHECK-DAG: mtvsrdd v3, r6, r5 ; CHECK: xsaddqp v2, v3, v2 ; CHECK: mtvsrd v[[REG1:[0-9]+]], r10 ; CHECK: xscvsdqp v[[REG:[0-9]+]], v[[REG1]] @@ -350,12 +350,12 @@ ; CHECK-NEXT: bltlr cr0 ; CHECK-NEXT: # %bb.1: # %if.end ; CHECK-NEXT: addi r3, r1, 40 +; CHECK-NEXT: addi [[REG2:r[0-9]+]], r1, 72 ; CHECK-NEXT: lxvx v3, 0, r3 +; CHECK-NEXT: std [[REG2]], -8(r1) ; CHECK-NEXT: xsaddqp v2, v3, v2 ; CHECK-NEXT: lxv v3, 16(r3) ; CHECK-NEXT: xsaddqp v2, v2, v3 -; CHECK-NEXT: addi [[REG2:r[0-9]+]], r1, 72 -; CHECK-NEXT: std [[REG2]], -8(r1) ; CHECK-NEXT: blr entry: %ap = alloca i8*, align 8 diff --git a/llvm/test/CodeGen/PowerPC/f128-conv.ll b/llvm/test/CodeGen/PowerPC/f128-conv.ll --- a/llvm/test/CodeGen/PowerPC/f128-conv.ll +++ b/llvm/test/CodeGen/PowerPC/f128-conv.ll @@ -444,10 +444,10 @@ ; CHECK-LABEL: qpConv2dp_03: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addis r5, r2, .LC7@toc@ha +; CHECK-NEXT: sldi r4, r4, 3 ; CHECK-NEXT: ld r5, .LC7@toc@l(r5) ; CHECK-NEXT: lxvx v2, 0, r5 ; CHECK-NEXT: xscvqpdp v2, v2 -; CHECK-NEXT: sldi r4, r4, 3 ; CHECK-NEXT: stxsdx v2, r3, r4 ; CHECK-NEXT: blr entry: @@ -517,11 +517,11 @@ ; CHECK-LABEL: qpConv2sp_03: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addis r5, r2, .LC7@toc@ha +; CHECK-NEXT: sldi r4, r4, 2 ; CHECK-NEXT: ld r5, .LC7@toc@l(r5) ; CHECK-NEXT: lxv v2, 48(r5) ; CHECK-NEXT: xscvqpdpo v2, v2 ; CHECK-NEXT: xsrsp f0, v2 -; CHECK-NEXT: sldi r4, r4, 2 ; CHECK-NEXT: stfsx f0, r3, r4 ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/f128-passByValue.ll b/llvm/test/CodeGen/PowerPC/f128-passByValue.ll --- a/llvm/test/CodeGen/PowerPC/f128-passByValue.ll +++ b/llvm/test/CodeGen/PowerPC/f128-passByValue.ll @@ -153,13 +153,13 @@ ; CHECK: # %bb.0: # %entry ; CHECK: lwz r3, 96(r1) ; CHECK: add r4, r7, r9 +; CHECK: xscpsgndp v[[REG0:[0-9]+]], f1, f1 ; CHECK: add r4, r4, r10 +; CHECK: xscvdpqp v[[REG0]], v[[REG0]] ; CHECK: add r3, r4, r3 ; CHECK: clrldi r3, r3, 32 ; CHECK: std r3, 0(r6) ; CHECK: lxv v[[REG1:[0-9]+]], 0(r8) -; CHECK: xscpsgndp v[[REG0:[0-9]+]], f1, f1 -; CHECK: xscvdpqp v[[REG0]], v[[REG0]] ; CHECK: xsaddqp v2, v[[REG1]], v2 ; CHECK: xsaddqp v2, v2, v3 ; CHECK-NEXT: blr @@ -185,13 +185,13 @@ ; CHECK-LABEL: mixParam_02f: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: add r4, r4, r6 +; CHECK-NEXT: xscpsgndp v[[REG0:[0-9]+]], f1, f1 ; CHECK-NEXT: add r4, r4, r7 +; CHECK-NEXT: xscvdpqp v[[REG0]], v[[REG0]] ; CHECK-NEXT: add r4, r4, r8 ; CHECK-NEXT: clrldi r4, r4, 32 ; CHECK-DAG: std r4, 0(r3) ; CHECK-DAG: lxv v[[REG1:[0-9]+]], 0(r5) -; CHECK-NEXT: xscpsgndp v[[REG0:[0-9]+]], f1, f1 -; CHECK-NEXT: xscvdpqp v[[REG0]], v[[REG0]] ; CHECK-NEXT: xsaddqp v2, v[[REG1]], v2 ; CHECK-NEXT: xsaddqp v2, v2, v[[REG0]] ; CHECK-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/float-load-store-pair.ll b/llvm/test/CodeGen/PowerPC/float-load-store-pair.ll --- a/llvm/test/CodeGen/PowerPC/float-load-store-pair.ll +++ b/llvm/test/CodeGen/PowerPC/float-load-store-pair.ll @@ -32,10 +32,19 @@ ; CHECK-NEXT: std 0, 16(1) ; CHECK-NEXT: stdu 1, -192(1) ; CHECK-NEXT: addis 3, 2, a1@toc@ha +; CHECK-NEXT: addis 5, 2, a16@toc@ha +; CHECK-NEXT: addis 6, 2, a17@toc@ha +; CHECK-NEXT: addis 4, 2, a15@toc@ha ; CHECK-NEXT: lfd 1, a1@toc@l(3) ; CHECK-NEXT: addis 3, 2, a2@toc@ha +; CHECK-NEXT: addi 5, 5, a16@toc@l +; CHECK-NEXT: addi 6, 6, a17@toc@l +; CHECK-NEXT: ld 4, a15@toc@l(4) ; CHECK-NEXT: lfd 2, a2@toc@l(3) ; CHECK-NEXT: addis 3, 2, a3@toc@ha +; CHECK-NEXT: lxvx 34, 0, 6 +; CHECK-NEXT: lxvx 0, 0, 5 +; CHECK-NEXT: li 5, 152 ; CHECK-NEXT: lfd 3, a3@toc@l(3) ; CHECK-NEXT: addis 3, 2, a4@toc@ha ; CHECK-NEXT: lfd 4, a4@toc@l(3) @@ -54,17 +63,8 @@ ; CHECK-NEXT: addis 3, 2, a11@toc@ha ; CHECK-NEXT: lfd 11, a11@toc@l(3) ; CHECK-NEXT: addis 3, 2, a12@toc@ha -; CHECK-NEXT: addis 5, 2, a16@toc@ha -; CHECK-NEXT: addis 6, 2, a17@toc@ha -; CHECK-NEXT: addi 6, 6, a17@toc@l -; CHECK-NEXT: lxvx 34, 0, 6 ; CHECK-NEXT: lfd 12, a12@toc@l(3) ; CHECK-NEXT: addis 3, 2, a13@toc@ha -; CHECK-NEXT: addi 5, 5, a16@toc@l -; CHECK-NEXT: addis 4, 2, a15@toc@ha -; CHECK-NEXT: lxvx 0, 0, 5 -; CHECK-NEXT: ld 4, a15@toc@l(4) -; CHECK-NEXT: li 5, 152 ; CHECK-NEXT: lfd 13, a13@toc@l(3) ; CHECK-NEXT: addis 3, 2, a14@toc@ha ; CHECK-NEXT: ld 3, a14@toc@l(3) diff --git a/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll b/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll --- a/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll +++ b/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll @@ -697,10 +697,10 @@ ; CHECK-NEXT: lhz r3, 0(r3) ; CHECK-NEXT: xxmrghd vs0, vs0, vs1 ; CHECK-NEXT: mtfprwz f3, r3 +; CHECK-NEXT: xvcvdpsp vs35, vs0 ; CHECK-NEXT: xscvhpdp f3, f3 ; CHECK-NEXT: xxmrghd vs2, vs2, vs3 ; CHECK-NEXT: xvcvdpsp vs34, vs2 -; CHECK-NEXT: xvcvdpsp vs35, vs0 ; CHECK-NEXT: vmrgew v2, v3, v2 ; CHECK-NEXT: blr ; @@ -906,12 +906,12 @@ ; CHECK-LABEL: test_trunc32_vec4: ; CHECK: # %bb.0: ; CHECK-NEXT: xxsldwi vs0, vs34, vs34, 3 +; CHECK-NEXT: xxsldwi vs1, vs34, vs34, 1 ; CHECK-NEXT: xscvspdpn f0, vs0 +; CHECK-NEXT: xscvspdpn f1, vs1 ; CHECK-NEXT: xscvdphp f0, f0 ; CHECK-NEXT: mffprwz r3, f0 ; CHECK-NEXT: xxswapd vs0, vs34 -; CHECK-NEXT: xxsldwi vs1, vs34, vs34, 1 -; CHECK-NEXT: xscvspdpn f1, vs1 ; CHECK-NEXT: xscvspdpn f0, vs0 ; CHECK-NEXT: xscvdphp f0, f0 ; CHECK-NEXT: xscvdphp f1, f1 @@ -920,8 +920,8 @@ ; CHECK-NEXT: xscvdphp f1, f1 ; CHECK-NEXT: sth r4, 4(r5) ; CHECK-NEXT: mffprwz r4, f0 -; CHECK-NEXT: sth r4, 2(r5) ; CHECK-NEXT: sth r3, 0(r5) +; CHECK-NEXT: sth r4, 2(r5) ; CHECK-NEXT: mffprwz r6, f1 ; CHECK-NEXT: sth r6, 6(r5) ; CHECK-NEXT: blr @@ -1059,10 +1059,10 @@ ; CHECK-NEXT: xscvdphp f1, vs34 ; CHECK-NEXT: mffprwz r4, f1 ; CHECK-NEXT: xscvdphp f1, vs35 +; CHECK-NEXT: sth r3, 0(r7) ; CHECK-NEXT: sth r4, 2(r7) ; CHECK-NEXT: mffprwz r4, f0 ; CHECK-NEXT: sth r4, 4(r7) -; CHECK-NEXT: sth r3, 0(r7) ; CHECK-NEXT: mffprwz r5, f1 ; CHECK-NEXT: sth r5, 6(r7) ; CHECK-NEXT: blr @@ -1169,8 +1169,8 @@ ; CHECK-LABEL: test_sitofp_fadd_i32: ; CHECK: # %bb.0: ; CHECK-NEXT: mtfprwa f1, r3 -; CHECK-NEXT: xscvsxdsp f1, f1 ; CHECK-NEXT: lhz r4, 0(r4) +; CHECK-NEXT: xscvsxdsp f1, f1 ; CHECK-NEXT: mtfprwz f0, r4 ; CHECK-NEXT: xscvhpdp f0, f0 ; CHECK-NEXT: xscvdphp f1, f1 diff --git a/llvm/test/CodeGen/PowerPC/load-shuffle-and-shuffle-store.ll b/llvm/test/CodeGen/PowerPC/load-shuffle-and-shuffle-store.ll --- a/llvm/test/CodeGen/PowerPC/load-shuffle-and-shuffle-store.ll +++ b/llvm/test/CodeGen/PowerPC/load-shuffle-and-shuffle-store.ll @@ -132,8 +132,8 @@ ; CHECK-P9-BE-LABEL: load_swap11: ; CHECK-P9-BE: # %bb.0: ; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha -; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI3_0@toc@l ; CHECK-P9-BE-NEXT: lxv v2, 0(r4) +; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI3_0@toc@l ; CHECK-P9-BE-NEXT: lxvx v3, 0, r3 ; CHECK-P9-BE-NEXT: vperm v2, v2, v2, v3 ; CHECK-P9-BE-NEXT: blr @@ -208,8 +208,8 @@ ; CHECK-P9-BE-LABEL: load_swap21: ; CHECK-P9-BE: # %bb.0: ; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI5_0@toc@ha -; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI5_0@toc@l ; CHECK-P9-BE-NEXT: lxv v2, 0(r4) +; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI5_0@toc@l ; CHECK-P9-BE-NEXT: lxvx v3, 0, r3 ; CHECK-P9-BE-NEXT: vperm v2, v2, v2, v3 ; CHECK-P9-BE-NEXT: blr @@ -382,8 +382,8 @@ ; CHECK-P9-BE-LABEL: load_swap51: ; CHECK-P9-BE: # %bb.0: ; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI10_0@toc@ha -; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI10_0@toc@l ; CHECK-P9-BE-NEXT: lxv v2, 0(r4) +; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI10_0@toc@l ; CHECK-P9-BE-NEXT: lxvx v3, 0, r3 ; CHECK-P9-BE-NEXT: vperm v2, v2, v2, v3 ; CHECK-P9-BE-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll --- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll @@ -23,11 +23,11 @@ ; CHECK: .LBB0_2: # ; CHECK-NEXT: ldx r9, r3, r6 ; CHECK-NEXT: ldx r10, r3, r7 -; CHECK-NEXT: mulld r9, r10, r9 ; CHECK-NEXT: ldx r11, r3, r8 -; CHECK-NEXT: mulld r9, r9, r11 ; CHECK-NEXT: ld r12, 0(r3) ; CHECK-NEXT: addi r3, r3, 1 +; CHECK-NEXT: mulld r9, r10, r9 +; CHECK-NEXT: mulld r9, r9, r11 ; CHECK-NEXT: maddld r5, r9, r12, r5 ; CHECK-NEXT: bdnz .LBB0_2 %3 = sext i32 %1 to i64 @@ -87,11 +87,11 @@ ; CHECK: .LBB1_2: # ; CHECK-NEXT: ldx r9, r6, r7 ; CHECK-NEXT: ld r10, 0(r6) -; CHECK-NEXT: mulld r9, r10, r9 ; CHECK-NEXT: ldx r11, r6, r5 -; CHECK-NEXT: mulld r9, r9, r11 ; CHECK-NEXT: addi r8, r6, 1 ; CHECK-NEXT: ld r6, 4(r6) +; CHECK-NEXT: mulld r9, r10, r9 +; CHECK-NEXT: mulld r9, r9, r11 ; CHECK-NEXT: maddld r3, r9, r6, r3 ; CHECK-NEXT: mr r6, r8 ; CHECK-NEXT: bdnz .LBB1_2 @@ -162,22 +162,22 @@ ; CHECK: .LBB2_2: # ; CHECK-NEXT: ldx r12, r9, r6 ; CHECK-NEXT: ld r0, 0(r9) -; CHECK-NEXT: mulld r12, r0, r12 +; CHECK-NEXT: ldx r30, r9, r5 +; CHECK-NEXT: ldx r29, r9, r7 ; CHECK-NEXT: addi r11, r9, 1 -; CHECK-NEXT: ldx r30, r9, r7 -; CHECK-NEXT: ld r29, 4(r9) -; CHECK-NEXT: ldx r28, r9, r8 -; CHECK-NEXT: ld r27, 12(r9) -; CHECK-NEXT: ld r26, 8(r9) -; CHECK-NEXT: ldx r25, r9, r10 -; CHECK-NEXT: ldx r9, r9, r5 -; CHECK-NEXT: mulld r9, r12, r9 -; CHECK-NEXT: mulld r9, r9, r30 -; CHECK-NEXT: mulld r9, r9, r29 -; CHECK-NEXT: mulld r9, r9, r28 -; CHECK-NEXT: mulld r9, r9, r27 -; CHECK-NEXT: mulld r9, r9, r26 -; CHECK-NEXT: maddld r3, r9, r25, r3 +; CHECK-NEXT: mulld r12, r0, r12 +; CHECK-NEXT: ld r28, 4(r9) +; CHECK-NEXT: ldx r27, r9, r8 +; CHECK-NEXT: ld r26, 12(r9) +; CHECK-NEXT: ld r25, 8(r9) +; CHECK-NEXT: ldx r9, r9, r10 +; CHECK-NEXT: mulld r12, r12, r30 +; CHECK-NEXT: mulld r12, r12, r29 +; CHECK-NEXT: mulld r12, r12, r28 +; CHECK-NEXT: mulld r12, r12, r27 +; CHECK-NEXT: mulld r12, r12, r26 +; CHECK-NEXT: mulld r12, r12, r25 +; CHECK-NEXT: maddld r3, r12, r9, r3 ; CHECK-NEXT: mr r9, r11 ; CHECK-NEXT: bdnz .LBB2_2 %3 = sext i32 %1 to i64 @@ -257,10 +257,10 @@ ; CHECK: .LBB3_2: # ; CHECK-NEXT: ldu r8, 4(r3) ; CHECK-NEXT: ldx r9, r3, r7 -; CHECK-NEXT: mulld r8, r8, r9 ; CHECK-NEXT: ldx r10, r3, r6 -; CHECK-NEXT: mulld r8, r8, r10 ; CHECK-NEXT: ld r11, 4(r3) +; CHECK-NEXT: mulld r8, r8, r9 +; CHECK-NEXT: mulld r8, r8, r10 ; CHECK-NEXT: maddld r5, r8, r11, r5 ; CHECK-NEXT: bdnz .LBB3_2 %3 = sext i32 %1 to i64 @@ -391,21 +391,21 @@ ; CHECK: .LBB5_2: # ; CHECK-NEXT: ld r8, 0(r3) ; CHECK-NEXT: ldx r9, r3, r7 -; CHECK-NEXT: mulld r8, r9, r8 -; CHECK-NEXT: ld r9, 4(r3) -; CHECK-NEXT: mulld r8, r8, r9 -; CHECK-NEXT: ld r10, 8(r3) +; CHECK-NEXT: ld r10, 4(r3) +; CHECK-NEXT: ld r11, 8(r3) ; CHECK-NEXT: addi r3, r3, 1 +; CHECK-NEXT: mulld r8, r9, r8 +; CHECK-NEXT: ld r12, 0(r4) +; CHECK-NEXT: ldx r0, r4, r7 +; CHECK-NEXT: ld r30, 4(r4) +; CHECK-NEXT: ld r9, 8(r4) +; CHECK-NEXT: addi r4, r4, 1 ; CHECK-NEXT: mulld r8, r8, r10 -; CHECK-NEXT: ld r11, 0(r4) ; CHECK-NEXT: mulld r8, r8, r11 -; CHECK-NEXT: ldx r12, r4, r7 ; CHECK-NEXT: mulld r8, r8, r12 -; CHECK-NEXT: ld r0, 4(r4) ; CHECK-NEXT: mulld r8, r8, r0 -; CHECK-NEXT: ld r30, 8(r4) -; CHECK-NEXT: addi r4, r4, 1 -; CHECK-NEXT: maddld r6, r8, r30, r6 +; CHECK-NEXT: mulld r8, r8, r30 +; CHECK-NEXT: maddld r6, r8, r9, r6 ; CHECK-NEXT: bdnz .LBB5_2 %4 = sext i32 %2 to i64 %5 = icmp eq i32 %2, 0 @@ -710,10 +710,10 @@ ; CHECK-NEXT: lfsx f0, r3, r4 ; CHECK-NEXT: xscvuxdsp f4, f4 ; CHECK-NEXT: lfs f2, 20(r3) -; CHECK-NEXT: xsmulsp f0, f0, f4 -; CHECK-NEXT: xsmulsp f0, f2, f0 ; CHECK-NEXT: lfs f3, 60(r3) ; CHECK-NEXT: addi r3, r3, 1 +; CHECK-NEXT: xsmulsp f0, f0, f4 +; CHECK-NEXT: xsmulsp f0, f2, f0 ; CHECK-NEXT: xsmulsp f0, f3, f0 ; CHECK-NEXT: xsaddsp f1, f1, f0 ; CHECK-NEXT: bdnz .LBB8_2 diff --git a/llvm/test/CodeGen/PowerPC/machine-pre.ll b/llvm/test/CodeGen/PowerPC/machine-pre.ll --- a/llvm/test/CodeGen/PowerPC/machine-pre.ll +++ b/llvm/test/CodeGen/PowerPC/machine-pre.ll @@ -109,10 +109,10 @@ ; CHECK-P9-NEXT: b .LBB1_2 ; CHECK-P9-NEXT: .LBB1_7: # %while.end ; CHECK-P9-NEXT: lis r3, -13108 -; CHECK-P9-NEXT: ori r3, r3, 52429 -; CHECK-P9-NEXT: mullw r3, r28, r3 ; CHECK-P9-NEXT: lis r4, 13107 +; CHECK-P9-NEXT: ori r3, r3, 52429 ; CHECK-P9-NEXT: ori r4, r4, 13108 +; CHECK-P9-NEXT: mullw r3, r28, r3 ; CHECK-P9-NEXT: cmplw r3, r4 ; CHECK-P9-NEXT: blt cr0, .LBB1_9 ; CHECK-P9-NEXT: # %bb.8: # %if.then8 diff --git a/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll @@ -1397,10 +1397,10 @@ ; PC64LE9-NEXT: li 3, 0 ; PC64LE9-NEXT: xxlxor 2, 2, 2 ; PC64LE9-NEXT: xxlxor 4, 4, 4 +; PC64LE9-NEXT: mr 30, 4 ; PC64LE9-NEXT: std 3, 8(4) ; PC64LE9-NEXT: fmr 1, 31 ; PC64LE9-NEXT: fmr 3, 31 -; PC64LE9-NEXT: mr 30, 4 ; PC64LE9-NEXT: stfd 31, 0(4) ; PC64LE9-NEXT: bl __gcc_qadd ; PC64LE9-NEXT: nop diff --git a/llvm/test/CodeGen/PowerPC/pr45432.ll b/llvm/test/CodeGen/PowerPC/pr45432.ll --- a/llvm/test/CodeGen/PowerPC/pr45432.ll +++ b/llvm/test/CodeGen/PowerPC/pr45432.ll @@ -14,8 +14,8 @@ ; CHECK-NEXT: std 0, 16(1) ; CHECK-NEXT: stdu 1, -64(1) ; CHECK-NEXT: addis 3, 2, g@toc@ha -; CHECK-NEXT: lwz 3, g@toc@l(3) ; CHECK-NEXT: std 30, 48(1) # 8-byte Folded Spill +; CHECK-NEXT: lwz 3, g@toc@l(3) ; CHECK-NEXT: extswsli 30, 3, 2 ; CHECK-NEXT: addis 3, 2, f@got@tlsld@ha ; CHECK-NEXT: addi 3, 3, f@got@tlsld@l diff --git a/llvm/test/CodeGen/PowerPC/pr45448.ll b/llvm/test/CodeGen/PowerPC/pr45448.ll --- a/llvm/test/CodeGen/PowerPC/pr45448.ll +++ b/llvm/test/CodeGen/PowerPC/pr45448.ll @@ -20,13 +20,13 @@ ; CHECK-NEXT: .LBB0_6: # %L1057.preheader ; CHECK-NEXT: .LBB0_7: # %L670 ; CHECK-NEXT: lis r5, 4095 -; CHECK-NEXT: ori r5, r5, 65533 -; CHECK-NEXT: sldi r5, r5, 4 ; CHECK-NEXT: cmpdi r3, 0 ; CHECK-NEXT: sradi r4, r3, 63 +; CHECK-NEXT: ori r5, r5, 65533 +; CHECK-NEXT: crnot 4*cr5+gt, eq +; CHECK-NEXT: sldi r5, r5, 4 ; CHECK-NEXT: mulhdu r3, r3, r5 ; CHECK-NEXT: maddld r6, r4, r5, r3 -; CHECK-NEXT: crnot 4*cr5+gt, eq ; CHECK-NEXT: cmpld r6, r3 ; CHECK-NEXT: mulld r3, r4, r5 ; CHECK-NEXT: cmpldi cr1, r3, 0 diff --git a/llvm/test/CodeGen/PowerPC/pr45628.ll b/llvm/test/CodeGen/PowerPC/pr45628.ll --- a/llvm/test/CodeGen/PowerPC/pr45628.ll +++ b/llvm/test/CodeGen/PowerPC/pr45628.ll @@ -223,9 +223,9 @@ ; P9-NOVSX-NEXT: rldimi r5, r3, 28, 0 ; P9-NOVSX-NEXT: rotldi r3, r3, 28 ; P9-NOVSX-NEXT: rldimi r3, r4, 28, 0 +; P9-NOVSX-NEXT: std r5, -8(r1) ; P9-NOVSX-NEXT: std r3, -16(r1) ; P9-NOVSX-NEXT: addi r3, r1, -16 -; P9-NOVSX-NEXT: std r5, -8(r1) ; P9-NOVSX-NEXT: lvx v2, 0, r3 ; P9-NOVSX-NEXT: blr ; diff --git a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll --- a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll +++ b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll @@ -13,29 +13,29 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lxsd v5, 0(r5) ; CHECK-NEXT: addis r5, r2, .LCPI0_0@toc@ha +; CHECK-NEXT: xxlxor v3, v3, v3 +; CHECK-NEXT: li r6, 0 ; CHECK-NEXT: addi r5, r5, .LCPI0_0@toc@l ; CHECK-NEXT: lxvx v2, 0, r5 ; CHECK-NEXT: addis r5, r2, .LCPI0_1@toc@ha ; CHECK-NEXT: addi r5, r5, .LCPI0_1@toc@l ; CHECK-NEXT: lxvx v4, 0, r5 ; CHECK-NEXT: li r5, 4 -; CHECK-NEXT: xxlxor v3, v3, v3 ; CHECK-NEXT: vperm v0, v3, v5, v2 ; CHECK-NEXT: mtctr r5 ; CHECK-NEXT: li r5, 0 ; CHECK-NEXT: vperm v1, v3, v5, v4 -; CHECK-NEXT: li r6, 0 ; CHECK-NEXT: xvnegsp v5, v0 ; CHECK-NEXT: xvnegsp v0, v1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: # %for.cond1.preheader ; CHECK-NEXT: # ; CHECK-NEXT: lxsd v1, 0(r3) +; CHECK-NEXT: add r7, r3, r4 ; CHECK-NEXT: vperm v6, v3, v1, v4 ; CHECK-NEXT: vperm v1, v3, v1, v2 ; CHECK-NEXT: xvnegsp v1, v1 ; CHECK-NEXT: xvnegsp v6, v6 -; CHECK-NEXT: add r7, r3, r4 ; CHECK-NEXT: vabsduw v1, v1, v5 ; CHECK-NEXT: vabsduw v6, v6, v0 ; CHECK-NEXT: vadduwm v1, v6, v1 @@ -47,10 +47,11 @@ ; CHECK-NEXT: vextuwrx r3, r5, v1 ; CHECK-NEXT: vperm v7, v3, v6, v4 ; CHECK-NEXT: vperm v6, v3, v6, v2 +; CHECK-NEXT: add r6, r3, r6 +; CHECK-NEXT: add r3, r7, r4 ; CHECK-NEXT: xvnegsp v6, v6 ; CHECK-NEXT: xvnegsp v1, v7 ; CHECK-NEXT: vabsduw v6, v6, v5 -; CHECK-NEXT: add r6, r3, r6 ; CHECK-NEXT: vabsduw v1, v1, v0 ; CHECK-NEXT: vadduwm v1, v1, v6 ; CHECK-NEXT: xxswapd v6, v1 @@ -58,7 +59,6 @@ ; CHECK-NEXT: xxspltw v6, v1, 2 ; CHECK-NEXT: vadduwm v1, v1, v6 ; CHECK-NEXT: vextuwrx r8, r5, v1 -; CHECK-NEXT: add r3, r7, r4 ; CHECK-NEXT: add r6, r8, r6 ; CHECK-NEXT: bdnz .LBB0_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup @@ -69,25 +69,26 @@ ; P9BE: # %bb.0: # %entry ; P9BE-NEXT: lfd f0, 0(r5) ; P9BE-NEXT: addis r5, r2, .LCPI0_0@toc@ha +; P9BE-NEXT: xxlxor v3, v3, v3 +; P9BE-NEXT: li r6, 0 ; P9BE-NEXT: addi r5, r5, .LCPI0_0@toc@l ; P9BE-NEXT: lxvx v2, 0, r5 ; P9BE-NEXT: addis r5, r2, .LCPI0_1@toc@ha +; P9BE-NEXT: xxlor v5, vs0, vs0 ; P9BE-NEXT: addi r5, r5, .LCPI0_1@toc@l ; P9BE-NEXT: lxvx v4, 0, r5 ; P9BE-NEXT: li r5, 4 -; P9BE-NEXT: xxlor v5, vs0, vs0 -; P9BE-NEXT: xxlxor v3, v3, v3 ; P9BE-NEXT: vperm v0, v3, v5, v2 ; P9BE-NEXT: mtctr r5 ; P9BE-NEXT: li r5, 0 ; P9BE-NEXT: vperm v1, v3, v5, v4 -; P9BE-NEXT: li r6, 0 ; P9BE-NEXT: xvnegsp v5, v0 ; P9BE-NEXT: xvnegsp v0, v1 ; P9BE-NEXT: .p2align 4 ; P9BE-NEXT: .LBB0_1: # %for.cond1.preheader ; P9BE-NEXT: # ; P9BE-NEXT: lfd f0, 0(r3) +; P9BE-NEXT: add r7, r3, r4 ; P9BE-NEXT: xxlor v1, vs0, vs0 ; P9BE-NEXT: lfdx f0, r3, r4 ; P9BE-NEXT: vperm v6, v3, v1, v4 @@ -104,20 +105,19 @@ ; P9BE-NEXT: xxlor v6, vs0, vs0 ; P9BE-NEXT: vperm v7, v3, v6, v4 ; P9BE-NEXT: vperm v6, v3, v6, v2 -; P9BE-NEXT: add r7, r3, r4 ; P9BE-NEXT: vextuwlx r3, r5, v1 ; P9BE-NEXT: xvnegsp v6, v6 +; P9BE-NEXT: add r6, r3, r6 ; P9BE-NEXT: xvnegsp v1, v7 -; P9BE-NEXT: vabsduw v1, v1, v0 +; P9BE-NEXT: add r3, r7, r4 ; P9BE-NEXT: vabsduw v6, v6, v5 +; P9BE-NEXT: vabsduw v1, v1, v0 ; P9BE-NEXT: vadduwm v1, v1, v6 ; P9BE-NEXT: xxswapd v6, v1 -; P9BE-NEXT: add r6, r3, r6 ; P9BE-NEXT: vadduwm v1, v1, v6 ; P9BE-NEXT: xxspltw v6, v1, 1 ; P9BE-NEXT: vadduwm v1, v1, v6 ; P9BE-NEXT: vextuwlx r8, r5, v1 -; P9BE-NEXT: add r3, r7, r4 ; P9BE-NEXT: add r6, r8, r6 ; P9BE-NEXT: bdnz .LBB0_1 ; P9BE-NEXT: # %bb.2: # %for.cond.cleanup @@ -180,13 +180,14 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lxsd v2, 0(r3) ; CHECK-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-NEXT: lxsd v1, 0(r4) +; CHECK-NEXT: xxlxor v3, v3, v3 ; CHECK-NEXT: addi r3, r3, .LCPI1_0@toc@l ; CHECK-NEXT: lxvx v4, 0, r3 ; CHECK-NEXT: addis r3, r2, .LCPI1_1@toc@ha ; CHECK-NEXT: addi r3, r3, .LCPI1_1@toc@l ; CHECK-NEXT: lxvx v0, 0, r3 -; CHECK-NEXT: lxsd v1, 0(r4) -; CHECK-NEXT: xxlxor v3, v3, v3 +; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: vperm v5, v3, v2, v4 ; CHECK-NEXT: vperm v2, v3, v2, v0 ; CHECK-NEXT: vperm v0, v3, v1, v0 @@ -198,7 +199,6 @@ ; CHECK-NEXT: vadduwm v2, v2, v3 ; CHECK-NEXT: xxspltw v3, v2, 2 ; CHECK-NEXT: vadduwm v2, v2, v3 -; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: vextuwrx r3, r3, v2 ; CHECK-NEXT: extsw r3, r3 ; CHECK-NEXT: blr @@ -207,6 +207,7 @@ ; P9BE: # %bb.0: # %entry ; P9BE-NEXT: lfd f0, 0(r3) ; P9BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; P9BE-NEXT: xxlxor v3, v3, v3 ; P9BE-NEXT: addi r3, r3, .LCPI1_0@toc@l ; P9BE-NEXT: lxvx v4, 0, r3 ; P9BE-NEXT: addis r3, r2, .LCPI1_1@toc@ha @@ -214,8 +215,8 @@ ; P9BE-NEXT: xxlor v2, vs0, vs0 ; P9BE-NEXT: lfd f0, 0(r4) ; P9BE-NEXT: lxvx v0, 0, r3 -; P9BE-NEXT: xxlxor v3, v3, v3 ; P9BE-NEXT: xxlor v1, vs0, vs0 +; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vperm v5, v3, v2, v4 ; P9BE-NEXT: vperm v2, v3, v2, v0 ; P9BE-NEXT: vperm v0, v3, v1, v0 @@ -227,7 +228,6 @@ ; P9BE-NEXT: vadduwm v2, v2, v3 ; P9BE-NEXT: xxspltw v3, v2, 1 ; P9BE-NEXT: vadduwm v2, v2, v3 -; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuwlx r3, r3, v2 ; P9BE-NEXT: extsw r3, r3 ; P9BE-NEXT: blr @@ -283,11 +283,11 @@ ; CHECK-NEXT: add r5, r3, r4 ; CHECK-NEXT: lxsiwzx v2, r3, r4 ; CHECK-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-NEXT: xxlxor v3, v3, v3 ; CHECK-NEXT: addi r3, r3, .LCPI2_0@toc@l ; CHECK-NEXT: lxvx v4, 0, r3 ; CHECK-NEXT: li r3, 4 ; CHECK-NEXT: lxsiwzx v5, r5, r3 -; CHECK-NEXT: xxlxor v3, v3, v3 ; CHECK-NEXT: vperm v2, v2, v3, v4 ; CHECK-NEXT: vperm v3, v5, v3, v4 ; CHECK-NEXT: vspltisw v4, 8 @@ -304,12 +304,12 @@ ; P9BE-NEXT: add r5, r3, r4 ; P9BE-NEXT: lfiwzx f0, r3, r4 ; P9BE-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; P9BE-NEXT: xxlxor v3, v3, v3 +; P9BE-NEXT: xxsldwi v2, f0, f0, 1 ; P9BE-NEXT: addi r3, r3, .LCPI2_0@toc@l ; P9BE-NEXT: lxvx v4, 0, r3 ; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: xxsldwi v2, f0, f0, 1 ; P9BE-NEXT: lfiwzx f0, r5, r3 -; P9BE-NEXT: xxlxor v3, v3, v3 ; P9BE-NEXT: vperm v2, v3, v2, v4 ; P9BE-NEXT: xxsldwi v5, f0, f0, 1 ; P9BE-NEXT: vperm v3, v3, v5, v4 @@ -349,16 +349,16 @@ ; CHECK-LABEL: test16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: sldi r4, r4, 1 -; CHECK-NEXT: add r6, r3, r4 ; CHECK-NEXT: li r7, 16 -; CHECK-NEXT: lxsihzx v2, r6, r7 +; CHECK-NEXT: add r6, r3, r4 ; CHECK-NEXT: lxsihzx v4, r3, r4 +; CHECK-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; CHECK-NEXT: lxsihzx v2, r6, r7 ; CHECK-NEXT: li r6, 0 +; CHECK-NEXT: addi r3, r3, .LCPI3_0@toc@l ; CHECK-NEXT: mtvsrd v3, r6 ; CHECK-NEXT: vsplth v4, v4, 3 ; CHECK-NEXT: vsplth v2, v2, 3 -; CHECK-NEXT: addis r3, r2, .LCPI3_0@toc@ha -; CHECK-NEXT: addi r3, r3, .LCPI3_0@toc@l ; CHECK-NEXT: vmrghh v4, v3, v4 ; CHECK-NEXT: vmrghh v2, v3, v2 ; CHECK-NEXT: vsplth v3, v3, 3 @@ -376,17 +376,17 @@ ; P9BE-LABEL: test16: ; P9BE: # %bb.0: # %entry ; P9BE-NEXT: sldi r4, r4, 1 -; P9BE-NEXT: add r6, r3, r4 ; P9BE-NEXT: li r7, 16 -; P9BE-NEXT: lxsihzx v2, r6, r7 +; P9BE-NEXT: add r6, r3, r4 ; P9BE-NEXT: lxsihzx v4, r3, r4 +; P9BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; P9BE-NEXT: lxsihzx v2, r6, r7 ; P9BE-NEXT: li r6, 0 +; P9BE-NEXT: addi r3, r3, .LCPI3_0@toc@l ; P9BE-NEXT: sldi r6, r6, 48 ; P9BE-NEXT: vsplth v4, v4, 3 ; P9BE-NEXT: mtvsrd v3, r6 ; P9BE-NEXT: vsplth v2, v2, 3 -; P9BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha -; P9BE-NEXT: addi r3, r3, .LCPI3_0@toc@l ; P9BE-NEXT: vmrghh v4, v3, v4 ; P9BE-NEXT: vmrghh v2, v3, v2 ; P9BE-NEXT: vsplth v3, v3, 0 @@ -441,11 +441,11 @@ ; CHECK-NEXT: mtvsrd v3, r3 ; CHECK-NEXT: li r3, 8 ; CHECK-NEXT: lxsibzx v5, r6, r3 +; CHECK-NEXT: vspltb v4, v3, 7 ; CHECK-NEXT: addis r3, r2, .LCPI4_0@toc@ha -; CHECK-NEXT: addi r3, r3, .LCPI4_0@toc@l ; CHECK-NEXT: vspltb v2, v2, 7 +; CHECK-NEXT: addi r3, r3, .LCPI4_0@toc@l ; CHECK-NEXT: vmrghb v2, v3, v2 -; CHECK-NEXT: vspltb v4, v3, 7 ; CHECK-NEXT: vspltb v5, v5, 7 ; CHECK-NEXT: vmrglh v2, v2, v4 ; CHECK-NEXT: vmrghb v3, v3, v5 @@ -466,9 +466,11 @@ ; P9BE: # %bb.0: # %entry ; P9BE-NEXT: add r6, r3, r4 ; P9BE-NEXT: li r7, 8 -; P9BE-NEXT: lxsibzx v2, r6, r7 ; P9BE-NEXT: lxsibzx v4, r3, r4 +; P9BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P9BE-NEXT: lxsibzx v2, r6, r7 ; P9BE-NEXT: li r6, 0 +; P9BE-NEXT: addi r3, r3, .LCPI4_0@toc@l ; P9BE-NEXT: sldi r6, r6, 56 ; P9BE-NEXT: vspltb v4, v4, 7 ; P9BE-NEXT: mtvsrd v3, r6 @@ -476,8 +478,6 @@ ; P9BE-NEXT: vmrghb v4, v3, v4 ; P9BE-NEXT: vmrghb v2, v3, v2 ; P9BE-NEXT: vspltb v3, v3, 0 -; P9BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha -; P9BE-NEXT: addi r3, r3, .LCPI4_0@toc@l ; P9BE-NEXT: vmrghh v4, v4, v3 ; P9BE-NEXT: xxspltw v3, v3, 0 ; P9BE-NEXT: vmrghw v2, v4, v2 diff --git a/llvm/test/CodeGen/PowerPC/recipest.ll b/llvm/test/CodeGen/PowerPC/recipest.ll --- a/llvm/test/CodeGen/PowerPC/recipest.ll +++ b/llvm/test/CodeGen/PowerPC/recipest.ll @@ -804,8 +804,8 @@ ; CHECK-P9-LABEL: foo3_fmf: ; CHECK-P9: # %bb.0: ; CHECK-P9-NEXT: addis 3, 2, .LCPI20_2@toc@ha -; CHECK-P9-NEXT: lfd 2, .LCPI20_2@toc@l(3) ; CHECK-P9-NEXT: xsabsdp 0, 1 +; CHECK-P9-NEXT: lfd 2, .LCPI20_2@toc@l(3) ; CHECK-P9-NEXT: xscmpudp 0, 0, 2 ; CHECK-P9-NEXT: xxlxor 0, 0, 0 ; CHECK-P9-NEXT: blt 0, .LBB20_2 @@ -899,8 +899,8 @@ ; CHECK-P9-LABEL: goo3_fmf: ; CHECK-P9: # %bb.0: ; CHECK-P9-NEXT: addis 3, 2, .LCPI22_2@toc@ha -; CHECK-P9-NEXT: lfs 2, .LCPI22_2@toc@l(3) ; CHECK-P9-NEXT: xsabsdp 0, 1 +; CHECK-P9-NEXT: lfs 2, .LCPI22_2@toc@l(3) ; CHECK-P9-NEXT: fcmpu 0, 0, 2 ; CHECK-P9-NEXT: xxlxor 0, 0, 0 ; CHECK-P9-NEXT: blt 0, .LBB22_2 diff --git a/llvm/test/CodeGen/PowerPC/rematerializable-instruction-machine-licm.ll b/llvm/test/CodeGen/PowerPC/rematerializable-instruction-machine-licm.ll --- a/llvm/test/CodeGen/PowerPC/rematerializable-instruction-machine-licm.ll +++ b/llvm/test/CodeGen/PowerPC/rematerializable-instruction-machine-licm.ll @@ -28,69 +28,80 @@ ; CHECK-NEXT: .cfi_offset r31, -8 ; CHECK-NEXT: .cfi_offset r2, -152 ; CHECK-NEXT: lis 5, 4 +; CHECK-NEXT: std 30, 704(1) # 8-byte Folded Spill +; CHECK-NEXT: std 29, 696(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 6, 5, 6292 +; CHECK-NEXT: std 28, 688(1) # 8-byte Folded Spill +; CHECK-NEXT: std 27, 680(1) # 8-byte Folded Spill +; CHECK-NEXT: std 26, 672(1) # 8-byte Folded Spill +; CHECK-NEXT: std 25, 664(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 5, 5, 6291 +; CHECK-NEXT: std 14, 576(1) # 8-byte Folded Spill +; CHECK-NEXT: std 15, 584(1) # 8-byte Folded Spill +; CHECK-NEXT: std 16, 592(1) # 8-byte Folded Spill +; CHECK-NEXT: std 17, 600(1) # 8-byte Folded Spill +; CHECK-NEXT: std 18, 608(1) # 8-byte Folded Spill +; CHECK-NEXT: std 19, 616(1) # 8-byte Folded Spill +; CHECK-NEXT: std 20, 624(1) # 8-byte Folded Spill +; CHECK-NEXT: std 21, 632(1) # 8-byte Folded Spill +; CHECK-NEXT: std 22, 640(1) # 8-byte Folded Spill +; CHECK-NEXT: std 23, 648(1) # 8-byte Folded Spill +; CHECK-NEXT: std 24, 656(1) # 8-byte Folded Spill +; CHECK-NEXT: std 31, 712(1) # 8-byte Folded Spill +; CHECK-NEXT: std 2, 568(1) # 8-byte Folded Spill ; CHECK-NEXT: sldi 6, 6, 32 ; CHECK-NEXT: oris 7, 6, 13030 ; CHECK-NEXT: oris 8, 6, 13066 -; CHECK-NEXT: ori 7, 7, 3704 ; CHECK-NEXT: oris 9, 6, 13054 +; CHECK-NEXT: oris 10, 6, 13042 +; CHECK-NEXT: oris 11, 6, 13078 +; CHECK-NEXT: oris 12, 6, 13115 +; CHECK-NEXT: oris 0, 6, 13103 +; CHECK-NEXT: oris 30, 6, 13091 +; CHECK-NEXT: oris 29, 6, 13127 +; CHECK-NEXT: oris 28, 6, 13164 +; CHECK-NEXT: oris 27, 6, 13152 +; CHECK-NEXT: oris 26, 6, 13139 +; CHECK-NEXT: oris 25, 6, 13176 +; CHECK-NEXT: ori 7, 7, 3704 ; CHECK-NEXT: ori 8, 8, 44408 ; CHECK-NEXT: ori 9, 9, 30840 -; CHECK-NEXT: add 7, 4, 7 -; CHECK-NEXT: oris 10, 6, 13042 ; CHECK-NEXT: ori 10, 10, 17272 -; CHECK-NEXT: std 7, 384(1) # 8-byte Folded Spill -; CHECK-NEXT: add 7, 4, 8 -; CHECK-NEXT: oris 11, 6, 13078 ; CHECK-NEXT: ori 11, 11, 57976 -; CHECK-NEXT: std 7, 376(1) # 8-byte Folded Spill -; CHECK-NEXT: add 7, 4, 9 -; CHECK-NEXT: oris 12, 6, 13115 ; CHECK-NEXT: ori 12, 12, 33144 -; CHECK-NEXT: std 7, 368(1) # 8-byte Folded Spill -; CHECK-NEXT: add 7, 4, 10 -; CHECK-NEXT: oris 0, 6, 13103 ; CHECK-NEXT: ori 0, 0, 19576 -; CHECK-NEXT: std 7, 360(1) # 8-byte Folded Spill -; CHECK-NEXT: add 7, 4, 11 -; CHECK-NEXT: std 30, 704(1) # 8-byte Folded Spill -; CHECK-NEXT: oris 30, 6, 13091 ; CHECK-NEXT: ori 30, 30, 6008 -; CHECK-NEXT: std 7, 352(1) # 8-byte Folded Spill -; CHECK-NEXT: add 7, 4, 12 -; CHECK-NEXT: std 29, 696(1) # 8-byte Folded Spill -; CHECK-NEXT: oris 29, 6, 13127 ; CHECK-NEXT: ori 29, 29, 46712 +; CHECK-NEXT: ori 28, 28, 21880 +; CHECK-NEXT: ori 27, 27, 8312 +; CHECK-NEXT: ori 26, 26, 60280 +; CHECK-NEXT: ori 25, 25, 35448 +; CHECK-NEXT: add 7, 4, 7 ; CHECK-NEXT: sldi 5, 5, 32 ; CHECK-NEXT: oris 5, 5, 29347 ; CHECK-NEXT: ori 5, 5, 20088 +; CHECK-NEXT: std 7, 384(1) # 8-byte Folded Spill +; CHECK-NEXT: add 7, 4, 8 ; CHECK-NEXT: lis 8, 402 +; CHECK-NEXT: std 7, 376(1) # 8-byte Folded Spill +; CHECK-NEXT: add 7, 4, 9 ; CHECK-NEXT: lis 9, 451 +; CHECK-NEXT: std 7, 368(1) # 8-byte Folded Spill +; CHECK-NEXT: add 7, 4, 10 ; CHECK-NEXT: lis 10, 500 +; CHECK-NEXT: std 7, 360(1) # 8-byte Folded Spill +; CHECK-NEXT: add 7, 4, 11 ; CHECK-NEXT: lis 11, 549 -; CHECK-NEXT: std 31, 712(1) # 8-byte Folded Spill -; CHECK-NEXT: std 2, 568(1) # 8-byte Folded Spill +; CHECK-NEXT: std 7, 352(1) # 8-byte Folded Spill +; CHECK-NEXT: add 7, 4, 12 ; CHECK-NEXT: std 7, 344(1) # 8-byte Folded Spill ; CHECK-NEXT: add 7, 4, 0 -; CHECK-NEXT: std 28, 688(1) # 8-byte Folded Spill -; CHECK-NEXT: oris 28, 6, 13164 -; CHECK-NEXT: ori 28, 28, 21880 ; CHECK-NEXT: std 7, 336(1) # 8-byte Folded Spill ; CHECK-NEXT: add 7, 4, 30 -; CHECK-NEXT: std 27, 680(1) # 8-byte Folded Spill -; CHECK-NEXT: oris 27, 6, 13152 -; CHECK-NEXT: ori 27, 27, 8312 ; CHECK-NEXT: std 7, 328(1) # 8-byte Folded Spill ; CHECK-NEXT: add 7, 4, 29 -; CHECK-NEXT: std 26, 672(1) # 8-byte Folded Spill -; CHECK-NEXT: oris 26, 6, 13139 -; CHECK-NEXT: ori 26, 26, 60280 ; CHECK-NEXT: std 7, 320(1) # 8-byte Folded Spill ; CHECK-NEXT: add 7, 4, 28 -; CHECK-NEXT: std 25, 664(1) # 8-byte Folded Spill -; CHECK-NEXT: oris 25, 6, 13176 -; CHECK-NEXT: ori 25, 25, 35448 ; CHECK-NEXT: std 7, 312(1) # 8-byte Folded Spill ; CHECK-NEXT: add 7, 4, 27 ; CHECK-NEXT: std 7, 304(1) # 8-byte Folded Spill @@ -112,6 +123,10 @@ ; CHECK-NEXT: lis 5, 268 ; CHECK-NEXT: std 4, 256(1) # 8-byte Folded Spill ; CHECK-NEXT: lis 4, 585 +; CHECK-NEXT: std 6, 264(1) # 8-byte Folded Spill +; CHECK-NEXT: lis 6, 305 +; CHECK-NEXT: std 7, 272(1) # 8-byte Folded Spill +; CHECK-NEXT: lis 7, 354 ; CHECK-NEXT: ori 4, 4, 61440 ; CHECK-NEXT: std 4, 560(1) # 8-byte Folded Spill ; CHECK-NEXT: lis 4, 48 @@ -200,94 +215,79 @@ ; CHECK-NEXT: std 4, 192(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 4, 5, 36352 ; CHECK-NEXT: lis 5, 317 +; CHECK-NEXT: ld 30, 192(1) # 8-byte Folded Reload ; CHECK-NEXT: std 4, 184(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 4, 5, 25088 ; CHECK-NEXT: lis 5, 366 +; CHECK-NEXT: ld 29, 184(1) # 8-byte Folded Reload ; CHECK-NEXT: std 4, 176(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 4, 5, 13824 ; CHECK-NEXT: lis 5, 415 +; CHECK-NEXT: ld 28, 176(1) # 8-byte Folded Reload ; CHECK-NEXT: std 4, 168(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 4, 5, 2560 ; CHECK-NEXT: lis 5, 463 +; CHECK-NEXT: ld 27, 168(1) # 8-byte Folded Reload ; CHECK-NEXT: std 4, 160(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 4, 5, 56832 ; CHECK-NEXT: lis 5, 512 +; CHECK-NEXT: ld 26, 160(1) # 8-byte Folded Reload ; CHECK-NEXT: std 4, 152(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 4, 5, 45568 ; CHECK-NEXT: lis 5, 561 +; CHECK-NEXT: ld 25, 152(1) # 8-byte Folded Reload ; CHECK-NEXT: std 4, 144(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 4, 5, 34304 ; CHECK-NEXT: lis 5, 12 +; CHECK-NEXT: ld 24, 144(1) # 8-byte Folded Reload ; CHECK-NEXT: std 4, 136(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 4, 5, 13568 ; CHECK-NEXT: lis 5, 61 +; CHECK-NEXT: ld 23, 136(1) # 8-byte Folded Reload ; CHECK-NEXT: std 4, 128(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 4, 5, 2304 ; CHECK-NEXT: lis 5, 109 ; CHECK-NEXT: std 4, 120(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 4, 5, 56576 ; CHECK-NEXT: lis 5, 158 +; CHECK-NEXT: ld 0, 120(1) # 8-byte Folded Reload ; CHECK-NEXT: std 4, 112(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 4, 5, 45312 ; CHECK-NEXT: lis 5, 207 +; CHECK-NEXT: ld 22, 112(1) # 8-byte Folded Reload ; CHECK-NEXT: std 4, 104(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 4, 5, 34048 ; CHECK-NEXT: lis 5, 256 -; CHECK-NEXT: std 6, 264(1) # 8-byte Folded Spill -; CHECK-NEXT: lis 6, 305 -; CHECK-NEXT: ld 30, 192(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 29, 184(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 28, 176(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 27, 168(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 26, 160(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 25, 152(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 0, 120(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 21, 104(1) # 8-byte Folded Reload ; CHECK-NEXT: std 4, 96(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 4, 5, 22784 -; CHECK-NEXT: std 7, 272(1) # 8-byte Folded Spill -; CHECK-NEXT: lis 7, 354 +; CHECK-NEXT: ld 5, 248(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 20, 96(1) # 8-byte Folded Reload ; CHECK-NEXT: std 4, 88(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 4, 6, 11520 ; CHECK-NEXT: ld 6, 240(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 19, 88(1) # 8-byte Folded Reload ; CHECK-NEXT: std 4, 80(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 4, 7, 256 ; CHECK-NEXT: ld 7, 232(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 18, 80(1) # 8-byte Folded Reload ; CHECK-NEXT: std 4, 72(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 4, 8, 54528 ; CHECK-NEXT: ld 8, 224(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 17, 72(1) # 8-byte Folded Reload ; CHECK-NEXT: std 4, 64(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 4, 9, 43264 ; CHECK-NEXT: ld 9, 216(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 16, 64(1) # 8-byte Folded Reload ; CHECK-NEXT: std 4, 56(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 4, 10, 32000 ; CHECK-NEXT: ld 10, 208(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 15, 56(1) # 8-byte Folded Reload ; CHECK-NEXT: std 4, 48(1) # 8-byte Folded Spill ; CHECK-NEXT: ori 4, 11, 20736 ; CHECK-NEXT: ld 11, 200(1) # 8-byte Folded Reload -; CHECK-NEXT: std 4, 40(1) # 8-byte Folded Spill -; CHECK-NEXT: std 14, 576(1) # 8-byte Folded Spill -; CHECK-NEXT: std 15, 584(1) # 8-byte Folded Spill -; CHECK-NEXT: std 16, 592(1) # 8-byte Folded Spill -; CHECK-NEXT: std 17, 600(1) # 8-byte Folded Spill -; CHECK-NEXT: std 18, 608(1) # 8-byte Folded Spill -; CHECK-NEXT: std 19, 616(1) # 8-byte Folded Spill -; CHECK-NEXT: std 20, 624(1) # 8-byte Folded Spill -; CHECK-NEXT: std 21, 632(1) # 8-byte Folded Spill -; CHECK-NEXT: std 22, 640(1) # 8-byte Folded Spill -; CHECK-NEXT: std 23, 648(1) # 8-byte Folded Spill -; CHECK-NEXT: std 24, 656(1) # 8-byte Folded Spill -; CHECK-NEXT: ld 5, 248(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 24, 144(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 23, 136(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 22, 112(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 21, 104(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 20, 96(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 19, 88(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 18, 80(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 17, 72(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 16, 64(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 15, 56(1) # 8-byte Folded Reload ; CHECK-NEXT: ld 14, 48(1) # 8-byte Folded Reload +; CHECK-NEXT: std 4, 40(1) # 8-byte Folded Spill ; CHECK-NEXT: li 4, 0 ; CHECK-NEXT: ld 31, 40(1) # 8-byte Folded Reload ; CHECK-NEXT: .p2align 4 @@ -305,6 +305,32 @@ ; CHECK-NEXT: stdux 3, 12, 2 ; CHECK-NEXT: ld 2, 552(1) # 8-byte Folded Reload ; CHECK-NEXT: stdx 3, 12, 5 +; CHECK-NEXT: stdx 3, 12, 6 +; CHECK-NEXT: stdx 3, 12, 7 +; CHECK-NEXT: stdx 3, 12, 8 +; CHECK-NEXT: stdx 3, 12, 9 +; CHECK-NEXT: stdx 3, 12, 10 +; CHECK-NEXT: stdx 3, 12, 11 +; CHECK-NEXT: stdx 3, 12, 30 +; CHECK-NEXT: stdx 3, 12, 29 +; CHECK-NEXT: stdx 3, 12, 28 +; CHECK-NEXT: stdx 3, 12, 27 +; CHECK-NEXT: stdx 3, 12, 26 +; CHECK-NEXT: stdx 3, 12, 25 +; CHECK-NEXT: stdx 3, 12, 24 +; CHECK-NEXT: stdx 3, 12, 23 +; CHECK-NEXT: stdx 3, 12, 4 +; CHECK-NEXT: stdx 3, 12, 0 +; CHECK-NEXT: stdx 3, 12, 22 +; CHECK-NEXT: stdx 3, 12, 21 +; CHECK-NEXT: stdx 3, 12, 20 +; CHECK-NEXT: stdx 3, 12, 19 +; CHECK-NEXT: stdx 3, 12, 18 +; CHECK-NEXT: stdx 3, 12, 17 +; CHECK-NEXT: stdx 3, 12, 16 +; CHECK-NEXT: stdx 3, 12, 15 +; CHECK-NEXT: stdx 3, 12, 14 +; CHECK-NEXT: stdx 3, 12, 31 ; CHECK-NEXT: stdx 3, 12, 2 ; CHECK-NEXT: ld 2, 544(1) # 8-byte Folded Reload ; CHECK-NEXT: stdx 3, 12, 2 @@ -344,35 +370,11 @@ ; CHECK-NEXT: stdx 3, 12, 2 ; CHECK-NEXT: ld 2, 400(1) # 8-byte Folded Reload ; CHECK-NEXT: stdx 3, 12, 2 -; CHECK-NEXT: stdx 3, 12, 6 -; CHECK-NEXT: stdx 3, 12, 7 -; CHECK-NEXT: stdx 3, 12, 8 -; CHECK-NEXT: stdx 3, 12, 9 -; CHECK-NEXT: stdx 3, 12, 10 -; CHECK-NEXT: stdx 3, 12, 11 -; CHECK-NEXT: stdx 3, 12, 30 -; CHECK-NEXT: stdx 3, 12, 29 -; CHECK-NEXT: stdx 3, 12, 28 -; CHECK-NEXT: stdx 3, 12, 27 -; CHECK-NEXT: stdx 3, 12, 26 -; CHECK-NEXT: stdx 3, 12, 25 -; CHECK-NEXT: stdx 3, 12, 24 -; CHECK-NEXT: stdx 3, 12, 23 -; CHECK-NEXT: stdx 3, 12, 4 -; CHECK-NEXT: stdx 3, 12, 0 -; CHECK-NEXT: stdx 3, 12, 22 -; CHECK-NEXT: stdx 3, 12, 21 -; CHECK-NEXT: stdx 3, 12, 20 -; CHECK-NEXT: stdx 3, 12, 19 -; CHECK-NEXT: stdx 3, 12, 18 -; CHECK-NEXT: stdx 3, 12, 17 -; CHECK-NEXT: stdx 3, 12, 16 -; CHECK-NEXT: stdx 3, 12, 15 -; CHECK-NEXT: stdx 3, 12, 14 -; CHECK-NEXT: stdx 3, 12, 31 ; CHECK-NEXT: bdnz .LBB0_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: ld 12, 384(1) # 8-byte Folded Reload +; CHECK-NEXT: lwz 4, 396(1) # 4-byte Folded Reload +; CHECK-NEXT: addi 4, 4, 1 ; CHECK-NEXT: std 3, 0(12) ; CHECK-NEXT: ld 12, 376(1) # 8-byte Folded Reload ; CHECK-NEXT: std 3, 0(12) @@ -399,8 +401,6 @@ ; CHECK-NEXT: ld 12, 288(1) # 8-byte Folded Reload ; CHECK-NEXT: std 3, 0(12) ; CHECK-NEXT: ld 12, 280(1) # 8-byte Folded Reload -; CHECK-NEXT: lwz 4, 396(1) # 4-byte Folded Reload -; CHECK-NEXT: addi 4, 4, 1 ; CHECK-NEXT: std 3, 0(12) ; CHECK-NEXT: ld 12, 272(1) # 8-byte Folded Reload ; CHECK-NEXT: std 3, 0(12) diff --git a/llvm/test/CodeGen/PowerPC/remove-redundant-load-imm.ll b/llvm/test/CodeGen/PowerPC/remove-redundant-load-imm.ll --- a/llvm/test/CodeGen/PowerPC/remove-redundant-load-imm.ll +++ b/llvm/test/CodeGen/PowerPC/remove-redundant-load-imm.ll @@ -40,8 +40,8 @@ ; PPC64LE-NEXT: std 0, 16(1) ; PPC64LE-NEXT: stdu 1, -32(1) ; PPC64LE-NEXT: addis 3, 2, .LC0@toc@ha -; PPC64LE-NEXT: ld 3, .LC0@toc@l(3) ; PPC64LE-NEXT: li 4, 0 +; PPC64LE-NEXT: ld 3, .LC0@toc@l(3) ; PPC64LE-NEXT: std 4, 0(3) ; PPC64LE-NEXT: bl barney.94 ; PPC64LE-NEXT: nop diff --git a/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll b/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll --- a/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll +++ b/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll @@ -166,8 +166,8 @@ ; P9LE-LABEL: s2v_test_f2: ; P9LE: # %bb.0: # %entry ; P9LE-NEXT: addi r3, r3, 4 -; P9LE-NEXT: lxsiwzx v3, 0, r3 ; P9LE-NEXT: vmrglw v2, v2, v2 +; P9LE-NEXT: lxsiwzx v3, 0, r3 ; P9LE-NEXT: vmrghw v2, v2, v3 ; P9LE-NEXT: blr @@ -208,17 +208,17 @@ ; P9LE-LABEL: s2v_test_f3: ; P9LE: # %bb.0: # %entry ; P9LE-NEXT: sldi r4, r7, 2 -; P9LE-NEXT: lxsiwzx v3, r3, r4 ; P9LE-NEXT: vmrglw v2, v2, v2 +; P9LE-NEXT: lxsiwzx v3, r3, r4 ; P9LE-NEXT: vmrghw v2, v2, v3 ; P9LE-NEXT: blr ; P9BE-LABEL: s2v_test_f3: ; P9BE: # %bb.0: # %entry ; P9BE: sldi r4, r7, 2 -; P9BE: lfiwzx f0, r3, r4 +; P9BE-DAG: lfiwzx f0, r3, r4 ; P9BE-DAG: xxspltw v2, v2, 1 -; P9BE-DAG: xxsldwi v3, f0, f0, 1 +; P9BE: xxsldwi v3, f0, f0, 1 ; P9BE: vmrghw v2, v3, v2 ; P9BE-NEXT: blr @@ -251,17 +251,17 @@ ; P9LE-LABEL: s2v_test_f4: ; P9LE: # %bb.0: # %entry ; P9LE-NEXT: addi r3, r3, 4 -; P9LE-NEXT: lxsiwzx v3, 0, r3 ; P9LE-NEXT: vmrglw v2, v2, v2 +; P9LE-NEXT: lxsiwzx v3, 0, r3 ; P9LE-NEXT: vmrghw v2, v2, v3 ; P9LE-NEXT: blr ; P9BE-LABEL: s2v_test_f4: ; P9BE: # %bb.0: # %entry ; P9BE: addi r3, r3, 4 -; P9BE: lfiwzx f0, 0, r3 +; P9BE-DAG: lfiwzx f0, 0, r3 ; P9BE-DAG: xxspltw v2, v2, 1 -; P9BE-DAG: xxsldwi v3, f0, f0, 1 +; P9BE: xxsldwi v3, f0, f0, 1 ; P9BE: vmrghw v2, v3, v2 ; P9BE-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/sched-addi.ll b/llvm/test/CodeGen/PowerPC/sched-addi.ll --- a/llvm/test/CodeGen/PowerPC/sched-addi.ll +++ b/llvm/test/CodeGen/PowerPC/sched-addi.ll @@ -18,9 +18,9 @@ ; CHECK-P9-NEXT: addi 6, 6, 16 ; CHECK-P9-NEXT: rldicr 5, 5, 0, 58 ; CHECK-P9-NEXT: addi 5, 5, -32 +; CHECK-P9-NEXT: lxvdsx 0, 0, 6 ; CHECK-P9-NEXT: rldicl 5, 5, 59, 5 ; CHECK-P9-NEXT: addi 5, 5, 1 -; CHECK-P9-NEXT: lxvdsx 0, 0, 6 ; CHECK-P9-NEXT: mtctr 5 ; CHECK-P9-NEXT: .p2align 4 ; CHECK-P9-NEXT: .LBB0_1: # %vector.body @@ -36,13 +36,13 @@ ; CHECK-P9-NEXT: xvmuldp 4, 4, 0 ; CHECK-P9-NEXT: xvmuldp 3, 3, 0 ; CHECK-P9-NEXT: xvmuldp 5, 5, 0 +; CHECK-P9-NEXT: addi 4, 4, 256 +; CHECK-P9-NEXT: xvmuldp 6, 6, 0 ; CHECK-P9-NEXT: stxv 1, 16(3) +; CHECK-P9-NEXT: stxv 2, 0(3) ; CHECK-P9-NEXT: stxv 3, 48(3) ; CHECK-P9-NEXT: stxv 4, 32(3) ; CHECK-P9-NEXT: stxv 5, 240(3) -; CHECK-P9-NEXT: addi 4, 4, 256 -; CHECK-P9-NEXT: xvmuldp 6, 6, 0 -; CHECK-P9-NEXT: stxv 2, 0(3) ; CHECK-P9-NEXT: stxv 6, 224(3) ; CHECK-P9-NEXT: addi 3, 3, 256 ; CHECK-P9-NEXT: bdnz .LBB0_1 @@ -57,9 +57,9 @@ ; CHECK-P9-NO-HEURISTIC-NEXT: rldicr 5, 5, 0, 58 ; CHECK-P9-NO-HEURISTIC-NEXT: addi 6, 6, 16 ; CHECK-P9-NO-HEURISTIC-NEXT: addi 5, 5, -32 +; CHECK-P9-NO-HEURISTIC-NEXT: lxvdsx 0, 0, 6 ; CHECK-P9-NO-HEURISTIC-NEXT: rldicl 5, 5, 59, 5 ; CHECK-P9-NO-HEURISTIC-NEXT: addi 5, 5, 1 -; CHECK-P9-NO-HEURISTIC-NEXT: lxvdsx 0, 0, 6 ; CHECK-P9-NO-HEURISTIC-NEXT: mtctr 5 ; CHECK-P9-NO-HEURISTIC-NEXT: .p2align 4 ; CHECK-P9-NO-HEURISTIC-NEXT: .LBB0_1: # %vector.body @@ -76,13 +76,13 @@ ; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 3, 3, 0 ; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 6, 6, 0 ; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 5, 5, 0 +; CHECK-P9-NO-HEURISTIC-NEXT: addi 4, 4, 256 ; CHECK-P9-NO-HEURISTIC-NEXT: stxv 1, 16(3) ; CHECK-P9-NO-HEURISTIC-NEXT: stxv 2, 0(3) ; CHECK-P9-NO-HEURISTIC-NEXT: stxv 3, 48(3) ; CHECK-P9-NO-HEURISTIC-NEXT: stxv 4, 32(3) ; CHECK-P9-NO-HEURISTIC-NEXT: stxv 5, 240(3) ; CHECK-P9-NO-HEURISTIC-NEXT: stxv 6, 224(3) -; CHECK-P9-NO-HEURISTIC-NEXT: addi 4, 4, 256 ; CHECK-P9-NO-HEURISTIC-NEXT: addi 3, 3, 256 ; CHECK-P9-NO-HEURISTIC-NEXT: bdnz .LBB0_1 ; CHECK-P9-NO-HEURISTIC-NEXT: # %bb.2: # %return.block diff --git a/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll b/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll --- a/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll +++ b/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll @@ -22,10 +22,10 @@ ; CHECK-NEXT: isellt 3, 3, 4 ; CHECK-NEXT: li 4, 0 ; CHECK-NEXT: addi 3, 3, 1 -; CHECK-NEXT: mtctr 3 -; CHECK-NEXT: li 3, 1 ; CHECK-NEXT: li 7, -1 ; CHECK-NEXT: li 5, 0 +; CHECK-NEXT: mtctr 3 +; CHECK-NEXT: li 3, 1 ; CHECK-NEXT: lbz 5, 0(5) ; CHECK-NEXT: bdz .LBB0_6 ; CHECK-NEXT: # %bb.1: @@ -62,23 +62,23 @@ ; CHECK-NEXT: add 4, 4, 6 ; CHECK-NEXT: .LBB0_6: ; CHECK-NEXT: xori 5, 5, 84 -; CHECK-NEXT: cntlzw 5, 5 ; CHECK-NEXT: clrldi 3, 3, 32 +; CHECK-NEXT: li 7, 0 +; CHECK-NEXT: li 8, 3 ; CHECK-NEXT: std 3, 104(1) +; CHECK-NEXT: cntlzw 5, 5 ; CHECK-NEXT: addis 3, 2, .LC0@toc@ha +; CHECK-NEXT: li 10, 0 ; CHECK-NEXT: ld 3, .LC0@toc@l(3) -; CHECK-NEXT: li 7, 0 -; CHECK-NEXT: li 8, 3 ; CHECK-NEXT: srwi 5, 5, 5 ; CHECK-NEXT: add 4, 4, 5 ; CHECK-NEXT: li 5, 0 ; CHECK-NEXT: std 5, 120(1) ; CHECK-NEXT: li 5, 3 -; CHECK-NEXT: std 5, 96(1) ; CHECK-NEXT: clrldi 6, 4, 32 ; CHECK-NEXT: li 4, 3 +; CHECK-NEXT: std 5, 96(1) ; CHECK-NEXT: li 5, 0 -; CHECK-NEXT: li 10, 0 ; CHECK-NEXT: bl printf ; CHECK-NEXT: nop %1 = load i32, i32* undef, align 4 diff --git a/llvm/test/CodeGen/PowerPC/sms-grp-order.ll b/llvm/test/CodeGen/PowerPC/sms-grp-order.ll --- a/llvm/test/CodeGen/PowerPC/sms-grp-order.ll +++ b/llvm/test/CodeGen/PowerPC/sms-grp-order.ll @@ -7,8 +7,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: lha 3, 0(3) ; CHECK-NEXT: li 5, 1 -; CHECK-NEXT: sldi 5, 5, 62 ; CHECK-NEXT: lhz 4, 0(0) +; CHECK-NEXT: sldi 5, 5, 62 ; CHECK-NEXT: mtctr 5 ; CHECK-NEXT: srawi 3, 3, 1 ; CHECK-NEXT: addze 3, 3 diff --git a/llvm/test/CodeGen/PowerPC/sms-phi-3.ll b/llvm/test/CodeGen/PowerPC/sms-phi-3.ll --- a/llvm/test/CodeGen/PowerPC/sms-phi-3.ll +++ b/llvm/test/CodeGen/PowerPC/sms-phi-3.ll @@ -21,9 +21,9 @@ ; CHECK-NEXT: nop ; CHECK-NEXT: addi 7, 30, -4 ; CHECK-NEXT: mtctr 3 -; CHECK-NEXT: lwzu 8, 4(7) ; CHECK-NEXT: addi 4, 29, -8 ; CHECK-NEXT: li 5, 0 +; CHECK-NEXT: lwzu 8, 4(7) ; CHECK-NEXT: bdz .LBB0_5 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: extswsli 6, 5, 5 diff --git a/llvm/test/CodeGen/PowerPC/sms-simple.ll b/llvm/test/CodeGen/PowerPC/sms-simple.ll --- a/llvm/test/CodeGen/PowerPC/sms-simple.ll +++ b/llvm/test/CodeGen/PowerPC/sms-simple.ll @@ -10,17 +10,17 @@ ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addis r5, r2, x@toc@ha -; CHECK-NEXT: addi r5, r5, x@toc@l -; CHECK-NEXT: addi r5, r5, -8 ; CHECK-NEXT: addis r6, r2, y@toc@ha ; CHECK-NEXT: li r7, 340 +; CHECK-NEXT: addi r5, r5, x@toc@l +; CHECK-NEXT: addi r5, r5, -8 ; CHECK-NEXT: addi r3, r6, y@toc@l ; CHECK-NEXT: lwz r6, y@toc@l(r6) ; CHECK-NEXT: mtctr r7 +; CHECK-NEXT: addi r4, r3, -8 ; CHECK-NEXT: lwzu r7, 12(r5) ; CHECK-NEXT: maddld r6, r7, r7, r6 ; CHECK-NEXT: lwz r7, 4(r5) -; CHECK-NEXT: addi r4, r3, -8 ; CHECK-NEXT: stwu r6, 12(r4) ; CHECK-NEXT: maddld r6, r7, r7, r6 ; CHECK-NEXT: lwz r7, 8(r5) @@ -29,12 +29,12 @@ ; CHECK-NEXT: # ; CHECK-NEXT: maddld r7, r7, r7, r6 ; CHECK-NEXT: lwzu r8, 12(r5) -; CHECK-NEXT: maddld r8, r8, r8, r7 ; CHECK-NEXT: stw r6, 4(r4) ; CHECK-NEXT: lwz r6, 4(r5) -; CHECK-NEXT: maddld r6, r6, r6, r8 +; CHECK-NEXT: maddld r8, r8, r8, r7 ; CHECK-NEXT: stw r7, 8(r4) ; CHECK-NEXT: lwz r7, 8(r5) +; CHECK-NEXT: maddld r6, r6, r6, r8 ; CHECK-NEXT: stwu r8, 12(r4) ; CHECK-NEXT: bdnz .LBB0_1 ; CHECK-NEXT: # %bb.2: diff --git a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll --- a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll @@ -12,8 +12,8 @@ ; P9LE-LABEL: fold_srem_vec_1: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: lis r4, -21386 +; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: ori r4, r4, 37253 ; P9LE-NEXT: extsh r3, r3 ; P9LE-NEXT: mulhw r4, r3, r4 @@ -26,9 +26,9 @@ ; P9LE-NEXT: lis r4, 31710 ; P9LE-NEXT: mtvsrd v3, r3 ; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: ori r4, r4, 63421 ; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: ori r4, r4, 63421 ; P9LE-NEXT: mulhw r4, r3, r4 ; P9LE-NEXT: sub r4, r4, r3 ; P9LE-NEXT: srwi r5, r4, 31 @@ -39,21 +39,21 @@ ; P9LE-NEXT: lis r4, 21399 ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: ori r4, r4, 33437 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: ori r4, r4, 33437 ; P9LE-NEXT: mulhw r4, r3, r4 ; P9LE-NEXT: srwi r5, r4, 31 ; P9LE-NEXT: srawi r4, r4, 5 ; P9LE-NEXT: add r4, r4, r5 ; P9LE-NEXT: mulli r4, r4, 98 ; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: vmrghh v3, v4, v3 +; P9LE-NEXT: lis r4, -16728 ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: lis r4, -16728 ; P9LE-NEXT: ori r4, r4, 63249 +; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: extsh r3, r3 ; P9LE-NEXT: mulhw r4, r3, r4 ; P9LE-NEXT: srwi r5, r4, 31 @@ -69,8 +69,8 @@ ; P9BE-LABEL: fold_srem_vec_1: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: lis r4, 31710 +; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: ori r4, r4, 63421 ; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: mulhw r4, r3, r4 @@ -82,11 +82,11 @@ ; P9BE-NEXT: sub r3, r3, r4 ; P9BE-NEXT: lis r4, -21386 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r4, r4, 37253 ; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: ori r4, r4, 37253 ; P9BE-NEXT: mulhw r4, r3, r4 ; P9BE-NEXT: add r4, r4, r3 ; P9BE-NEXT: srwi r5, r4, 31 @@ -96,11 +96,12 @@ ; P9BE-NEXT: sub r3, r3, r4 ; P9BE-NEXT: lis r4, -16728 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r4, r4, 63249 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: ori r4, r4, 63249 ; P9BE-NEXT: mulhw r4, r3, r4 ; P9BE-NEXT: srwi r5, r4, 31 ; P9BE-NEXT: srawi r4, r4, 8 @@ -109,12 +110,11 @@ ; P9BE-NEXT: sub r3, r3, r4 ; P9BE-NEXT: lis r4, 21399 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: ori r4, r4, 33437 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: ori r4, r4, 33437 ; P9BE-NEXT: mulhw r4, r3, r4 ; P9BE-NEXT: srwi r5, r4, 31 ; P9BE-NEXT: srawi r4, r4, 5 @@ -247,8 +247,8 @@ ; P9LE-LABEL: fold_srem_vec_2: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: lis r4, -21386 +; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: ori r4, r4, 37253 ; P9LE-NEXT: extsh r3, r3 ; P9LE-NEXT: mulhw r5, r3, r4 @@ -272,6 +272,7 @@ ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: extsh r3, r3 ; P9LE-NEXT: mulhw r5, r3, r4 ; P9LE-NEXT: add r5, r5, r3 @@ -280,7 +281,6 @@ ; P9LE-NEXT: add r5, r5, r6 ; P9LE-NEXT: mulli r5, r5, 95 ; P9LE-NEXT: sub r3, r3, r5 -; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 @@ -300,8 +300,8 @@ ; P9BE-LABEL: fold_srem_vec_2: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: lis r4, -21386 +; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: ori r4, r4, 37253 ; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: mulhw r5, r3, r4 @@ -327,6 +327,7 @@ ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: mulhw r5, r3, r4 ; P9BE-NEXT: add r5, r5, r3 @@ -336,7 +337,6 @@ ; P9BE-NEXT: mulli r5, r5, 95 ; P9BE-NEXT: sub r3, r3, r5 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuhlx r3, r3, v2 @@ -468,8 +468,8 @@ ; P9LE-LABEL: combine_srem_sdiv: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: lis r4, -21386 +; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: ori r4, r4, 37253 ; P9LE-NEXT: extsh r3, r3 ; P9LE-NEXT: mulhw r5, r3, r4 @@ -493,6 +493,7 @@ ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: extsh r7, r3 ; P9LE-NEXT: mulhw r8, r7, r4 ; P9LE-NEXT: add r7, r8, r7 @@ -501,7 +502,6 @@ ; P9LE-NEXT: add r7, r7, r8 ; P9LE-NEXT: mulli r8, r7, 95 ; P9LE-NEXT: sub r3, r3, r8 -; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 @@ -512,6 +512,7 @@ ; P9LE-NEXT: srawi r4, r4, 6 ; P9LE-NEXT: add r4, r4, r8 ; P9LE-NEXT: mulli r8, r4, 95 +; P9LE-NEXT: mtvsrd v5, r4 ; P9LE-NEXT: sub r3, r3, r8 ; P9LE-NEXT: mtvsrd v2, r3 ; P9LE-NEXT: vmrghh v2, v2, v4 @@ -520,7 +521,6 @@ ; P9LE-NEXT: mtvsrd v3, r5 ; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: mtvsrd v4, r7 -; P9LE-NEXT: mtvsrd v5, r4 ; P9LE-NEXT: vmrghh v4, v5, v4 ; P9LE-NEXT: vmrglw v3, v4, v3 ; P9LE-NEXT: vadduhm v2, v2, v3 @@ -529,8 +529,8 @@ ; P9BE-LABEL: combine_srem_sdiv: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: lis r5, -21386 +; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: ori r5, r5, 37253 ; P9BE-NEXT: extsh r4, r3 ; P9BE-NEXT: mulhw r6, r4, r5 @@ -556,6 +556,7 @@ ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: extsh r7, r3 ; P9BE-NEXT: mulhw r8, r7, r5 ; P9BE-NEXT: add r7, r8, r7 @@ -565,7 +566,6 @@ ; P9BE-NEXT: mulli r8, r7, 95 ; P9BE-NEXT: sub r3, r3, r8 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuhlx r3, r3, v2 @@ -747,9 +747,10 @@ ; P9LE-NEXT: lis r4, -21386 ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: ori r4, r4, 37253 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: ori r4, r4, 37253 ; P9LE-NEXT: mulhw r4, r3, r4 ; P9LE-NEXT: add r4, r4, r3 ; P9LE-NEXT: srwi r5, r4, 31 @@ -757,7 +758,6 @@ ; P9LE-NEXT: add r4, r4, r5 ; P9LE-NEXT: mulli r4, r4, 95 ; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 @@ -791,11 +791,12 @@ ; P9BE-NEXT: sub r3, r3, r4 ; P9BE-NEXT: lis r4, -21386 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r4, r4, 37253 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: ori r4, r4, 37253 ; P9BE-NEXT: mulhw r4, r3, r4 ; P9BE-NEXT: add r4, r4, r3 ; P9BE-NEXT: srwi r5, r4, 31 @@ -804,7 +805,6 @@ ; P9BE-NEXT: mulli r4, r4, 95 ; P9BE-NEXT: sub r3, r3, r4 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 @@ -914,8 +914,8 @@ ; P9LE-LABEL: dont_fold_srem_one: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 2 -; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: lis r4, -14230 +; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: ori r4, r4, 30865 ; P9LE-NEXT: extsh r3, r3 ; P9LE-NEXT: mulhw r4, r3, r4 @@ -928,11 +928,12 @@ ; P9LE-NEXT: lis r4, -19946 ; P9LE-NEXT: mtvsrd v3, r3 ; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: ori r4, r4, 17097 ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: vmrghh v3, v3, v4 ; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: ori r4, r4, 17097 ; P9LE-NEXT: mulhw r4, r3, r4 ; P9LE-NEXT: add r4, r4, r3 ; P9LE-NEXT: srwi r5, r4, 31 @@ -940,12 +941,11 @@ ; P9LE-NEXT: add r4, r4, r5 ; P9LE-NEXT: mulli r4, r4, 23 ; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: vmrghh v3, v3, v4 +; P9LE-NEXT: lis r4, 24749 ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: lis r4, 24749 ; P9LE-NEXT: ori r4, r4, 47143 +; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: extsh r3, r3 ; P9LE-NEXT: mulhw r4, r3, r4 ; P9LE-NEXT: srwi r5, r4, 31 @@ -961,8 +961,8 @@ ; P9BE-LABEL: dont_fold_srem_one: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: lis r4, -19946 +; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: ori r4, r4, 17097 ; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: mulhw r4, r3, r4 @@ -974,11 +974,11 @@ ; P9BE-NEXT: sub r3, r3, r4 ; P9BE-NEXT: lis r4, 24749 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r4, r4, 47143 ; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: ori r4, r4, 47143 ; P9BE-NEXT: mulhw r4, r3, r4 ; P9BE-NEXT: srwi r5, r4, 31 ; P9BE-NEXT: srawi r4, r4, 11 @@ -987,11 +987,12 @@ ; P9BE-NEXT: sub r3, r3, r4 ; P9BE-NEXT: lis r4, -14230 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r4, r4, 30865 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: vmrghh v3, v3, v4 ; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: ori r4, r4, 30865 ; P9BE-NEXT: mulhw r4, r3, r4 ; P9BE-NEXT: add r4, r4, r3 ; P9BE-NEXT: srwi r5, r4, 31 @@ -1003,7 +1004,6 @@ ; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v3, v4 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: vmrghh v2, v4, v2 ; P9BE-NEXT: vmrghw v2, v2, v3 @@ -1112,8 +1112,8 @@ ; P9LE-LABEL: dont_fold_urem_i16_smax: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: lis r4, -19946 +; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: ori r4, r4, 17097 ; P9LE-NEXT: extsh r3, r3 ; P9LE-NEXT: mulhw r4, r3, r4 @@ -1126,9 +1126,9 @@ ; P9LE-NEXT: lis r4, 24749 ; P9LE-NEXT: mtvsrd v3, r3 ; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: ori r4, r4, 47143 ; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: extsh r3, r3 -; P9LE-NEXT: ori r4, r4, 47143 ; P9LE-NEXT: mulhw r4, r3, r4 ; P9LE-NEXT: srwi r5, r4, 31 ; P9LE-NEXT: srawi r4, r4, 11 @@ -1138,6 +1138,7 @@ ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: extsh r3, r3 ; P9LE-NEXT: srawi r4, r3, 15 ; P9LE-NEXT: addze r4, r4 @@ -1145,7 +1146,6 @@ ; P9LE-NEXT: sub r3, r3, r4 ; P9LE-NEXT: mtvsrd v2, r3 ; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: vmrghh v2, v2, v4 ; P9LE-NEXT: vmrglw v2, v3, v2 @@ -1154,8 +1154,8 @@ ; P9BE-LABEL: dont_fold_urem_i16_smax: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: lis r4, -19946 +; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: ori r4, r4, 17097 ; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: mulhw r4, r3, r4 @@ -1167,11 +1167,11 @@ ; P9BE-NEXT: sub r3, r3, r4 ; P9BE-NEXT: lis r4, 24749 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r4, r4, 47143 ; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: ori r4, r4, 47143 ; P9BE-NEXT: mulhw r4, r3, r4 ; P9BE-NEXT: srwi r5, r4, 31 ; P9BE-NEXT: srawi r4, r4, 11 @@ -1182,6 +1182,7 @@ ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: vmrghh v3, v3, v4 ; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: srawi r4, r3, 15 ; P9BE-NEXT: addze r4, r4 @@ -1191,7 +1192,6 @@ ; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v3, v4 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: vmrghh v2, v4, v2 ; P9BE-NEXT: vmrghw v2, v2, v3 @@ -1290,10 +1290,10 @@ ; P9LE-LABEL: dont_fold_srem_i64: ; P9LE: # %bb.0: ; P9LE-NEXT: lis r4, 24749 +; P9LE-NEXT: mfvsrd r3, v3 ; P9LE-NEXT: ori r4, r4, 47142 ; P9LE-NEXT: sldi r4, r4, 32 ; P9LE-NEXT: oris r4, r4, 58853 -; P9LE-NEXT: mfvsrd r3, v3 ; P9LE-NEXT: ori r4, r4, 6055 ; P9LE-NEXT: mulhd r4, r3, r4 ; P9LE-NEXT: rldicl r5, r4, 1, 63 @@ -1316,10 +1316,10 @@ ; P9LE-NEXT: sub r4, r4, r5 ; P9LE-NEXT: mtvsrdd v3, r3, r4 ; P9LE-NEXT: lis r4, 25653 +; P9LE-NEXT: mfvsrd r3, v2 ; P9LE-NEXT: ori r4, r4, 15432 ; P9LE-NEXT: sldi r4, r4, 32 ; P9LE-NEXT: oris r4, r4, 1603 -; P9LE-NEXT: mfvsrd r3, v2 ; P9LE-NEXT: ori r4, r4, 21445 ; P9LE-NEXT: mulhd r4, r3, r4 ; P9LE-NEXT: rldicl r5, r4, 1, 63 @@ -1334,10 +1334,10 @@ ; P9BE-LABEL: dont_fold_srem_i64: ; P9BE: # %bb.0: ; P9BE-NEXT: lis r4, 24749 +; P9BE-NEXT: mfvsrld r3, v3 ; P9BE-NEXT: ori r4, r4, 47142 ; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: oris r4, r4, 58853 -; P9BE-NEXT: mfvsrld r3, v3 ; P9BE-NEXT: ori r4, r4, 6055 ; P9BE-NEXT: mulhd r4, r3, r4 ; P9BE-NEXT: rldicl r5, r4, 1, 63 @@ -1360,10 +1360,10 @@ ; P9BE-NEXT: sub r4, r4, r5 ; P9BE-NEXT: mtvsrdd v3, r4, r3 ; P9BE-NEXT: lis r4, 25653 +; P9BE-NEXT: mfvsrld r3, v2 ; P9BE-NEXT: ori r4, r4, 15432 ; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: oris r4, r4, 1603 -; P9BE-NEXT: mfvsrld r3, v2 ; P9BE-NEXT: ori r4, r4, 21445 ; P9BE-NEXT: mulhd r4, r3, r4 ; P9BE-NEXT: rldicl r5, r4, 1, 63 diff --git a/llvm/test/CodeGen/PowerPC/stack-clash-dynamic-alloca.ll b/llvm/test/CodeGen/PowerPC/stack-clash-dynamic-alloca.ll --- a/llvm/test/CodeGen/PowerPC/stack-clash-dynamic-alloca.ll +++ b/llvm/test/CodeGen/PowerPC/stack-clash-dynamic-alloca.ll @@ -51,15 +51,15 @@ ; CHECK-P9-LE-NEXT: stdu r1, -48(r1) ; CHECK-P9-LE-NEXT: rldic r3, r3, 2, 30 ; CHECK-P9-LE-NEXT: addi r3, r3, 15 +; CHECK-P9-LE-NEXT: li r6, -32768 +; CHECK-P9-LE-NEXT: mr r31, r1 +; CHECK-P9-LE-NEXT: addi r4, r31, 48 ; CHECK-P9-LE-NEXT: rldicl r3, r3, 60, 4 ; CHECK-P9-LE-NEXT: rldicl r3, r3, 4, 29 ; CHECK-P9-LE-NEXT: neg r5, r3 -; CHECK-P9-LE-NEXT: li r6, -32768 ; CHECK-P9-LE-NEXT: divd r7, r5, r6 -; CHECK-P9-LE-NEXT: mulld r6, r7, r6 -; CHECK-P9-LE-NEXT: mr r31, r1 -; CHECK-P9-LE-NEXT: addi r4, r31, 48 ; CHECK-P9-LE-NEXT: add r3, r1, r5 +; CHECK-P9-LE-NEXT: mulld r6, r7, r6 ; CHECK-P9-LE-NEXT: sub r5, r5, r6 ; CHECK-P9-LE-NEXT: stdux r4, r1, r5 ; CHECK-P9-LE-NEXT: cmpd r1, r3 @@ -69,8 +69,8 @@ ; CHECK-P9-LE-NEXT: cmpd r1, r3 ; CHECK-P9-LE-NEXT: bne cr0, .LBB0_1 ; CHECK-P9-LE-NEXT: .LBB0_2: -; CHECK-P9-LE-NEXT: addi r3, r1, 32 ; CHECK-P9-LE-NEXT: li r4, 1 +; CHECK-P9-LE-NEXT: addi r3, r1, 32 ; CHECK-P9-LE-NEXT: stw r4, 4792(r3) ; CHECK-P9-LE-NEXT: lwz r3, 0(r3) ; CHECK-P9-LE-NEXT: ld r1, 0(r1) @@ -190,15 +190,15 @@ ; CHECK-P9-LE-NEXT: stdu r1, -48(r1) ; CHECK-P9-LE-NEXT: rldic r4, r3, 2, 30 ; CHECK-P9-LE-NEXT: addi r4, r4, 15 +; CHECK-P9-LE-NEXT: li r7, -4096 +; CHECK-P9-LE-NEXT: mr r31, r1 +; CHECK-P9-LE-NEXT: addi r5, r31, 48 ; CHECK-P9-LE-NEXT: rldicl r4, r4, 60, 4 ; CHECK-P9-LE-NEXT: rldicl r4, r4, 4, 29 ; CHECK-P9-LE-NEXT: neg r6, r4 -; CHECK-P9-LE-NEXT: li r7, -4096 ; CHECK-P9-LE-NEXT: divd r8, r6, r7 -; CHECK-P9-LE-NEXT: mulld r7, r8, r7 -; CHECK-P9-LE-NEXT: mr r31, r1 -; CHECK-P9-LE-NEXT: addi r5, r31, 48 ; CHECK-P9-LE-NEXT: add r4, r1, r6 +; CHECK-P9-LE-NEXT: mulld r7, r8, r7 ; CHECK-P9-LE-NEXT: sub r6, r6, r7 ; CHECK-P9-LE-NEXT: stdux r5, r1, r6 ; CHECK-P9-LE-NEXT: cmpd r1, r4 @@ -208,10 +208,10 @@ ; CHECK-P9-LE-NEXT: cmpd r1, r4 ; CHECK-P9-LE-NEXT: bne cr0, .LBB1_1 ; CHECK-P9-LE-NEXT: .LBB1_2: -; CHECK-P9-LE-NEXT: addi r4, r1, 32 ; CHECK-P9-LE-NEXT: extswsli r3, r3, 2 -; CHECK-P9-LE-NEXT: add r3, r4, r3 ; CHECK-P9-LE-NEXT: li r5, 1 +; CHECK-P9-LE-NEXT: addi r4, r1, 32 +; CHECK-P9-LE-NEXT: add r3, r4, r3 ; CHECK-P9-LE-NEXT: stw r5, 4096(r3) ; CHECK-P9-LE-NEXT: lwz r3, 0(r4) ; CHECK-P9-LE-NEXT: ld r1, 0(r1) @@ -334,16 +334,16 @@ ; CHECK-P9-LE-NEXT: stdu r1, -48(r1) ; CHECK-P9-LE-NEXT: rldic r3, r3, 2, 30 ; CHECK-P9-LE-NEXT: addi r3, r3, 15 -; CHECK-P9-LE-NEXT: rldicl r3, r3, 60, 4 -; CHECK-P9-LE-NEXT: rldicl r3, r3, 4, 29 ; CHECK-P9-LE-NEXT: lis r5, -1 ; CHECK-P9-LE-NEXT: ori r5, r5, 0 -; CHECK-P9-LE-NEXT: neg r6, r3 -; CHECK-P9-LE-NEXT: divd r7, r6, r5 -; CHECK-P9-LE-NEXT: mulld r7, r7, r5 ; CHECK-P9-LE-NEXT: mr r31, r1 ; CHECK-P9-LE-NEXT: addi r4, r31, 48 +; CHECK-P9-LE-NEXT: rldicl r3, r3, 60, 4 +; CHECK-P9-LE-NEXT: rldicl r3, r3, 4, 29 +; CHECK-P9-LE-NEXT: neg r6, r3 +; CHECK-P9-LE-NEXT: divd r7, r6, r5 ; CHECK-P9-LE-NEXT: add r3, r1, r6 +; CHECK-P9-LE-NEXT: mulld r7, r7, r5 ; CHECK-P9-LE-NEXT: sub r6, r6, r7 ; CHECK-P9-LE-NEXT: stdux r4, r1, r6 ; CHECK-P9-LE-NEXT: cmpd r1, r3 @@ -353,8 +353,8 @@ ; CHECK-P9-LE-NEXT: cmpd r1, r3 ; CHECK-P9-LE-NEXT: bne cr0, .LBB2_1 ; CHECK-P9-LE-NEXT: .LBB2_2: -; CHECK-P9-LE-NEXT: addi r3, r1, 32 ; CHECK-P9-LE-NEXT: li r4, 1 +; CHECK-P9-LE-NEXT: addi r3, r1, 32 ; CHECK-P9-LE-NEXT: stw r4, 4792(r3) ; CHECK-P9-LE-NEXT: lwz r3, 0(r3) ; CHECK-P9-LE-NEXT: ld r1, 0(r1) diff --git a/llvm/test/CodeGen/PowerPC/topdepthreduce-postra.mir b/llvm/test/CodeGen/PowerPC/topdepthreduce-postra.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/topdepthreduce-postra.mir @@ -0,0 +1,18 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -run-pass=postmisched -o - %s | FileCheck %s +--- +# Check that postmisched's TopDepthReduce heuristic moves the MULLD later +# because of the dependency on x5 +name: test +body: | + bb.0: + ; CHECK-LABEL: name: test + ; CHECK: renamable $x5 = LD 0, killed renamable $x5 :: (load 8) + ; CHECK: renamable $x4 = LD 0, killed renamable $x4 :: (load 8) + ; CHECK: renamable $x5 = MULLD killed renamable $x5, renamable $x3 + ; CHECK: renamable $x3 = MADDLD8 killed renamable $x4, killed renamable $x3, killed renamable $x5 + renamable $x5 = LD 0, killed renamable $x5 :: (load 8) + renamable $x5 = MULLD killed renamable $x5, renamable $x3 + renamable $x4 = LD 0, killed renamable $x4 :: (load 8) + renamable $x3 = MADDLD8 killed renamable $x4, killed renamable $x3, killed renamable $x5 +... diff --git a/llvm/test/CodeGen/PowerPC/uint-to-fp-v4i32.ll b/llvm/test/CodeGen/PowerPC/uint-to-fp-v4i32.ll --- a/llvm/test/CodeGen/PowerPC/uint-to-fp-v4i32.ll +++ b/llvm/test/CodeGen/PowerPC/uint-to-fp-v4i32.ll @@ -20,9 +20,9 @@ ; P9BE-NEXT: mtfprwz f0, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: xscvuxddp f0, f0 ; P9BE-NEXT: clrlwi r3, r3, 16 ; P9BE-NEXT: mtfprwz f1, r3 -; P9BE-NEXT: xscvuxddp f0, f0 ; P9BE-NEXT: xscvuxddp f1, f1 ; P9BE-NEXT: xxmrghd v2, vs0, vs1 ; P9BE-NEXT: blr @@ -35,9 +35,9 @@ ; P9LE-NEXT: mtfprwz f0, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: xscvuxddp f0, f0 ; P9LE-NEXT: clrlwi r3, r3, 16 ; P9LE-NEXT: mtfprwz f1, r3 -; P9LE-NEXT: xscvuxddp f0, f0 ; P9LE-NEXT: xscvuxddp f1, f1 ; P9LE-NEXT: xxmrghd v2, vs1, vs0 ; P9LE-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/unaligned-addressing-mode.ll b/llvm/test/CodeGen/PowerPC/unaligned-addressing-mode.ll --- a/llvm/test/CodeGen/PowerPC/unaligned-addressing-mode.ll +++ b/llvm/test/CodeGen/PowerPC/unaligned-addressing-mode.ll @@ -6,8 +6,8 @@ ; CHECK-LABEL: test_xaddr: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li r4, 0 -; CHECK-NEXT: ori r4, r4, 40000 ; CHECK-NEXT: std r3, -8(r1) +; CHECK-NEXT: ori r4, r4, 40000 ; CHECK-NEXT: lbzx r3, r3, r4 ; CHECK-NEXT: blr entry: @@ -56,8 +56,8 @@ ; CHECK-LABEL: test_xoaddr: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addi r3, r3, 8 -; CHECK-NEXT: lxvx vs0, 0, r3 ; CHECK-NEXT: addi r4, r4, 4 +; CHECK-NEXT: lxvx vs0, 0, r3 ; CHECK-NEXT: stxvx vs0, 0, r4 ; CHECK-NEXT: blr entry: @@ -77,9 +77,9 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addi r4, r3, -8 ; CHECK-NEXT: li r3, 8 +; CHECK-NEXT: li r5, 3 ; CHECK-NEXT: mtctr r3 ; CHECK-NEXT: li r3, 0 -; CHECK-NEXT: li r5, 3 ; loop instruction number is changed from 5 to 4, so its align is changed from 5 to 4. ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB4_1: # %for.body diff --git a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll --- a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll @@ -12,9 +12,11 @@ ; P9LE-LABEL: fold_urem_vec_1: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: lis r4, 21399 +; P9LE-NEXT: lis r5, 8456 +; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: ori r4, r4, 33437 +; P9LE-NEXT: ori r5, r5, 16913 ; P9LE-NEXT: clrlwi r3, r3, 16 ; P9LE-NEXT: mulhwu r4, r3, r4 ; P9LE-NEXT: srwi r4, r4, 5 @@ -23,9 +25,9 @@ ; P9LE-NEXT: lis r4, 16727 ; P9LE-NEXT: mtvsrd v3, r3 ; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: ori r4, r4, 2287 ; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: ori r4, r4, 2287 ; P9LE-NEXT: mulhwu r4, r3, r4 ; P9LE-NEXT: srwi r4, r4, 8 ; P9LE-NEXT: mulli r4, r4, 1003 @@ -33,8 +35,6 @@ ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: lis r5, 8456 -; P9LE-NEXT: ori r5, r5, 16913 ; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: clrlwi r4, r3, 16 ; P9LE-NEXT: rlwinm r3, r3, 30, 18, 31 @@ -45,9 +45,9 @@ ; P9LE-NEXT: lis r4, 22765 ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: ori r4, r4, 8969 ; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: ori r4, r4, 8969 ; P9LE-NEXT: mulhwu r4, r3, r4 ; P9LE-NEXT: sub r5, r3, r4 ; P9LE-NEXT: srwi r5, r5, 1 @@ -63,9 +63,11 @@ ; P9BE-LABEL: fold_urem_vec_1: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: lis r4, 16727 +; P9BE-NEXT: lis r5, 8456 +; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: ori r4, r4, 2287 +; P9BE-NEXT: ori r5, r5, 16913 ; P9BE-NEXT: clrlwi r3, r3, 16 ; P9BE-NEXT: mulhwu r4, r3, r4 ; P9BE-NEXT: srwi r4, r4, 8 @@ -73,11 +75,11 @@ ; P9BE-NEXT: sub r3, r3, r4 ; P9BE-NEXT: lis r4, 21399 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r4, r4, 33437 ; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: ori r4, r4, 33437 ; P9BE-NEXT: mulhwu r4, r3, r4 ; P9BE-NEXT: srwi r4, r4, 5 ; P9BE-NEXT: mulli r4, r4, 98 @@ -86,8 +88,6 @@ ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: lis r5, 8456 -; P9BE-NEXT: ori r5, r5, 16913 ; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: clrlwi r4, r3, 16 ; P9BE-NEXT: rlwinm r3, r3, 30, 18, 31 @@ -97,11 +97,11 @@ ; P9BE-NEXT: sub r3, r4, r3 ; P9BE-NEXT: lis r4, 22765 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r4, r4, 8969 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: ori r4, r4, 8969 ; P9BE-NEXT: mulhwu r4, r3, r4 ; P9BE-NEXT: sub r5, r3, r4 ; P9BE-NEXT: srwi r5, r5, 1 @@ -223,8 +223,8 @@ ; P9LE-LABEL: fold_urem_vec_2: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: lis r4, 22765 +; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: ori r4, r4, 8969 ; P9LE-NEXT: clrlwi r3, r3, 16 ; P9LE-NEXT: mulhwu r5, r3, r4 @@ -248,6 +248,7 @@ ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: clrlwi r3, r3, 16 ; P9LE-NEXT: mulhwu r5, r3, r4 ; P9LE-NEXT: sub r6, r3, r5 @@ -256,7 +257,6 @@ ; P9LE-NEXT: srwi r5, r5, 6 ; P9LE-NEXT: mulli r5, r5, 95 ; P9LE-NEXT: sub r3, r3, r5 -; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 @@ -276,8 +276,8 @@ ; P9BE-LABEL: fold_urem_vec_2: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: lis r4, 22765 +; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: ori r4, r4, 8969 ; P9BE-NEXT: clrlwi r3, r3, 16 ; P9BE-NEXT: mulhwu r5, r3, r4 @@ -303,6 +303,7 @@ ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: clrlwi r3, r3, 16 ; P9BE-NEXT: mulhwu r5, r3, r4 ; P9BE-NEXT: sub r6, r3, r5 @@ -312,7 +313,6 @@ ; P9BE-NEXT: mulli r5, r5, 95 ; P9BE-NEXT: sub r3, r3, r5 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuhlx r3, r3, v2 @@ -444,8 +444,8 @@ ; P9LE-LABEL: combine_urem_udiv: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: lis r4, 22765 +; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: ori r4, r4, 8969 ; P9LE-NEXT: clrlwi r3, r3, 16 ; P9LE-NEXT: mulhwu r5, r3, r4 @@ -469,6 +469,7 @@ ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: clrlwi r7, r3, 16 ; P9LE-NEXT: mulhwu r8, r7, r4 ; P9LE-NEXT: sub r7, r7, r8 @@ -477,7 +478,6 @@ ; P9LE-NEXT: srwi r7, r7, 6 ; P9LE-NEXT: mulli r8, r7, 95 ; P9LE-NEXT: sub r3, r3, r8 -; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 @@ -488,6 +488,7 @@ ; P9LE-NEXT: add r4, r8, r4 ; P9LE-NEXT: srwi r4, r4, 6 ; P9LE-NEXT: mulli r8, r4, 95 +; P9LE-NEXT: mtvsrd v5, r4 ; P9LE-NEXT: sub r3, r3, r8 ; P9LE-NEXT: mtvsrd v2, r3 ; P9LE-NEXT: vmrghh v2, v2, v4 @@ -496,7 +497,6 @@ ; P9LE-NEXT: mtvsrd v3, r5 ; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: mtvsrd v4, r7 -; P9LE-NEXT: mtvsrd v5, r4 ; P9LE-NEXT: vmrghh v4, v5, v4 ; P9LE-NEXT: vmrglw v3, v4, v3 ; P9LE-NEXT: vadduhm v2, v2, v3 @@ -505,8 +505,8 @@ ; P9BE-LABEL: combine_urem_udiv: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: lis r5, 22765 +; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: ori r5, r5, 8969 ; P9BE-NEXT: clrlwi r4, r3, 16 ; P9BE-NEXT: mulhwu r6, r4, r5 @@ -532,6 +532,7 @@ ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: clrlwi r7, r3, 16 ; P9BE-NEXT: mulhwu r8, r7, r5 ; P9BE-NEXT: sub r7, r7, r8 @@ -541,7 +542,6 @@ ; P9BE-NEXT: mulli r8, r7, 95 ; P9BE-NEXT: sub r3, r3, r8 ; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuhlx r3, r3, v2 @@ -708,7 +708,9 @@ ; P9LE-LABEL: dont_fold_urem_power_of_two: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: lis r4, 22765 ; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: ori r4, r4, 8969 ; P9LE-NEXT: clrlwi r3, r3, 26 ; P9LE-NEXT: mtvsrd v3, r3 ; P9LE-NEXT: li r3, 2 @@ -717,8 +719,6 @@ ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: lis r4, 22765 -; P9LE-NEXT: ori r4, r4, 8969 ; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: clrlwi r3, r3, 16 ; P9LE-NEXT: mulhwu r4, r3, r4 @@ -740,7 +740,9 @@ ; P9BE-LABEL: dont_fold_urem_power_of_two: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: lis r4, 22765 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: ori r4, r4, 8969 ; P9BE-NEXT: clrlwi r3, r3, 27 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v3, r3 @@ -751,8 +753,6 @@ ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: lis r4, 22765 -; P9BE-NEXT: ori r4, r4, 8969 ; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: clrlwi r3, r3, 16 ; P9BE-NEXT: mulhwu r4, r3, r4 @@ -844,9 +844,11 @@ ; P9LE-LABEL: dont_fold_urem_one: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: lis r4, -19946 +; P9LE-NEXT: lis r5, -14230 +; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: ori r4, r4, 17097 +; P9LE-NEXT: ori r5, r5, 30865 ; P9LE-NEXT: clrlwi r3, r3, 16 ; P9LE-NEXT: mulhwu r4, r3, r4 ; P9LE-NEXT: srwi r4, r4, 4 @@ -855,9 +857,9 @@ ; P9LE-NEXT: lis r4, 24749 ; P9LE-NEXT: mtvsrd v3, r3 ; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: ori r4, r4, 47143 ; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: clrlwi r3, r3, 16 -; P9LE-NEXT: ori r4, r4, 47143 ; P9LE-NEXT: mulhwu r4, r3, r4 ; P9LE-NEXT: srwi r4, r4, 11 ; P9LE-NEXT: mulli r4, r4, 5423 @@ -865,8 +867,6 @@ ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: lis r5, -14230 -; P9LE-NEXT: ori r5, r5, 30865 ; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: clrlwi r4, r3, 16 ; P9LE-NEXT: rlwinm r3, r3, 31, 17, 31 @@ -884,9 +884,11 @@ ; P9BE-LABEL: dont_fold_urem_one: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: lis r4, 24749 +; P9BE-NEXT: lis r5, -14230 +; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: ori r4, r4, 47143 +; P9BE-NEXT: ori r5, r5, 30865 ; P9BE-NEXT: clrlwi r3, r3, 16 ; P9BE-NEXT: mulhwu r4, r3, r4 ; P9BE-NEXT: srwi r4, r4, 11 @@ -894,11 +896,11 @@ ; P9BE-NEXT: sub r3, r3, r4 ; P9BE-NEXT: lis r4, -19946 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: ori r4, r4, 17097 ; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: ori r4, r4, 17097 ; P9BE-NEXT: mulhwu r4, r3, r4 ; P9BE-NEXT: srwi r4, r4, 4 ; P9BE-NEXT: mulli r4, r4, 23 @@ -907,8 +909,6 @@ ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: lis r5, -14230 -; P9BE-NEXT: ori r5, r5, 30865 ; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: clrlwi r4, r3, 16 ; P9BE-NEXT: rlwinm r3, r3, 31, 17, 31 @@ -1023,10 +1023,10 @@ ; P9LE-LABEL: dont_fold_urem_i64: ; P9LE: # %bb.0: ; P9LE-NEXT: lis r4, 25644 +; P9LE-NEXT: mfvsrld r3, v3 ; P9LE-NEXT: ori r4, r4, 34192 ; P9LE-NEXT: sldi r4, r4, 32 ; P9LE-NEXT: oris r4, r4, 45590 -; P9LE-NEXT: mfvsrld r3, v3 ; P9LE-NEXT: ori r4, r4, 17097 ; P9LE-NEXT: mulhdu r4, r3, r4 ; P9LE-NEXT: sub r5, r3, r4 @@ -1047,9 +1047,9 @@ ; P9LE-NEXT: sub r4, r4, r5 ; P9LE-NEXT: lis r5, 25653 ; P9LE-NEXT: ori r5, r5, 15432 -; P9LE-NEXT: sldi r5, r5, 32 ; P9LE-NEXT: mtvsrdd v3, r4, r3 ; P9LE-NEXT: mfvsrd r3, v2 +; P9LE-NEXT: sldi r5, r5, 32 ; P9LE-NEXT: rldicl r4, r3, 63, 1 ; P9LE-NEXT: oris r5, r5, 1603 ; P9LE-NEXT: ori r5, r5, 21445 @@ -1064,10 +1064,10 @@ ; P9BE-LABEL: dont_fold_urem_i64: ; P9BE: # %bb.0: ; P9BE-NEXT: lis r4, 25644 +; P9BE-NEXT: mfvsrd r3, v3 ; P9BE-NEXT: ori r4, r4, 34192 ; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: oris r4, r4, 45590 -; P9BE-NEXT: mfvsrd r3, v3 ; P9BE-NEXT: ori r4, r4, 17097 ; P9BE-NEXT: mulhdu r4, r3, r4 ; P9BE-NEXT: sub r5, r3, r4 @@ -1075,8 +1075,8 @@ ; P9BE-NEXT: add r4, r5, r4 ; P9BE-NEXT: lis r5, -16037 ; P9BE-NEXT: rldicl r4, r4, 60, 4 -; P9BE-NEXT: mulli r4, r4, 23 ; P9BE-NEXT: ori r5, r5, 28749 +; P9BE-NEXT: mulli r4, r4, 23 ; P9BE-NEXT: sldi r5, r5, 32 ; P9BE-NEXT: oris r5, r5, 52170 ; P9BE-NEXT: ori r5, r5, 12109 @@ -1088,9 +1088,9 @@ ; P9BE-NEXT: sub r4, r4, r5 ; P9BE-NEXT: lis r5, 25653 ; P9BE-NEXT: ori r5, r5, 15432 -; P9BE-NEXT: sldi r5, r5, 32 ; P9BE-NEXT: mtvsrdd v3, r3, r4 ; P9BE-NEXT: mfvsrld r3, v2 +; P9BE-NEXT: sldi r5, r5, 32 ; P9BE-NEXT: rldicl r4, r3, 63, 1 ; P9BE-NEXT: oris r5, r5, 1603 ; P9BE-NEXT: ori r5, r5, 21445 diff --git a/llvm/test/CodeGen/PowerPC/vavg.ll b/llvm/test/CodeGen/PowerPC/vavg.ll --- a/llvm/test/CodeGen/PowerPC/vavg.ll +++ b/llvm/test/CodeGen/PowerPC/vavg.ll @@ -138,8 +138,8 @@ ; CHECK-P9-LABEL: test_v8i16_sign_negative: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: addis 3, 2, .LCPI6_0@toc@ha -; CHECK-P9-NEXT: addi 3, 3, .LCPI6_0@toc@l ; CHECK-P9-NEXT: vadduhm 2, 2, 3 +; CHECK-P9-NEXT: addi 3, 3, .LCPI6_0@toc@l ; CHECK-P9-NEXT: lxvx 35, 0, 3 ; CHECK-P9-NEXT: vadduhm 2, 2, 3 ; CHECK-P9-NEXT: vspltish 3, 1 diff --git a/llvm/test/CodeGen/PowerPC/vec-bswap.ll b/llvm/test/CodeGen/PowerPC/vec-bswap.ll --- a/llvm/test/CodeGen/PowerPC/vec-bswap.ll +++ b/llvm/test/CodeGen/PowerPC/vec-bswap.ll @@ -3,7 +3,8 @@ define dso_local void @test(i32* %Arr, i32 signext %Len) { ; CHECK-LABEL: test: ; CHECK: lxvx [[REG:vs[0-9]+]], r{{[0-9]+}}, r{{[0-9]+}} -; CHECK-NEXT: xxbrw vs{{[0-9]+}}, [[REG]] +; CHECK-NOT: [[REG]] +; CHECK: xxbrw vs{{[0-9]+}}, [[REG]] entry: %cmp1 = icmp slt i32 0, %Len br i1 %cmp1, label %for.body.lr.ph, label %for.cond.cleanup diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll @@ -34,9 +34,9 @@ ; CHECK-P9-NEXT: xxswapd v2, vs0 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: xxsldwi vs1, v2, v2, 3 +; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: mffprwz r3, f1 ; CHECK-P9-NEXT: mtvsrd v2, r3 ; CHECK-P9-NEXT: mffprwz r3, f0 @@ -219,10 +219,10 @@ ; CHECK-P9-LABEL: test8elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r3) +; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: xxsldwi vs2, vs1, vs1, 3 ; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 -; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: mffprwz r3, f2 ; CHECK-P9-NEXT: xxswapd vs2, vs1 ; CHECK-P9-NEXT: mtvsrd v2, r3 @@ -270,10 +270,10 @@ ; CHECK-BE-LABEL: test8elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3 ; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: mffprwz r3, f2 ; CHECK-BE-NEXT: xxswapd vs2, vs1 ; CHECK-BE-NEXT: sldi r3, r3, 48 @@ -298,14 +298,14 @@ ; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v4, r3 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: vmrghh v3, v3, v4 +; CHECK-BE-NEXT: vmrghw v2, v3, v2 ; CHECK-BE-NEXT: mffprwz r3, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs0 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: vmrghh v3, v3, v4 ; CHECK-BE-NEXT: sldi r3, r3, 48 -; CHECK-BE-NEXT: vmrghw v2, v3, v2 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v3, r3 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: mffprwz r3, f1 ; CHECK-BE-NEXT: xscvspdpn f1, vs0 ; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 @@ -438,19 +438,20 @@ ; CHECK-P9-LABEL: test16elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs2, 0(r4) +; CHECK-P9-NEXT: lxv vs1, 16(r4) +; CHECK-P9-NEXT: lxv vs0, 32(r4) ; CHECK-P9-NEXT: xxsldwi vs3, vs2, vs2, 3 ; CHECK-P9-NEXT: xxswapd vs4, vs2 +; CHECK-P9-NEXT: xscvspdpn f5, vs2 +; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1 +; CHECK-P9-NEXT: xxsldwi vs6, vs1, vs1, 3 ; CHECK-P9-NEXT: xscvspdpn f3, vs3 ; CHECK-P9-NEXT: xscvspdpn f4, vs4 +; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: xscvdpsxws f4, f4 -; CHECK-P9-NEXT: xscvspdpn f5, vs2 -; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1 -; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 ; CHECK-P9-NEXT: mffprwz r5, f3 -; CHECK-P9-NEXT: lxv vs1, 16(r4) -; CHECK-P9-NEXT: xxsldwi vs6, vs1, vs1, 3 ; CHECK-P9-NEXT: xxswapd vs3, vs1 ; CHECK-P9-NEXT: mtvsrd v2, r5 ; CHECK-P9-NEXT: mffprwz r5, f4 @@ -458,6 +459,7 @@ ; CHECK-P9-NEXT: xscvspdpn f3, vs3 ; CHECK-P9-NEXT: mtvsrd v3, r5 ; CHECK-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: mffprwz r5, f4 ; CHECK-P9-NEXT: xscvspdpn f4, vs6 ; CHECK-P9-NEXT: mtvsrd v3, r5 @@ -465,15 +467,13 @@ ; CHECK-P9-NEXT: xscvspdpn f2, vs1 ; CHECK-P9-NEXT: xxsldwi vs1, vs1, vs1, 1 ; CHECK-P9-NEXT: xscvdpsxws f4, f4 -; CHECK-P9-NEXT: xscvdpsxws f3, f3 -; CHECK-P9-NEXT: lxv vs0, 32(r4) ; CHECK-P9-NEXT: mtvsrd v4, r5 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: vmrghh v3, v3, v4 +; CHECK-P9-NEXT: xscvdpsxws f1, f1 ; CHECK-P9-NEXT: vmrglw v2, v3, v2 ; CHECK-P9-NEXT: mffprwz r5, f4 -; CHECK-P9-NEXT: xscvspdpn f1, vs1 -; CHECK-P9-NEXT: xscvdpsxws f1, f1 ; CHECK-P9-NEXT: mtvsrd v4, r5 ; CHECK-P9-NEXT: mffprwz r5, f3 ; CHECK-P9-NEXT: xxsldwi vs3, vs0, vs0, 3 @@ -506,6 +506,7 @@ ; CHECK-P9-NEXT: mtvsrd v4, r4 ; CHECK-P9-NEXT: mffprwz r4, f0 ; CHECK-P9-NEXT: xxsldwi vs0, vs1, vs1, 3 +; CHECK-P9-NEXT: stxv vs2, 0(r3) ; CHECK-P9-NEXT: mtvsrd v2, r4 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: vmrghh v2, v4, v2 @@ -532,31 +533,31 @@ ; CHECK-P9-NEXT: vmrglw v3, v4, v3 ; CHECK-P9-NEXT: xxmrgld vs0, v3, v2 ; CHECK-P9-NEXT: stxv vs0, 16(r3) -; CHECK-P9-NEXT: stxv vs2, 0(r3) ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test16elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r4) +; CHECK-BE-NEXT: lxv vs0, 0(r4) ; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xxswapd vs3, vs1 -; CHECK-BE-NEXT: xscvspdpn f3, vs3 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: xscvdpsxws f3, f3 -; CHECK-BE-NEXT: mffprwz r5, f2 ; CHECK-BE-NEXT: xscvspdpn f4, vs1 ; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 +; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: xscvdpsxws f2, f2 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: mffprwz r5, f2 +; CHECK-BE-NEXT: xxsldwi vs2, vs0, vs0, 3 ; CHECK-BE-NEXT: sldi r5, r5, 48 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: mtvsrd v2, r5 ; CHECK-BE-NEXT: mffprwz r5, f3 ; CHECK-BE-NEXT: xscvdpsxws f3, f4 -; CHECK-BE-NEXT: lxv vs0, 0(r4) -; CHECK-BE-NEXT: xxsldwi vs2, vs0, vs0, 3 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: sldi r5, r5, 48 +; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: mtvsrd v3, r5 ; CHECK-BE-NEXT: vmrghh v2, v3, v2 ; CHECK-BE-NEXT: mffprwz r5, f3 @@ -564,7 +565,6 @@ ; CHECK-BE-NEXT: mtvsrd v3, r5 ; CHECK-BE-NEXT: mffprwz r5, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs0 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: sldi r5, r5, 48 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v4, r5 @@ -591,24 +591,24 @@ ; CHECK-BE-NEXT: lxv vs0, 32(r4) ; CHECK-BE-NEXT: xscvspdpn f5, vs1 ; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: xscvdpsxws f5, f5 -; CHECK-BE-NEXT: sldi r5, r5, 48 ; CHECK-BE-NEXT: xxswapd vs3, vs1 +; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 +; CHECK-BE-NEXT: sldi r5, r5, 48 +; CHECK-BE-NEXT: xscvdpsxws f5, f5 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: mtvsrd v0, r5 -; CHECK-BE-NEXT: vmrghh v5, v5, v0 ; CHECK-BE-NEXT: xscvspdpn f3, vs3 -; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: vmrghh v5, v5, v0 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: vmrghw v3, v5, v4 ; CHECK-BE-NEXT: xscvdpsxws f3, f3 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: vmrghw v3, v5, v4 ; CHECK-BE-NEXT: mffprwz r4, f5 ; CHECK-BE-NEXT: xxmrghd vs4, v3, v2 ; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: mtvsrd v2, r4 ; CHECK-BE-NEXT: mffprwz r4, f2 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: stxv vs4, 0(r3) ; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: mtvsrd v3, r4 @@ -618,18 +618,18 @@ ; CHECK-BE-NEXT: mffprwz r4, f1 ; CHECK-BE-NEXT: xxsldwi vs1, vs0, vs0, 3 ; CHECK-BE-NEXT: sldi r4, r4, 48 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: vmrghh v3, v4, v3 -; CHECK-BE-NEXT: mtvsrd v4, r4 -; CHECK-BE-NEXT: mffprwz r4, f1 -; CHECK-BE-NEXT: xxswapd vs1, vs0 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: mtvsrd v4, r4 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: vmrghh v2, v2, v4 -; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: vmrghw v2, v2, v3 +; CHECK-BE-NEXT: mffprwz r4, f1 +; CHECK-BE-NEXT: xxswapd vs1, vs0 +; CHECK-BE-NEXT: sldi r4, r4, 48 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v3, r4 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: mffprwz r4, f1 ; CHECK-BE-NEXT: xscvspdpn f1, vs0 ; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 @@ -682,9 +682,9 @@ ; CHECK-P9-NEXT: xxswapd v2, vs0 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: xxsldwi vs1, v2, v2, 3 +; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: mffprwz r3, f1 ; CHECK-P9-NEXT: mtvsrd v2, r3 ; CHECK-P9-NEXT: mffprwz r3, f0 @@ -867,10 +867,10 @@ ; CHECK-P9-LABEL: test8elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r3) +; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: xxsldwi vs2, vs1, vs1, 3 ; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 -; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: mffprwz r3, f2 ; CHECK-P9-NEXT: xxswapd vs2, vs1 ; CHECK-P9-NEXT: mtvsrd v2, r3 @@ -918,10 +918,10 @@ ; CHECK-BE-LABEL: test8elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3 ; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: mffprwz r3, f2 ; CHECK-BE-NEXT: xxswapd vs2, vs1 ; CHECK-BE-NEXT: sldi r3, r3, 48 @@ -946,14 +946,14 @@ ; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v4, r3 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: vmrghh v3, v3, v4 +; CHECK-BE-NEXT: vmrghw v2, v3, v2 ; CHECK-BE-NEXT: mffprwz r3, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs0 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: vmrghh v3, v3, v4 ; CHECK-BE-NEXT: sldi r3, r3, 48 -; CHECK-BE-NEXT: vmrghw v2, v3, v2 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v3, r3 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: mffprwz r3, f1 ; CHECK-BE-NEXT: xscvspdpn f1, vs0 ; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 @@ -1086,19 +1086,20 @@ ; CHECK-P9-LABEL: test16elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs2, 0(r4) +; CHECK-P9-NEXT: lxv vs1, 16(r4) +; CHECK-P9-NEXT: lxv vs0, 32(r4) ; CHECK-P9-NEXT: xxsldwi vs3, vs2, vs2, 3 ; CHECK-P9-NEXT: xxswapd vs4, vs2 +; CHECK-P9-NEXT: xscvspdpn f5, vs2 +; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1 +; CHECK-P9-NEXT: xxsldwi vs6, vs1, vs1, 3 ; CHECK-P9-NEXT: xscvspdpn f3, vs3 ; CHECK-P9-NEXT: xscvspdpn f4, vs4 +; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: xscvdpsxws f4, f4 -; CHECK-P9-NEXT: xscvspdpn f5, vs2 -; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1 -; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 ; CHECK-P9-NEXT: mffprwz r5, f3 -; CHECK-P9-NEXT: lxv vs1, 16(r4) -; CHECK-P9-NEXT: xxsldwi vs6, vs1, vs1, 3 ; CHECK-P9-NEXT: xxswapd vs3, vs1 ; CHECK-P9-NEXT: mtvsrd v2, r5 ; CHECK-P9-NEXT: mffprwz r5, f4 @@ -1106,6 +1107,7 @@ ; CHECK-P9-NEXT: xscvspdpn f3, vs3 ; CHECK-P9-NEXT: mtvsrd v3, r5 ; CHECK-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: mffprwz r5, f4 ; CHECK-P9-NEXT: xscvspdpn f4, vs6 ; CHECK-P9-NEXT: mtvsrd v3, r5 @@ -1113,15 +1115,13 @@ ; CHECK-P9-NEXT: xscvspdpn f2, vs1 ; CHECK-P9-NEXT: xxsldwi vs1, vs1, vs1, 1 ; CHECK-P9-NEXT: xscvdpsxws f4, f4 -; CHECK-P9-NEXT: xscvdpsxws f3, f3 -; CHECK-P9-NEXT: lxv vs0, 32(r4) ; CHECK-P9-NEXT: mtvsrd v4, r5 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: vmrghh v3, v3, v4 +; CHECK-P9-NEXT: xscvdpsxws f1, f1 ; CHECK-P9-NEXT: vmrglw v2, v3, v2 ; CHECK-P9-NEXT: mffprwz r5, f4 -; CHECK-P9-NEXT: xscvspdpn f1, vs1 -; CHECK-P9-NEXT: xscvdpsxws f1, f1 ; CHECK-P9-NEXT: mtvsrd v4, r5 ; CHECK-P9-NEXT: mffprwz r5, f3 ; CHECK-P9-NEXT: xxsldwi vs3, vs0, vs0, 3 @@ -1154,6 +1154,7 @@ ; CHECK-P9-NEXT: mtvsrd v4, r4 ; CHECK-P9-NEXT: mffprwz r4, f0 ; CHECK-P9-NEXT: xxsldwi vs0, vs1, vs1, 3 +; CHECK-P9-NEXT: stxv vs2, 0(r3) ; CHECK-P9-NEXT: mtvsrd v2, r4 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: vmrghh v2, v4, v2 @@ -1180,31 +1181,31 @@ ; CHECK-P9-NEXT: vmrglw v3, v4, v3 ; CHECK-P9-NEXT: xxmrgld vs0, v3, v2 ; CHECK-P9-NEXT: stxv vs0, 16(r3) -; CHECK-P9-NEXT: stxv vs2, 0(r3) ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r4) +; CHECK-BE-NEXT: lxv vs0, 0(r4) ; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xxswapd vs3, vs1 -; CHECK-BE-NEXT: xscvspdpn f3, vs3 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: xscvdpsxws f3, f3 -; CHECK-BE-NEXT: mffprwz r5, f2 ; CHECK-BE-NEXT: xscvspdpn f4, vs1 ; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 +; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: xscvdpsxws f2, f2 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: mffprwz r5, f2 +; CHECK-BE-NEXT: xxsldwi vs2, vs0, vs0, 3 ; CHECK-BE-NEXT: sldi r5, r5, 48 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: mtvsrd v2, r5 ; CHECK-BE-NEXT: mffprwz r5, f3 ; CHECK-BE-NEXT: xscvdpsxws f3, f4 -; CHECK-BE-NEXT: lxv vs0, 0(r4) -; CHECK-BE-NEXT: xxsldwi vs2, vs0, vs0, 3 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: sldi r5, r5, 48 +; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: mtvsrd v3, r5 ; CHECK-BE-NEXT: vmrghh v2, v3, v2 ; CHECK-BE-NEXT: mffprwz r5, f3 @@ -1212,7 +1213,6 @@ ; CHECK-BE-NEXT: mtvsrd v3, r5 ; CHECK-BE-NEXT: mffprwz r5, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs0 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: sldi r5, r5, 48 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v4, r5 @@ -1239,24 +1239,24 @@ ; CHECK-BE-NEXT: lxv vs0, 32(r4) ; CHECK-BE-NEXT: xscvspdpn f5, vs1 ; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: xscvdpsxws f5, f5 -; CHECK-BE-NEXT: sldi r5, r5, 48 ; CHECK-BE-NEXT: xxswapd vs3, vs1 +; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 +; CHECK-BE-NEXT: sldi r5, r5, 48 +; CHECK-BE-NEXT: xscvdpsxws f5, f5 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: mtvsrd v0, r5 -; CHECK-BE-NEXT: vmrghh v5, v5, v0 ; CHECK-BE-NEXT: xscvspdpn f3, vs3 -; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: vmrghh v5, v5, v0 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: vmrghw v3, v5, v4 ; CHECK-BE-NEXT: xscvdpsxws f3, f3 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: vmrghw v3, v5, v4 ; CHECK-BE-NEXT: mffprwz r4, f5 ; CHECK-BE-NEXT: xxmrghd vs4, v3, v2 ; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: mtvsrd v2, r4 ; CHECK-BE-NEXT: mffprwz r4, f2 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: stxv vs4, 0(r3) ; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: mtvsrd v3, r4 @@ -1266,18 +1266,18 @@ ; CHECK-BE-NEXT: mffprwz r4, f1 ; CHECK-BE-NEXT: xxsldwi vs1, vs0, vs0, 3 ; CHECK-BE-NEXT: sldi r4, r4, 48 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: vmrghh v3, v4, v3 -; CHECK-BE-NEXT: mtvsrd v4, r4 -; CHECK-BE-NEXT: mffprwz r4, f1 -; CHECK-BE-NEXT: xxswapd vs1, vs0 ; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: mtvsrd v4, r4 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: vmrghh v2, v2, v4 -; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: vmrghw v2, v2, v3 +; CHECK-BE-NEXT: mffprwz r4, f1 +; CHECK-BE-NEXT: xxswapd vs1, vs0 +; CHECK-BE-NEXT: sldi r4, r4, 48 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v3, r4 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: mffprwz r4, f1 ; CHECK-BE-NEXT: xscvspdpn f1, vs0 ; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll @@ -37,9 +37,9 @@ ; CHECK-P9-NEXT: xxswapd v2, vs0 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: xxsldwi vs1, v2, v2, 3 +; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: mffprwz r3, f1 ; CHECK-P9-NEXT: mtvsrd v2, r3 ; CHECK-P9-NEXT: mffprwz r3, f0 @@ -230,10 +230,10 @@ ; CHECK-P9-LABEL: test8elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r3) +; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: xxsldwi vs2, vs1, vs1, 3 ; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 -; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: mffprwz r3, f2 ; CHECK-P9-NEXT: xxswapd vs2, vs1 ; CHECK-P9-NEXT: mtvsrd v2, r3 @@ -282,10 +282,10 @@ ; CHECK-BE-LABEL: test8elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3 ; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: mffprwz r3, f2 ; CHECK-BE-NEXT: xxswapd vs2, vs1 ; CHECK-BE-NEXT: sldi r3, r3, 56 @@ -310,14 +310,14 @@ ; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v4, r3 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: vmrghb v3, v3, v4 +; CHECK-BE-NEXT: vmrghh v2, v3, v2 ; CHECK-BE-NEXT: mffprwz r3, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs0 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: vmrghb v3, v3, v4 ; CHECK-BE-NEXT: sldi r3, r3, 56 -; CHECK-BE-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v3, r3 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: mffprwz r3, f1 ; CHECK-BE-NEXT: xscvspdpn f1, vs0 ; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 @@ -451,12 +451,12 @@ ; CHECK-P9-LABEL: test16elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs3, 0(r3) -; CHECK-P9-NEXT: xxsldwi vs4, vs3, vs3, 3 -; CHECK-P9-NEXT: xscvspdpn f4, vs4 -; CHECK-P9-NEXT: xscvdpsxws f4, f4 ; CHECK-P9-NEXT: lxv vs0, 48(r3) ; CHECK-P9-NEXT: lxv vs1, 32(r3) ; CHECK-P9-NEXT: lxv vs2, 16(r3) +; CHECK-P9-NEXT: xxsldwi vs4, vs3, vs3, 3 +; CHECK-P9-NEXT: xscvspdpn f4, vs4 +; CHECK-P9-NEXT: xscvdpsxws f4, f4 ; CHECK-P9-NEXT: mffprwz r3, f4 ; CHECK-P9-NEXT: xxswapd vs4, vs3 ; CHECK-P9-NEXT: mtvsrd v2, r3 @@ -550,12 +550,12 @@ ; CHECK-BE-LABEL: test16elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs3, 48(r3) -; CHECK-BE-NEXT: xxsldwi vs4, vs3, vs3, 3 -; CHECK-BE-NEXT: xscvspdpn f4, vs4 -; CHECK-BE-NEXT: xscvdpsxws f4, f4 ; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: lxv vs1, 16(r3) ; CHECK-BE-NEXT: lxv vs2, 32(r3) +; CHECK-BE-NEXT: xxsldwi vs4, vs3, vs3, 3 +; CHECK-BE-NEXT: xscvspdpn f4, vs4 +; CHECK-BE-NEXT: xscvdpsxws f4, f4 ; CHECK-BE-NEXT: mffprwz r3, f4 ; CHECK-BE-NEXT: xxswapd vs4, vs3 ; CHECK-BE-NEXT: sldi r3, r3, 56 @@ -580,14 +580,14 @@ ; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: mtvsrd v4, r3 ; CHECK-BE-NEXT: xscvdpsxws f3, f3 +; CHECK-BE-NEXT: vmrghb v3, v3, v4 +; CHECK-BE-NEXT: vmrghh v2, v3, v2 ; CHECK-BE-NEXT: mffprwz r3, f3 ; CHECK-BE-NEXT: xxswapd vs3, vs2 -; CHECK-BE-NEXT: xscvspdpn f3, vs3 -; CHECK-BE-NEXT: xscvdpsxws f3, f3 -; CHECK-BE-NEXT: vmrghb v3, v3, v4 ; CHECK-BE-NEXT: sldi r3, r3, 56 -; CHECK-BE-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: mtvsrd v3, r3 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: mffprwz r3, f3 ; CHECK-BE-NEXT: xscvspdpn f3, vs2 ; CHECK-BE-NEXT: xxsldwi vs2, vs2, vs2, 1 @@ -606,15 +606,15 @@ ; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: mtvsrd v5, r3 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: mffprwz r3, f2 -; CHECK-BE-NEXT: xxswapd vs2, vs1 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: vmrghb v4, v4, v5 ; CHECK-BE-NEXT: vmrghh v3, v4, v3 -; CHECK-BE-NEXT: sldi r3, r3, 56 ; CHECK-BE-NEXT: vmrghw v2, v3, v2 +; CHECK-BE-NEXT: mffprwz r3, f2 +; CHECK-BE-NEXT: xxswapd vs2, vs1 +; CHECK-BE-NEXT: sldi r3, r3, 56 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: mtvsrd v3, r3 +; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: mffprwz r3, f2 ; CHECK-BE-NEXT: xscvspdpn f2, vs1 ; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 @@ -633,14 +633,14 @@ ; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v5, r3 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: vmrghb v4, v4, v5 +; CHECK-BE-NEXT: vmrghh v3, v4, v3 ; CHECK-BE-NEXT: mffprwz r3, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs0 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: vmrghb v4, v4, v5 ; CHECK-BE-NEXT: sldi r3, r3, 56 -; CHECK-BE-NEXT: vmrghh v3, v4, v3 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v4, r3 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: mffprwz r3, f1 ; CHECK-BE-NEXT: xscvspdpn f1, vs0 ; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 @@ -695,9 +695,9 @@ ; CHECK-P9-NEXT: xxswapd v2, vs0 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 ; CHECK-P9-NEXT: xxsldwi vs1, v2, v2, 3 +; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 ; CHECK-P9-NEXT: mffprwz r3, f1 ; CHECK-P9-NEXT: mtvsrd v2, r3 ; CHECK-P9-NEXT: mffprwz r3, f0 @@ -888,10 +888,10 @@ ; CHECK-P9-LABEL: test8elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r3) +; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: xxsldwi vs2, vs1, vs1, 3 ; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 -; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: mffprwz r3, f2 ; CHECK-P9-NEXT: xxswapd vs2, vs1 ; CHECK-P9-NEXT: mtvsrd v2, r3 @@ -940,10 +940,10 @@ ; CHECK-BE-LABEL: test8elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3 ; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: mffprwz r3, f2 ; CHECK-BE-NEXT: xxswapd vs2, vs1 ; CHECK-BE-NEXT: sldi r3, r3, 56 @@ -968,14 +968,14 @@ ; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v4, r3 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: vmrghb v3, v3, v4 +; CHECK-BE-NEXT: vmrghh v2, v3, v2 ; CHECK-BE-NEXT: mffprwz r3, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs0 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: vmrghb v3, v3, v4 ; CHECK-BE-NEXT: sldi r3, r3, 56 -; CHECK-BE-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v3, r3 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: mffprwz r3, f1 ; CHECK-BE-NEXT: xscvspdpn f1, vs0 ; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 @@ -1109,12 +1109,12 @@ ; CHECK-P9-LABEL: test16elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs3, 0(r3) -; CHECK-P9-NEXT: xxsldwi vs4, vs3, vs3, 3 -; CHECK-P9-NEXT: xscvspdpn f4, vs4 -; CHECK-P9-NEXT: xscvdpsxws f4, f4 ; CHECK-P9-NEXT: lxv vs0, 48(r3) ; CHECK-P9-NEXT: lxv vs1, 32(r3) ; CHECK-P9-NEXT: lxv vs2, 16(r3) +; CHECK-P9-NEXT: xxsldwi vs4, vs3, vs3, 3 +; CHECK-P9-NEXT: xscvspdpn f4, vs4 +; CHECK-P9-NEXT: xscvdpsxws f4, f4 ; CHECK-P9-NEXT: mffprwz r3, f4 ; CHECK-P9-NEXT: xxswapd vs4, vs3 ; CHECK-P9-NEXT: mtvsrd v2, r3 @@ -1208,12 +1208,12 @@ ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs3, 48(r3) -; CHECK-BE-NEXT: xxsldwi vs4, vs3, vs3, 3 -; CHECK-BE-NEXT: xscvspdpn f4, vs4 -; CHECK-BE-NEXT: xscvdpsxws f4, f4 ; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: lxv vs1, 16(r3) ; CHECK-BE-NEXT: lxv vs2, 32(r3) +; CHECK-BE-NEXT: xxsldwi vs4, vs3, vs3, 3 +; CHECK-BE-NEXT: xscvspdpn f4, vs4 +; CHECK-BE-NEXT: xscvdpsxws f4, f4 ; CHECK-BE-NEXT: mffprwz r3, f4 ; CHECK-BE-NEXT: xxswapd vs4, vs3 ; CHECK-BE-NEXT: sldi r3, r3, 56 @@ -1238,14 +1238,14 @@ ; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: mtvsrd v4, r3 ; CHECK-BE-NEXT: xscvdpsxws f3, f3 +; CHECK-BE-NEXT: vmrghb v3, v3, v4 +; CHECK-BE-NEXT: vmrghh v2, v3, v2 ; CHECK-BE-NEXT: mffprwz r3, f3 ; CHECK-BE-NEXT: xxswapd vs3, vs2 -; CHECK-BE-NEXT: xscvspdpn f3, vs3 -; CHECK-BE-NEXT: xscvdpsxws f3, f3 -; CHECK-BE-NEXT: vmrghb v3, v3, v4 ; CHECK-BE-NEXT: sldi r3, r3, 56 -; CHECK-BE-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: mtvsrd v3, r3 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: mffprwz r3, f3 ; CHECK-BE-NEXT: xscvspdpn f3, vs2 ; CHECK-BE-NEXT: xxsldwi vs2, vs2, vs2, 1 @@ -1264,15 +1264,15 @@ ; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: mtvsrd v5, r3 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: mffprwz r3, f2 -; CHECK-BE-NEXT: xxswapd vs2, vs1 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: vmrghb v4, v4, v5 ; CHECK-BE-NEXT: vmrghh v3, v4, v3 -; CHECK-BE-NEXT: sldi r3, r3, 56 ; CHECK-BE-NEXT: vmrghw v2, v3, v2 +; CHECK-BE-NEXT: mffprwz r3, f2 +; CHECK-BE-NEXT: xxswapd vs2, vs1 +; CHECK-BE-NEXT: sldi r3, r3, 56 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: mtvsrd v3, r3 +; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: mffprwz r3, f2 ; CHECK-BE-NEXT: xscvspdpn f2, vs1 ; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 @@ -1291,14 +1291,14 @@ ; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v5, r3 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: vmrghb v4, v4, v5 +; CHECK-BE-NEXT: vmrghh v3, v4, v3 ; CHECK-BE-NEXT: mffprwz r3, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs0 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: vmrghb v4, v4, v5 ; CHECK-BE-NEXT: sldi r3, r3, 56 -; CHECK-BE-NEXT: vmrghh v3, v4, v3 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: mtvsrd v4, r3 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: mffprwz r3, f1 ; CHECK-BE-NEXT: xscvspdpn f1, vs0 ; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll @@ -89,10 +89,10 @@ ; CHECK-P9-LABEL: test4elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r3) +; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: xscvdpsxws f2, f1 ; CHECK-P9-NEXT: xxswapd vs1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: mffprwz r3, f2 ; CHECK-P9-NEXT: mtvsrd v2, r3 ; CHECK-P9-NEXT: mffprwz r3, f1 @@ -113,10 +113,10 @@ ; CHECK-BE-LABEL: test4elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: xscvdpsxws f2, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs1 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: mffprwz r3, f2 ; CHECK-BE-NEXT: sldi r3, r3, 48 ; CHECK-BE-NEXT: mtvsrd v2, r3 @@ -194,12 +194,12 @@ ; CHECK-P9-LABEL: test8elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs3, 0(r3) -; CHECK-P9-NEXT: xscvdpsxws f4, f3 -; CHECK-P9-NEXT: xxswapd vs3, vs3 -; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: lxv vs2, 16(r3) ; CHECK-P9-NEXT: lxv vs0, 48(r3) ; CHECK-P9-NEXT: lxv vs1, 32(r3) +; CHECK-P9-NEXT: xscvdpsxws f4, f3 +; CHECK-P9-NEXT: xxswapd vs3, vs3 +; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: mffprwz r3, f4 ; CHECK-P9-NEXT: mtvsrd v2, r3 ; CHECK-P9-NEXT: mffprwz r3, f3 @@ -237,12 +237,12 @@ ; CHECK-BE-LABEL: test8elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs3, 48(r3) -; CHECK-BE-NEXT: xscvdpsxws f4, f3 -; CHECK-BE-NEXT: xxswapd vs3, vs3 -; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: lxv vs2, 32(r3) ; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: xscvdpsxws f4, f3 +; CHECK-BE-NEXT: xxswapd vs3, vs3 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: mffprwz r3, f4 ; CHECK-BE-NEXT: sldi r3, r3, 48 ; CHECK-BE-NEXT: mtvsrd v2, r3 @@ -387,18 +387,20 @@ ; CHECK-P9-NEXT: lxv vs3, 0(r4) ; CHECK-P9-NEXT: lxv vs2, 16(r4) ; CHECK-P9-NEXT: lxv vs1, 32(r4) -; CHECK-P9-NEXT: xscvdpsxws f4, f3 ; CHECK-P9-NEXT: lxv vs0, 48(r4) +; CHECK-P9-NEXT: xscvdpsxws f4, f3 ; CHECK-P9-NEXT: xscvdpsxws f5, f2 ; CHECK-P9-NEXT: xscvdpsxws f6, f1 ; CHECK-P9-NEXT: xxswapd vs3, vs3 ; CHECK-P9-NEXT: xscvdpsxws f7, f0 +; CHECK-P9-NEXT: xxswapd vs2, vs2 +; CHECK-P9-NEXT: xxswapd vs1, vs1 ; CHECK-P9-NEXT: xxswapd vs0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mffprwz r5, f4 ; CHECK-P9-NEXT: xscvdpsxws f3, f3 -; CHECK-P9-NEXT: xxswapd vs2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: xscvdpsxws f1, f1 +; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: mffprwz r5, f4 ; CHECK-P9-NEXT: mtvsrd v2, r5 ; CHECK-P9-NEXT: mffprwz r5, f5 ; CHECK-P9-NEXT: mtvsrd v3, r5 @@ -408,8 +410,6 @@ ; CHECK-P9-NEXT: mtvsrd v5, r5 ; CHECK-P9-NEXT: mffprwz r5, f3 ; CHECK-P9-NEXT: lxv vs3, 64(r4) -; CHECK-P9-NEXT: xxswapd vs1, vs1 -; CHECK-P9-NEXT: xscvdpsxws f1, f1 ; CHECK-P9-NEXT: mtvsrd v0, r5 ; CHECK-P9-NEXT: mffprwz r5, f2 ; CHECK-P9-NEXT: lxv vs2, 80(r4) @@ -469,30 +469,30 @@ ; CHECK-BE-LABEL: test16elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs4, 48(r4) +; CHECK-BE-NEXT: lxv vs3, 32(r4) +; CHECK-BE-NEXT: lxv vs2, 16(r4) +; CHECK-BE-NEXT: lxv vs1, 0(r4) ; CHECK-BE-NEXT: xscvdpsxws f5, f4 ; CHECK-BE-NEXT: xxswapd vs4, vs4 -; CHECK-BE-NEXT: lxv vs3, 32(r4) ; CHECK-BE-NEXT: xscvdpsxws f6, f3 ; CHECK-BE-NEXT: xxswapd vs3, vs3 -; CHECK-BE-NEXT: xscvdpsxws f4, f4 -; CHECK-BE-NEXT: mffprwz r5, f5 -; CHECK-BE-NEXT: sldi r5, r5, 48 -; CHECK-BE-NEXT: lxv vs2, 16(r4) -; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: xscvdpsxws f7, f2 +; CHECK-BE-NEXT: lxv vs0, 112(r4) ; CHECK-BE-NEXT: xxswapd vs2, vs2 +; CHECK-BE-NEXT: xscvdpsxws f4, f4 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 +; CHECK-BE-NEXT: mffprwz r5, f5 +; CHECK-BE-NEXT: sldi r5, r5, 48 ; CHECK-BE-NEXT: mtvsrd v2, r5 ; CHECK-BE-NEXT: mffprwz r5, f4 -; CHECK-BE-NEXT: sldi r5, r5, 48 -; CHECK-BE-NEXT: lxv vs1, 0(r4) ; CHECK-BE-NEXT: xscvdpsxws f4, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs1 +; CHECK-BE-NEXT: sldi r5, r5, 48 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: mtvsrd v3, r5 ; CHECK-BE-NEXT: mffprwz r5, f6 ; CHECK-BE-NEXT: sldi r5, r5, 48 -; CHECK-BE-NEXT: lxv vs0, 112(r4) ; CHECK-BE-NEXT: vmrghh v2, v2, v3 ; CHECK-BE-NEXT: mtvsrd v3, r5 ; CHECK-BE-NEXT: mffprwz r5, f3 @@ -524,12 +524,15 @@ ; CHECK-BE-NEXT: vmrghh v4, v4, v1 ; CHECK-BE-NEXT: mtvsrd v1, r5 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: vmrghh v5, v5, v1 ; CHECK-BE-NEXT: mffprwz r5, f0 ; CHECK-BE-NEXT: lxv vs0, 64(r4) +; CHECK-BE-NEXT: vmrghh v5, v5, v1 +; CHECK-BE-NEXT: sldi r5, r5, 48 ; CHECK-BE-NEXT: mffprwz r4, f3 -; CHECK-BE-NEXT: sldi r4, r4, 48 +; CHECK-BE-NEXT: mtvsrd v1, r5 ; CHECK-BE-NEXT: vmrghw v3, v5, v4 +; CHECK-BE-NEXT: sldi r4, r4, 48 +; CHECK-BE-NEXT: vmrghh v0, v0, v1 ; CHECK-BE-NEXT: xxmrghd vs3, v3, v2 ; CHECK-BE-NEXT: mtvsrd v2, r4 ; CHECK-BE-NEXT: mffprwz r4, f2 @@ -537,10 +540,12 @@ ; CHECK-BE-NEXT: xxswapd vs1, vs1 ; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: stxv vs3, 0(r3) ; CHECK-BE-NEXT: mtvsrd v3, r4 ; CHECK-BE-NEXT: vmrghh v2, v2, v3 ; CHECK-BE-NEXT: mffprwz r4, f2 ; CHECK-BE-NEXT: sldi r4, r4, 48 +; CHECK-BE-NEXT: vmrghw v2, v2, v0 ; CHECK-BE-NEXT: mtvsrd v3, r4 ; CHECK-BE-NEXT: mffprwz r4, f1 ; CHECK-BE-NEXT: xscvdpsxws f1, f0 @@ -553,11 +558,6 @@ ; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: mtvsrd v4, r4 ; CHECK-BE-NEXT: mffprwz r4, f0 -; CHECK-BE-NEXT: sldi r5, r5, 48 -; CHECK-BE-NEXT: mtvsrd v1, r5 -; CHECK-BE-NEXT: vmrghh v0, v0, v1 -; CHECK-BE-NEXT: vmrghw v2, v2, v0 -; CHECK-BE-NEXT: stxv vs3, 0(r3) ; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: mtvsrd v5, r4 ; CHECK-BE-NEXT: vmrghh v4, v4, v5 @@ -652,10 +652,10 @@ ; CHECK-P9-LABEL: test4elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r3) +; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: xscvdpsxws f2, f1 ; CHECK-P9-NEXT: xxswapd vs1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: mffprwz r3, f2 ; CHECK-P9-NEXT: mtvsrd v2, r3 ; CHECK-P9-NEXT: mffprwz r3, f1 @@ -676,10 +676,10 @@ ; CHECK-BE-LABEL: test4elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: xscvdpsxws f2, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs1 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: mffprwz r3, f2 ; CHECK-BE-NEXT: sldi r3, r3, 48 ; CHECK-BE-NEXT: mtvsrd v2, r3 @@ -757,12 +757,12 @@ ; CHECK-P9-LABEL: test8elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs3, 0(r3) -; CHECK-P9-NEXT: xscvdpsxws f4, f3 -; CHECK-P9-NEXT: xxswapd vs3, vs3 -; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: lxv vs2, 16(r3) ; CHECK-P9-NEXT: lxv vs0, 48(r3) ; CHECK-P9-NEXT: lxv vs1, 32(r3) +; CHECK-P9-NEXT: xscvdpsxws f4, f3 +; CHECK-P9-NEXT: xxswapd vs3, vs3 +; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: mffprwz r3, f4 ; CHECK-P9-NEXT: mtvsrd v2, r3 ; CHECK-P9-NEXT: mffprwz r3, f3 @@ -800,12 +800,12 @@ ; CHECK-BE-LABEL: test8elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs3, 48(r3) -; CHECK-BE-NEXT: xscvdpsxws f4, f3 -; CHECK-BE-NEXT: xxswapd vs3, vs3 -; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: lxv vs2, 32(r3) ; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: xscvdpsxws f4, f3 +; CHECK-BE-NEXT: xxswapd vs3, vs3 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: mffprwz r3, f4 ; CHECK-BE-NEXT: sldi r3, r3, 48 ; CHECK-BE-NEXT: mtvsrd v2, r3 @@ -950,18 +950,20 @@ ; CHECK-P9-NEXT: lxv vs3, 0(r4) ; CHECK-P9-NEXT: lxv vs2, 16(r4) ; CHECK-P9-NEXT: lxv vs1, 32(r4) -; CHECK-P9-NEXT: xscvdpsxws f4, f3 ; CHECK-P9-NEXT: lxv vs0, 48(r4) +; CHECK-P9-NEXT: xscvdpsxws f4, f3 ; CHECK-P9-NEXT: xscvdpsxws f5, f2 ; CHECK-P9-NEXT: xscvdpsxws f6, f1 ; CHECK-P9-NEXT: xxswapd vs3, vs3 ; CHECK-P9-NEXT: xscvdpsxws f7, f0 +; CHECK-P9-NEXT: xxswapd vs2, vs2 +; CHECK-P9-NEXT: xxswapd vs1, vs1 ; CHECK-P9-NEXT: xxswapd vs0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mffprwz r5, f4 ; CHECK-P9-NEXT: xscvdpsxws f3, f3 -; CHECK-P9-NEXT: xxswapd vs2, vs2 ; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: xscvdpsxws f1, f1 +; CHECK-P9-NEXT: xscvdpsxws f0, f0 +; CHECK-P9-NEXT: mffprwz r5, f4 ; CHECK-P9-NEXT: mtvsrd v2, r5 ; CHECK-P9-NEXT: mffprwz r5, f5 ; CHECK-P9-NEXT: mtvsrd v3, r5 @@ -971,8 +973,6 @@ ; CHECK-P9-NEXT: mtvsrd v5, r5 ; CHECK-P9-NEXT: mffprwz r5, f3 ; CHECK-P9-NEXT: lxv vs3, 64(r4) -; CHECK-P9-NEXT: xxswapd vs1, vs1 -; CHECK-P9-NEXT: xscvdpsxws f1, f1 ; CHECK-P9-NEXT: mtvsrd v0, r5 ; CHECK-P9-NEXT: mffprwz r5, f2 ; CHECK-P9-NEXT: lxv vs2, 80(r4) @@ -1032,30 +1032,30 @@ ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs4, 48(r4) +; CHECK-BE-NEXT: lxv vs3, 32(r4) +; CHECK-BE-NEXT: lxv vs2, 16(r4) +; CHECK-BE-NEXT: lxv vs1, 0(r4) ; CHECK-BE-NEXT: xscvdpsxws f5, f4 ; CHECK-BE-NEXT: xxswapd vs4, vs4 -; CHECK-BE-NEXT: lxv vs3, 32(r4) ; CHECK-BE-NEXT: xscvdpsxws f6, f3 ; CHECK-BE-NEXT: xxswapd vs3, vs3 -; CHECK-BE-NEXT: xscvdpsxws f4, f4 -; CHECK-BE-NEXT: mffprwz r5, f5 -; CHECK-BE-NEXT: sldi r5, r5, 48 -; CHECK-BE-NEXT: lxv vs2, 16(r4) -; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: xscvdpsxws f7, f2 +; CHECK-BE-NEXT: lxv vs0, 112(r4) ; CHECK-BE-NEXT: xxswapd vs2, vs2 +; CHECK-BE-NEXT: xscvdpsxws f4, f4 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 +; CHECK-BE-NEXT: mffprwz r5, f5 +; CHECK-BE-NEXT: sldi r5, r5, 48 ; CHECK-BE-NEXT: mtvsrd v2, r5 ; CHECK-BE-NEXT: mffprwz r5, f4 -; CHECK-BE-NEXT: sldi r5, r5, 48 -; CHECK-BE-NEXT: lxv vs1, 0(r4) ; CHECK-BE-NEXT: xscvdpsxws f4, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs1 +; CHECK-BE-NEXT: sldi r5, r5, 48 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 ; CHECK-BE-NEXT: mtvsrd v3, r5 ; CHECK-BE-NEXT: mffprwz r5, f6 ; CHECK-BE-NEXT: sldi r5, r5, 48 -; CHECK-BE-NEXT: lxv vs0, 112(r4) ; CHECK-BE-NEXT: vmrghh v2, v2, v3 ; CHECK-BE-NEXT: mtvsrd v3, r5 ; CHECK-BE-NEXT: mffprwz r5, f3 @@ -1087,12 +1087,15 @@ ; CHECK-BE-NEXT: vmrghh v4, v4, v1 ; CHECK-BE-NEXT: mtvsrd v1, r5 ; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: vmrghh v5, v5, v1 ; CHECK-BE-NEXT: mffprwz r5, f0 ; CHECK-BE-NEXT: lxv vs0, 64(r4) +; CHECK-BE-NEXT: vmrghh v5, v5, v1 +; CHECK-BE-NEXT: sldi r5, r5, 48 ; CHECK-BE-NEXT: mffprwz r4, f3 -; CHECK-BE-NEXT: sldi r4, r4, 48 +; CHECK-BE-NEXT: mtvsrd v1, r5 ; CHECK-BE-NEXT: vmrghw v3, v5, v4 +; CHECK-BE-NEXT: sldi r4, r4, 48 +; CHECK-BE-NEXT: vmrghh v0, v0, v1 ; CHECK-BE-NEXT: xxmrghd vs3, v3, v2 ; CHECK-BE-NEXT: mtvsrd v2, r4 ; CHECK-BE-NEXT: mffprwz r4, f2 @@ -1100,10 +1103,12 @@ ; CHECK-BE-NEXT: xxswapd vs1, vs1 ; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: stxv vs3, 0(r3) ; CHECK-BE-NEXT: mtvsrd v3, r4 ; CHECK-BE-NEXT: vmrghh v2, v2, v3 ; CHECK-BE-NEXT: mffprwz r4, f2 ; CHECK-BE-NEXT: sldi r4, r4, 48 +; CHECK-BE-NEXT: vmrghw v2, v2, v0 ; CHECK-BE-NEXT: mtvsrd v3, r4 ; CHECK-BE-NEXT: mffprwz r4, f1 ; CHECK-BE-NEXT: xscvdpsxws f1, f0 @@ -1116,11 +1121,6 @@ ; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: mtvsrd v4, r4 ; CHECK-BE-NEXT: mffprwz r4, f0 -; CHECK-BE-NEXT: sldi r5, r5, 48 -; CHECK-BE-NEXT: mtvsrd v1, r5 -; CHECK-BE-NEXT: vmrghh v0, v0, v1 -; CHECK-BE-NEXT: vmrghw v2, v2, v0 -; CHECK-BE-NEXT: stxv vs3, 0(r3) ; CHECK-BE-NEXT: sldi r4, r4, 48 ; CHECK-BE-NEXT: mtvsrd v5, r4 ; CHECK-BE-NEXT: vmrghh v4, v4, v5 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i32_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i32_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i32_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i32_elts.ll @@ -129,10 +129,10 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs2, 0(r4) ; CHECK-P9-NEXT: lxv vs3, 16(r4) -; CHECK-P9-NEXT: xxmrgld vs4, vs3, vs2 -; CHECK-P9-NEXT: xxmrghd vs2, vs3, vs2 ; CHECK-P9-NEXT: lxv vs0, 32(r4) ; CHECK-P9-NEXT: lxv vs1, 48(r4) +; CHECK-P9-NEXT: xxmrgld vs4, vs3, vs2 +; CHECK-P9-NEXT: xxmrghd vs2, vs3, vs2 ; CHECK-P9-NEXT: xvcvdpuxws v2, vs4 ; CHECK-P9-NEXT: xvcvdpuxws v3, vs2 ; CHECK-P9-NEXT: xxmrgld vs2, vs1, vs0 @@ -149,10 +149,10 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs2, 16(r4) ; CHECK-BE-NEXT: lxv vs3, 0(r4) -; CHECK-BE-NEXT: xxmrgld vs4, vs3, vs2 -; CHECK-BE-NEXT: xxmrghd vs2, vs3, vs2 ; CHECK-BE-NEXT: lxv vs0, 48(r4) ; CHECK-BE-NEXT: lxv vs1, 32(r4) +; CHECK-BE-NEXT: xxmrgld vs4, vs3, vs2 +; CHECK-BE-NEXT: xxmrghd vs2, vs3, vs2 ; CHECK-BE-NEXT: xvcvdpuxws v2, vs4 ; CHECK-BE-NEXT: xvcvdpuxws v3, vs2 ; CHECK-BE-NEXT: xxmrgld vs2, vs1, vs0 @@ -227,23 +227,23 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs6, 0(r4) ; CHECK-P9-NEXT: lxv vs7, 16(r4) -; CHECK-P9-NEXT: xxmrgld vs8, vs7, vs6 -; CHECK-P9-NEXT: xxmrghd vs6, vs7, vs6 ; CHECK-P9-NEXT: lxv vs4, 32(r4) ; CHECK-P9-NEXT: lxv vs5, 48(r4) +; CHECK-P9-NEXT: xxmrgld vs8, vs7, vs6 +; CHECK-P9-NEXT: xxmrghd vs6, vs7, vs6 ; CHECK-P9-NEXT: xxmrgld vs7, vs5, vs4 ; CHECK-P9-NEXT: xxmrghd vs4, vs5, vs4 -; CHECK-P9-NEXT: xvcvdpuxws v2, vs8 -; CHECK-P9-NEXT: xvcvdpuxws v3, vs6 ; CHECK-P9-NEXT: lxv vs2, 64(r4) ; CHECK-P9-NEXT: lxv vs3, 80(r4) +; CHECK-P9-NEXT: lxv vs0, 96(r4) +; CHECK-P9-NEXT: lxv vs1, 112(r4) +; CHECK-P9-NEXT: xvcvdpuxws v2, vs8 +; CHECK-P9-NEXT: xvcvdpuxws v3, vs6 ; CHECK-P9-NEXT: xvcvdpuxws v4, vs7 ; CHECK-P9-NEXT: vmrgew v2, v3, v2 ; CHECK-P9-NEXT: xvcvdpuxws v3, vs4 ; CHECK-P9-NEXT: xxmrgld vs4, vs3, vs2 ; CHECK-P9-NEXT: xxmrghd vs2, vs3, vs2 -; CHECK-P9-NEXT: lxv vs0, 96(r4) -; CHECK-P9-NEXT: lxv vs1, 112(r4) ; CHECK-P9-NEXT: stxv v2, 0(r3) ; CHECK-P9-NEXT: xvcvdpuxws v5, vs2 ; CHECK-P9-NEXT: xxmrgld vs2, vs1, vs0 @@ -263,23 +263,23 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs6, 16(r4) ; CHECK-BE-NEXT: lxv vs7, 0(r4) -; CHECK-BE-NEXT: xxmrgld vs8, vs7, vs6 -; CHECK-BE-NEXT: xxmrghd vs6, vs7, vs6 ; CHECK-BE-NEXT: lxv vs4, 48(r4) ; CHECK-BE-NEXT: lxv vs5, 32(r4) +; CHECK-BE-NEXT: xxmrgld vs8, vs7, vs6 +; CHECK-BE-NEXT: xxmrghd vs6, vs7, vs6 ; CHECK-BE-NEXT: xxmrgld vs7, vs5, vs4 ; CHECK-BE-NEXT: xxmrghd vs4, vs5, vs4 -; CHECK-BE-NEXT: xvcvdpuxws v2, vs8 -; CHECK-BE-NEXT: xvcvdpuxws v3, vs6 ; CHECK-BE-NEXT: lxv vs2, 80(r4) ; CHECK-BE-NEXT: lxv vs3, 64(r4) +; CHECK-BE-NEXT: lxv vs0, 112(r4) +; CHECK-BE-NEXT: lxv vs1, 96(r4) +; CHECK-BE-NEXT: xvcvdpuxws v2, vs8 +; CHECK-BE-NEXT: xvcvdpuxws v3, vs6 ; CHECK-BE-NEXT: xvcvdpuxws v4, vs7 ; CHECK-BE-NEXT: vmrgew v2, v3, v2 ; CHECK-BE-NEXT: xvcvdpuxws v3, vs4 ; CHECK-BE-NEXT: xxmrgld vs4, vs3, vs2 ; CHECK-BE-NEXT: xxmrghd vs2, vs3, vs2 -; CHECK-BE-NEXT: lxv vs0, 112(r4) -; CHECK-BE-NEXT: lxv vs1, 96(r4) ; CHECK-BE-NEXT: stxv v2, 0(r3) ; CHECK-BE-NEXT: xvcvdpuxws v5, vs2 ; CHECK-BE-NEXT: xxmrgld vs2, vs1, vs0 @@ -421,10 +421,10 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs2, 0(r4) ; CHECK-P9-NEXT: lxv vs3, 16(r4) -; CHECK-P9-NEXT: xxmrgld vs4, vs3, vs2 -; CHECK-P9-NEXT: xxmrghd vs2, vs3, vs2 ; CHECK-P9-NEXT: lxv vs0, 32(r4) ; CHECK-P9-NEXT: lxv vs1, 48(r4) +; CHECK-P9-NEXT: xxmrgld vs4, vs3, vs2 +; CHECK-P9-NEXT: xxmrghd vs2, vs3, vs2 ; CHECK-P9-NEXT: xvcvdpsxws v2, vs4 ; CHECK-P9-NEXT: xvcvdpsxws v3, vs2 ; CHECK-P9-NEXT: xxmrgld vs2, vs1, vs0 @@ -441,10 +441,10 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs2, 16(r4) ; CHECK-BE-NEXT: lxv vs3, 0(r4) -; CHECK-BE-NEXT: xxmrgld vs4, vs3, vs2 -; CHECK-BE-NEXT: xxmrghd vs2, vs3, vs2 ; CHECK-BE-NEXT: lxv vs0, 48(r4) ; CHECK-BE-NEXT: lxv vs1, 32(r4) +; CHECK-BE-NEXT: xxmrgld vs4, vs3, vs2 +; CHECK-BE-NEXT: xxmrghd vs2, vs3, vs2 ; CHECK-BE-NEXT: xvcvdpsxws v2, vs4 ; CHECK-BE-NEXT: xvcvdpsxws v3, vs2 ; CHECK-BE-NEXT: xxmrgld vs2, vs1, vs0 @@ -519,23 +519,23 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs6, 0(r4) ; CHECK-P9-NEXT: lxv vs7, 16(r4) -; CHECK-P9-NEXT: xxmrgld vs8, vs7, vs6 -; CHECK-P9-NEXT: xxmrghd vs6, vs7, vs6 ; CHECK-P9-NEXT: lxv vs4, 32(r4) ; CHECK-P9-NEXT: lxv vs5, 48(r4) +; CHECK-P9-NEXT: xxmrgld vs8, vs7, vs6 +; CHECK-P9-NEXT: xxmrghd vs6, vs7, vs6 ; CHECK-P9-NEXT: xxmrgld vs7, vs5, vs4 ; CHECK-P9-NEXT: xxmrghd vs4, vs5, vs4 -; CHECK-P9-NEXT: xvcvdpsxws v2, vs8 -; CHECK-P9-NEXT: xvcvdpsxws v3, vs6 ; CHECK-P9-NEXT: lxv vs2, 64(r4) ; CHECK-P9-NEXT: lxv vs3, 80(r4) +; CHECK-P9-NEXT: lxv vs0, 96(r4) +; CHECK-P9-NEXT: lxv vs1, 112(r4) +; CHECK-P9-NEXT: xvcvdpsxws v2, vs8 +; CHECK-P9-NEXT: xvcvdpsxws v3, vs6 ; CHECK-P9-NEXT: xvcvdpsxws v4, vs7 ; CHECK-P9-NEXT: vmrgew v2, v3, v2 ; CHECK-P9-NEXT: xvcvdpsxws v3, vs4 ; CHECK-P9-NEXT: xxmrgld vs4, vs3, vs2 ; CHECK-P9-NEXT: xxmrghd vs2, vs3, vs2 -; CHECK-P9-NEXT: lxv vs0, 96(r4) -; CHECK-P9-NEXT: lxv vs1, 112(r4) ; CHECK-P9-NEXT: stxv v2, 0(r3) ; CHECK-P9-NEXT: xvcvdpsxws v5, vs2 ; CHECK-P9-NEXT: xxmrgld vs2, vs1, vs0 @@ -555,23 +555,23 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs6, 16(r4) ; CHECK-BE-NEXT: lxv vs7, 0(r4) -; CHECK-BE-NEXT: xxmrgld vs8, vs7, vs6 -; CHECK-BE-NEXT: xxmrghd vs6, vs7, vs6 ; CHECK-BE-NEXT: lxv vs4, 48(r4) ; CHECK-BE-NEXT: lxv vs5, 32(r4) +; CHECK-BE-NEXT: xxmrgld vs8, vs7, vs6 +; CHECK-BE-NEXT: xxmrghd vs6, vs7, vs6 ; CHECK-BE-NEXT: xxmrgld vs7, vs5, vs4 ; CHECK-BE-NEXT: xxmrghd vs4, vs5, vs4 -; CHECK-BE-NEXT: xvcvdpsxws v2, vs8 -; CHECK-BE-NEXT: xvcvdpsxws v3, vs6 ; CHECK-BE-NEXT: lxv vs2, 80(r4) ; CHECK-BE-NEXT: lxv vs3, 64(r4) +; CHECK-BE-NEXT: lxv vs0, 112(r4) +; CHECK-BE-NEXT: lxv vs1, 96(r4) +; CHECK-BE-NEXT: xvcvdpsxws v2, vs8 +; CHECK-BE-NEXT: xvcvdpsxws v3, vs6 ; CHECK-BE-NEXT: xvcvdpsxws v4, vs7 ; CHECK-BE-NEXT: vmrgew v2, v3, v2 ; CHECK-BE-NEXT: xvcvdpsxws v3, vs4 ; CHECK-BE-NEXT: xxmrgld vs4, vs3, vs2 ; CHECK-BE-NEXT: xxmrghd vs2, vs3, vs2 -; CHECK-BE-NEXT: lxv vs0, 112(r4) -; CHECK-BE-NEXT: lxv vs1, 96(r4) ; CHECK-BE-NEXT: stxv v2, 0(r3) ; CHECK-BE-NEXT: xvcvdpsxws v5, vs2 ; CHECK-BE-NEXT: xxmrgld vs2, vs1, vs0 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll @@ -96,10 +96,10 @@ ; CHECK-P9-LABEL: test4elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r3) +; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: xscvdpsxws f2, f1 ; CHECK-P9-NEXT: xxswapd vs1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: mffprwz r3, f2 ; CHECK-P9-NEXT: mtvsrd v2, r3 ; CHECK-P9-NEXT: mffprwz r3, f1 @@ -121,10 +121,10 @@ ; CHECK-BE-LABEL: test4elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: xscvdpsxws f2, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs1 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: mffprwz r3, f2 ; CHECK-BE-NEXT: sldi r3, r3, 56 ; CHECK-BE-NEXT: mtvsrd v2, r3 @@ -205,12 +205,12 @@ ; CHECK-P9-LABEL: test8elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs3, 0(r3) -; CHECK-P9-NEXT: xscvdpsxws f4, f3 -; CHECK-P9-NEXT: xxswapd vs3, vs3 -; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: lxv vs2, 16(r3) ; CHECK-P9-NEXT: lxv vs0, 48(r3) ; CHECK-P9-NEXT: lxv vs1, 32(r3) +; CHECK-P9-NEXT: xscvdpsxws f4, f3 +; CHECK-P9-NEXT: xxswapd vs3, vs3 +; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: mffprwz r3, f4 ; CHECK-P9-NEXT: mtvsrd v2, r3 ; CHECK-P9-NEXT: mffprwz r3, f3 @@ -249,12 +249,12 @@ ; CHECK-BE-LABEL: test8elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs3, 48(r3) -; CHECK-BE-NEXT: xscvdpsxws f4, f3 -; CHECK-BE-NEXT: xxswapd vs3, vs3 -; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: lxv vs2, 32(r3) ; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: xscvdpsxws f4, f3 +; CHECK-BE-NEXT: xxswapd vs3, vs3 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: mffprwz r3, f4 ; CHECK-BE-NEXT: sldi r3, r3, 56 ; CHECK-BE-NEXT: mtvsrd v2, r3 @@ -398,16 +398,16 @@ ; CHECK-P9-LABEL: test16elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs7, 0(r3) -; CHECK-P9-NEXT: xscvdpsxws f8, f7 -; CHECK-P9-NEXT: xxswapd vs7, vs7 -; CHECK-P9-NEXT: xscvdpsxws f7, f7 ; CHECK-P9-NEXT: lxv vs6, 16(r3) ; CHECK-P9-NEXT: lxv vs0, 112(r3) ; CHECK-P9-NEXT: lxv vs1, 96(r3) +; CHECK-P9-NEXT: xscvdpsxws f8, f7 +; CHECK-P9-NEXT: xxswapd vs7, vs7 ; CHECK-P9-NEXT: lxv vs2, 80(r3) ; CHECK-P9-NEXT: lxv vs3, 64(r3) ; CHECK-P9-NEXT: lxv vs4, 48(r3) ; CHECK-P9-NEXT: lxv vs5, 32(r3) +; CHECK-P9-NEXT: xscvdpsxws f7, f7 ; CHECK-P9-NEXT: mffprwz r3, f8 ; CHECK-P9-NEXT: mtvsrd v2, r3 ; CHECK-P9-NEXT: mffprwz r3, f7 @@ -481,16 +481,16 @@ ; CHECK-BE-LABEL: test16elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs7, 112(r3) -; CHECK-BE-NEXT: xscvdpsxws f8, f7 -; CHECK-BE-NEXT: xxswapd vs7, vs7 -; CHECK-BE-NEXT: xscvdpsxws f7, f7 ; CHECK-BE-NEXT: lxv vs6, 96(r3) ; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: xscvdpsxws f8, f7 +; CHECK-BE-NEXT: xxswapd vs7, vs7 ; CHECK-BE-NEXT: lxv vs2, 32(r3) ; CHECK-BE-NEXT: lxv vs3, 48(r3) ; CHECK-BE-NEXT: lxv vs4, 64(r3) ; CHECK-BE-NEXT: lxv vs5, 80(r3) +; CHECK-BE-NEXT: xscvdpsxws f7, f7 ; CHECK-BE-NEXT: mffprwz r3, f8 ; CHECK-BE-NEXT: sldi r3, r3, 56 ; CHECK-BE-NEXT: mtvsrd v2, r3 @@ -669,10 +669,10 @@ ; CHECK-P9-LABEL: test4elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r3) +; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: xscvdpsxws f2, f1 ; CHECK-P9-NEXT: xxswapd vs1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: lxv vs0, 16(r3) ; CHECK-P9-NEXT: mffprwz r3, f2 ; CHECK-P9-NEXT: mtvsrd v2, r3 ; CHECK-P9-NEXT: mffprwz r3, f1 @@ -694,10 +694,10 @@ ; CHECK-BE-LABEL: test4elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: xscvdpsxws f2, f1 ; CHECK-BE-NEXT: xxswapd vs1, vs1 ; CHECK-BE-NEXT: xscvdpsxws f1, f1 -; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: mffprwz r3, f2 ; CHECK-BE-NEXT: sldi r3, r3, 56 ; CHECK-BE-NEXT: mtvsrd v2, r3 @@ -778,12 +778,12 @@ ; CHECK-P9-LABEL: test8elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs3, 0(r3) -; CHECK-P9-NEXT: xscvdpsxws f4, f3 -; CHECK-P9-NEXT: xxswapd vs3, vs3 -; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: lxv vs2, 16(r3) ; CHECK-P9-NEXT: lxv vs0, 48(r3) ; CHECK-P9-NEXT: lxv vs1, 32(r3) +; CHECK-P9-NEXT: xscvdpsxws f4, f3 +; CHECK-P9-NEXT: xxswapd vs3, vs3 +; CHECK-P9-NEXT: xscvdpsxws f3, f3 ; CHECK-P9-NEXT: mffprwz r3, f4 ; CHECK-P9-NEXT: mtvsrd v2, r3 ; CHECK-P9-NEXT: mffprwz r3, f3 @@ -822,12 +822,12 @@ ; CHECK-BE-LABEL: test8elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs3, 48(r3) -; CHECK-BE-NEXT: xscvdpsxws f4, f3 -; CHECK-BE-NEXT: xxswapd vs3, vs3 -; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: lxv vs2, 32(r3) ; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: xscvdpsxws f4, f3 +; CHECK-BE-NEXT: xxswapd vs3, vs3 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: mffprwz r3, f4 ; CHECK-BE-NEXT: sldi r3, r3, 56 ; CHECK-BE-NEXT: mtvsrd v2, r3 @@ -971,16 +971,16 @@ ; CHECK-P9-LABEL: test16elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs7, 0(r3) -; CHECK-P9-NEXT: xscvdpsxws f8, f7 -; CHECK-P9-NEXT: xxswapd vs7, vs7 -; CHECK-P9-NEXT: xscvdpsxws f7, f7 ; CHECK-P9-NEXT: lxv vs6, 16(r3) ; CHECK-P9-NEXT: lxv vs0, 112(r3) ; CHECK-P9-NEXT: lxv vs1, 96(r3) +; CHECK-P9-NEXT: xscvdpsxws f8, f7 +; CHECK-P9-NEXT: xxswapd vs7, vs7 ; CHECK-P9-NEXT: lxv vs2, 80(r3) ; CHECK-P9-NEXT: lxv vs3, 64(r3) ; CHECK-P9-NEXT: lxv vs4, 48(r3) ; CHECK-P9-NEXT: lxv vs5, 32(r3) +; CHECK-P9-NEXT: xscvdpsxws f7, f7 ; CHECK-P9-NEXT: mffprwz r3, f8 ; CHECK-P9-NEXT: mtvsrd v2, r3 ; CHECK-P9-NEXT: mffprwz r3, f7 @@ -1054,16 +1054,16 @@ ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs7, 112(r3) -; CHECK-BE-NEXT: xscvdpsxws f8, f7 -; CHECK-BE-NEXT: xxswapd vs7, vs7 -; CHECK-BE-NEXT: xscvdpsxws f7, f7 ; CHECK-BE-NEXT: lxv vs6, 96(r3) ; CHECK-BE-NEXT: lxv vs0, 0(r3) ; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: xscvdpsxws f8, f7 +; CHECK-BE-NEXT: xxswapd vs7, vs7 ; CHECK-BE-NEXT: lxv vs2, 32(r3) ; CHECK-BE-NEXT: lxv vs3, 48(r3) ; CHECK-BE-NEXT: lxv vs4, 64(r3) ; CHECK-BE-NEXT: lxv vs5, 80(r3) +; CHECK-BE-NEXT: xscvdpsxws f7, f7 ; CHECK-BE-NEXT: mffprwz r3, f8 ; CHECK-BE-NEXT: sldi r3, r3, 56 ; CHECK-BE-NEXT: mtvsrd v2, r3 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll @@ -40,9 +40,9 @@ ; CHECK-P9-NEXT: mtfprwz f0, r3 ; CHECK-P9-NEXT: li r3, 2 ; CHECK-P9-NEXT: xscvuxdsp f0, f0 -; CHECK-P9-NEXT: xscvdpspn vs0, f0 ; CHECK-P9-NEXT: vextuhrx r3, r3, v2 ; CHECK-P9-NEXT: clrlwi r3, r3, 16 +; CHECK-P9-NEXT: xscvdpspn vs0, f0 ; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 3 ; CHECK-P9-NEXT: mtfprwz f0, r3 ; CHECK-P9-NEXT: xscvuxdsp f0, f0 @@ -98,9 +98,9 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrd v2, r3 ; CHECK-BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r3, r3, .LCPI1_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r3 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: vperm v2, v2, v4, v3 ; CHECK-BE-NEXT: xvcvuxwsp v2, v2 ; CHECK-BE-NEXT: blr @@ -137,9 +137,9 @@ ; CHECK-BE-LABEL: test8elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI2_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI2_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI2_1@toc@l ; CHECK-BE-NEXT: vperm v3, v2, v4, v3 @@ -190,9 +190,9 @@ ; CHECK-P9-NEXT: lxv v2, 16(r4) ; CHECK-P9-NEXT: lxv v3, 0(r4) ; CHECK-P9-NEXT: addis r4, r2, .LCPI3_0@toc@ha +; CHECK-P9-NEXT: xxlxor v5, v5, v5 ; CHECK-P9-NEXT: addi r4, r4, .LCPI3_0@toc@l ; CHECK-P9-NEXT: lxvx v4, 0, r4 -; CHECK-P9-NEXT: xxlxor v5, v5, v5 ; CHECK-P9-NEXT: addis r4, r2, .LCPI3_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI3_1@toc@l ; CHECK-P9-NEXT: vperm v0, v5, v3, v4 @@ -215,9 +215,9 @@ ; CHECK-BE-NEXT: lxv v2, 16(r4) ; CHECK-BE-NEXT: lxv v3, 0(r4) ; CHECK-BE-NEXT: addis r4, r2, .LCPI3_0@toc@ha +; CHECK-BE-NEXT: xxlxor v5, v5, v5 ; CHECK-BE-NEXT: addi r4, r4, .LCPI3_0@toc@l ; CHECK-BE-NEXT: lxvx v4, 0, r4 -; CHECK-BE-NEXT: xxlxor v5, v5, v5 ; CHECK-BE-NEXT: addis r4, r2, .LCPI3_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI3_1@toc@l ; CHECK-BE-NEXT: vperm v0, v3, v5, v4 @@ -272,9 +272,9 @@ ; CHECK-P9-NEXT: mtfprwa f0, r3 ; CHECK-P9-NEXT: li r3, 2 ; CHECK-P9-NEXT: xscvsxdsp f0, f0 -; CHECK-P9-NEXT: xscvdpspn vs0, f0 ; CHECK-P9-NEXT: vextuhrx r3, r3, v2 ; CHECK-P9-NEXT: extsh r3, r3 +; CHECK-P9-NEXT: xscvdpspn vs0, f0 ; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 3 ; CHECK-P9-NEXT: mtfprwa f0, r3 ; CHECK-P9-NEXT: xscvsxdsp f0, f0 @@ -375,9 +375,9 @@ ; CHECK-BE-LABEL: test8elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI6_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: vperm v3, v4, v2, v3 ; CHECK-BE-NEXT: vmrghh v2, v2, v2 ; CHECK-BE-NEXT: vextsh2w v3, v3 @@ -432,10 +432,10 @@ ; CHECK-P9-NEXT: vmrglh v4, v3, v3 ; CHECK-P9-NEXT: vmrghh v3, v3, v3 ; CHECK-P9-NEXT: vextsh2w v3, v3 +; CHECK-P9-NEXT: vextsh2w v4, v4 ; CHECK-P9-NEXT: xvcvsxwsp vs1, v3 ; CHECK-P9-NEXT: vmrglh v3, v2, v2 ; CHECK-P9-NEXT: vmrghh v2, v2, v2 -; CHECK-P9-NEXT: vextsh2w v4, v4 ; CHECK-P9-NEXT: xvcvsxwsp vs0, v4 ; CHECK-P9-NEXT: vextsh2w v3, v3 ; CHECK-P9-NEXT: vextsh2w v2, v2 @@ -452,9 +452,9 @@ ; CHECK-BE-NEXT: lxv v2, 16(r4) ; CHECK-BE-NEXT: lxv v3, 0(r4) ; CHECK-BE-NEXT: addis r4, r2, .LCPI7_0@toc@ha +; CHECK-BE-NEXT: xxlxor v5, v5, v5 ; CHECK-BE-NEXT: addi r4, r4, .LCPI7_0@toc@l ; CHECK-BE-NEXT: lxvx v4, 0, r4 -; CHECK-BE-NEXT: xxlxor v5, v5, v5 ; CHECK-BE-NEXT: vperm v0, v5, v3, v4 ; CHECK-BE-NEXT: vperm v4, v5, v2, v4 ; CHECK-BE-NEXT: vmrghh v3, v3, v3 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp64_elts.ll @@ -25,9 +25,9 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mtvsrws v2, r3 ; CHECK-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r3 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: vperm v2, v4, v2, v3 ; CHECK-P9-NEXT: xvcvuxddp v2, v2 ; CHECK-P9-NEXT: blr @@ -36,9 +36,9 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrws v2, r3 ; CHECK-BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r3, r3, .LCPI0_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r3 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: vperm v2, v2, v4, v3 ; CHECK-BE-NEXT: xvcvuxddp v2, v2 ; CHECK-BE-NEXT: blr @@ -74,9 +74,9 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mtvsrd v2, r4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI1_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addi r4, r4, .LCPI1_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r4 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI1_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI1_1@toc@l ; CHECK-P9-NEXT: vperm v3, v4, v2, v3 @@ -92,9 +92,9 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrd v2, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI1_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI1_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI1_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI1_1@toc@l ; CHECK-BE-NEXT: vperm v3, v2, v4, v3 @@ -152,9 +152,9 @@ ; CHECK-P9-LABEL: test8elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addi r4, r4, .LCPI2_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r4 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI2_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI2_1@toc@l ; CHECK-P9-NEXT: vperm v3, v4, v2, v3 @@ -181,9 +181,9 @@ ; CHECK-BE-LABEL: test8elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI2_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI2_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI2_1@toc@l ; CHECK-BE-NEXT: vperm v3, v2, v4, v3 @@ -276,9 +276,9 @@ ; CHECK-P9-NEXT: lxv v2, 16(r4) ; CHECK-P9-NEXT: lxv v3, 0(r4) ; CHECK-P9-NEXT: addis r4, r2, .LCPI3_0@toc@ha +; CHECK-P9-NEXT: xxlxor v5, v5, v5 ; CHECK-P9-NEXT: addi r4, r4, .LCPI3_0@toc@l ; CHECK-P9-NEXT: lxvx v4, 0, r4 -; CHECK-P9-NEXT: xxlxor v5, v5, v5 ; CHECK-P9-NEXT: addis r4, r2, .LCPI3_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI3_1@toc@l ; CHECK-P9-NEXT: vperm v0, v5, v3, v4 @@ -319,9 +319,9 @@ ; CHECK-BE-NEXT: lxv v2, 16(r4) ; CHECK-BE-NEXT: lxv v3, 0(r4) ; CHECK-BE-NEXT: addis r4, r2, .LCPI3_0@toc@ha +; CHECK-BE-NEXT: xxlxor v5, v5, v5 ; CHECK-BE-NEXT: addi r4, r4, .LCPI3_0@toc@l ; CHECK-BE-NEXT: lxvx v4, 0, r4 -; CHECK-BE-NEXT: xxlxor v5, v5, v5 ; CHECK-BE-NEXT: addis r4, r2, .LCPI3_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI3_1@toc@l ; CHECK-BE-NEXT: vperm v0, v3, v5, v4 @@ -459,13 +459,13 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrd v2, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI5_0@toc@ha +; CHECK-BE-NEXT: xxlxor v3, v3, v3 ; CHECK-BE-NEXT: addi r4, r4, .LCPI5_0@toc@l ; CHECK-BE-NEXT: lxvx v4, 0, r4 -; CHECK-BE-NEXT: xxlxor v3, v3, v3 -; CHECK-BE-NEXT: vperm v3, v3, v2, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI5_1@toc@ha -; CHECK-BE-NEXT: vextsh2d v3, v3 ; CHECK-BE-NEXT: addi r4, r4, .LCPI5_1@toc@l +; CHECK-BE-NEXT: vperm v3, v3, v2, v4 +; CHECK-BE-NEXT: vextsh2d v3, v3 ; CHECK-BE-NEXT: xvcvsxddp vs0, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 ; CHECK-BE-NEXT: vperm v2, v2, v2, v3 @@ -564,12 +564,12 @@ ; CHECK-BE-LABEL: test8elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI6_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 -; CHECK-BE-NEXT: vperm v3, v4, v2, v3 ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI6_1@toc@l +; CHECK-BE-NEXT: vperm v3, v4, v2, v3 ; CHECK-BE-NEXT: vextsh2d v3, v3 ; CHECK-BE-NEXT: xvcvsxddp vs0, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 @@ -680,8 +680,8 @@ ; CHECK-P9-LABEL: test16elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: addis r5, r2, .LCPI7_0@toc@ha -; CHECK-P9-NEXT: addi r5, r5, .LCPI7_0@toc@l ; CHECK-P9-NEXT: lxv v2, 0(r4) +; CHECK-P9-NEXT: addi r5, r5, .LCPI7_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r5 ; CHECK-P9-NEXT: addis r5, r2, .LCPI7_1@toc@ha ; CHECK-P9-NEXT: addi r5, r5, .LCPI7_1@toc@l @@ -700,16 +700,17 @@ ; CHECK-P9-NEXT: xvcvsxddp vs1, v4 ; CHECK-P9-NEXT: vperm v4, v2, v2, v0 ; CHECK-P9-NEXT: vperm v2, v2, v2, v1 +; CHECK-P9-NEXT: stxv vs0, 0(r3) ; CHECK-P9-NEXT: vextsh2d v4, v4 ; CHECK-P9-NEXT: xvcvsxddp vs2, v4 ; CHECK-P9-NEXT: lxv v4, 16(r4) +; CHECK-P9-NEXT: stxv vs1, 16(r3) ; CHECK-P9-NEXT: vextsh2d v2, v2 ; CHECK-P9-NEXT: xvcvsxddp vs3, v2 ; CHECK-P9-NEXT: vperm v2, v4, v4, v3 ; CHECK-P9-NEXT: stxv vs2, 32(r3) ; CHECK-P9-NEXT: vextsh2d v2, v2 ; CHECK-P9-NEXT: stxv vs3, 48(r3) -; CHECK-P9-NEXT: stxv vs1, 16(r3) ; CHECK-P9-NEXT: xvcvsxddp vs4, v2 ; CHECK-P9-NEXT: vperm v2, v4, v4, v5 ; CHECK-P9-NEXT: vextsh2d v2, v2 @@ -720,60 +721,59 @@ ; CHECK-P9-NEXT: xvcvsxddp vs6, v2 ; CHECK-P9-NEXT: vperm v2, v4, v4, v1 ; CHECK-P9-NEXT: stxv vs5, 80(r3) -; CHECK-P9-NEXT: stxv vs6, 96(r3) ; CHECK-P9-NEXT: vextsh2d v2, v2 ; CHECK-P9-NEXT: xvcvsxddp vs7, v2 +; CHECK-P9-NEXT: stxv vs6, 96(r3) ; CHECK-P9-NEXT: stxv vs7, 112(r3) -; CHECK-P9-NEXT: stxv vs0, 0(r3) ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r5, r2, .LCPI7_0@toc@ha -; CHECK-BE-NEXT: addi r5, r5, .LCPI7_0@toc@l -; CHECK-BE-NEXT: lxvx v2, 0, r5 ; CHECK-BE-NEXT: lxv v4, 0(r4) ; CHECK-BE-NEXT: lxv v1, 16(r4) +; CHECK-BE-NEXT: xxlxor v5, v5, v5 +; CHECK-BE-NEXT: addis r4, r2, .LCPI7_2@toc@ha +; CHECK-BE-NEXT: addi r5, r5, .LCPI7_0@toc@l +; CHECK-BE-NEXT: addi r4, r4, .LCPI7_2@toc@l +; CHECK-BE-NEXT: lxvx v2, 0, r5 ; CHECK-BE-NEXT: addis r5, r2, .LCPI7_1@toc@ha ; CHECK-BE-NEXT: addi r5, r5, .LCPI7_1@toc@l -; CHECK-BE-NEXT: addis r4, r2, .LCPI7_2@toc@ha -; CHECK-BE-NEXT: xxlxor v5, v5, v5 -; CHECK-BE-NEXT: vperm v0, v5, v4, v2 ; CHECK-BE-NEXT: lxvx v3, 0, r5 +; CHECK-BE-NEXT: vperm v0, v5, v4, v2 ; CHECK-BE-NEXT: vperm v2, v5, v1, v2 ; CHECK-BE-NEXT: vextsh2d v2, v2 -; CHECK-BE-NEXT: addi r4, r4, .LCPI7_2@toc@l ; CHECK-BE-NEXT: vextsh2d v0, v0 ; CHECK-BE-NEXT: xvcvsxddp vs2, v2 ; CHECK-BE-NEXT: vperm v2, v5, v1, v3 +; CHECK-BE-NEXT: xvcvsxddp vs0, v0 +; CHECK-BE-NEXT: vperm v0, v5, v4, v3 ; CHECK-BE-NEXT: vextsh2d v2, v2 -; CHECK-BE-NEXT: stxv vs2, 80(r3) +; CHECK-BE-NEXT: vextsh2d v0, v0 ; CHECK-BE-NEXT: xvcvsxddp vs3, v2 ; CHECK-BE-NEXT: lxvx v2, 0, r4 -; CHECK-BE-NEXT: xvcvsxddp vs0, v0 -; CHECK-BE-NEXT: vperm v0, v5, v4, v3 -; CHECK-BE-NEXT: vperm v3, v4, v4, v2 ; CHECK-BE-NEXT: addis r4, r2, .LCPI7_3@toc@ha -; CHECK-BE-NEXT: vextsh2d v0, v0 ; CHECK-BE-NEXT: xvcvsxddp vs1, v0 +; CHECK-BE-NEXT: addi r4, r4, .LCPI7_3@toc@l +; CHECK-BE-NEXT: stxv vs2, 80(r3) +; CHECK-BE-NEXT: stxv vs0, 16(r3) +; CHECK-BE-NEXT: vperm v3, v4, v4, v2 +; CHECK-BE-NEXT: vperm v2, v1, v1, v2 +; CHECK-BE-NEXT: stxv vs3, 112(r3) ; CHECK-BE-NEXT: stxv vs1, 48(r3) ; CHECK-BE-NEXT: vextsh2d v3, v3 -; CHECK-BE-NEXT: addi r4, r4, .LCPI7_3@toc@l +; CHECK-BE-NEXT: vextsh2d v2, v2 ; CHECK-BE-NEXT: xvcvsxddp vs4, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: vperm v2, v1, v1, v2 -; CHECK-BE-NEXT: vextsh2d v2, v2 ; CHECK-BE-NEXT: xvcvsxddp vs6, v2 -; CHECK-BE-NEXT: vperm v2, v1, v1, v3 ; CHECK-BE-NEXT: vperm v4, v4, v4, v3 +; CHECK-BE-NEXT: vperm v2, v1, v1, v3 +; CHECK-BE-NEXT: stxv vs6, 64(r3) +; CHECK-BE-NEXT: stxv vs4, 0(r3) ; CHECK-BE-NEXT: vextsh2d v4, v4 ; CHECK-BE-NEXT: vextsh2d v2, v2 -; CHECK-BE-NEXT: xvcvsxddp vs7, v2 ; CHECK-BE-NEXT: xvcvsxddp vs5, v4 -; CHECK-BE-NEXT: stxv vs3, 112(r3) -; CHECK-BE-NEXT: stxv vs6, 64(r3) -; CHECK-BE-NEXT: stxv vs0, 16(r3) -; CHECK-BE-NEXT: stxv vs4, 0(r3) +; CHECK-BE-NEXT: xvcvsxddp vs7, v2 ; CHECK-BE-NEXT: stxv vs7, 96(r3) ; CHECK-BE-NEXT: stxv vs5, 32(r3) ; CHECK-BE-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i32_to_fp64_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i32_to_fp64_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_i32_to_fp64_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_i32_to_fp64_elts.ll @@ -106,8 +106,8 @@ ; CHECK-P9-LABEL: test8elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r4) -; CHECK-P9-NEXT: xxmrglw v2, vs1, vs1 ; CHECK-P9-NEXT: lxv vs0, 16(r4) +; CHECK-P9-NEXT: xxmrglw v2, vs1, vs1 ; CHECK-P9-NEXT: xvcvuxwdp vs2, v2 ; CHECK-P9-NEXT: xxmrghw v2, vs1, vs1 ; CHECK-P9-NEXT: xvcvuxwdp vs1, v2 @@ -124,8 +124,8 @@ ; CHECK-BE-LABEL: test8elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 0(r4) -; CHECK-BE-NEXT: xxmrghw v2, vs1, vs1 ; CHECK-BE-NEXT: lxv vs0, 16(r4) +; CHECK-BE-NEXT: xxmrghw v2, vs1, vs1 ; CHECK-BE-NEXT: xvcvuxwdp vs2, v2 ; CHECK-BE-NEXT: xxmrglw v2, vs1, vs1 ; CHECK-BE-NEXT: xvcvuxwdp vs1, v2 @@ -196,12 +196,12 @@ ; CHECK-P9-LABEL: test16elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs0, 0(r4) -; CHECK-P9-NEXT: xxmrglw v2, vs0, vs0 ; CHECK-P9-NEXT: lxv vs2, 16(r4) +; CHECK-P9-NEXT: lxv vs5, 32(r4) ; CHECK-P9-NEXT: lxv vs4, 48(r4) +; CHECK-P9-NEXT: xxmrglw v2, vs0, vs0 ; CHECK-P9-NEXT: xvcvuxwdp vs1, v2 ; CHECK-P9-NEXT: xxmrghw v2, vs0, vs0 -; CHECK-P9-NEXT: lxv vs5, 32(r4) ; CHECK-P9-NEXT: xvcvuxwdp vs0, v2 ; CHECK-P9-NEXT: xxmrglw v2, vs2, vs2 ; CHECK-P9-NEXT: xvcvuxwdp vs3, v2 @@ -228,12 +228,12 @@ ; CHECK-BE-LABEL: test16elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs0, 0(r4) -; CHECK-BE-NEXT: xxmrghw v2, vs0, vs0 ; CHECK-BE-NEXT: lxv vs2, 16(r4) +; CHECK-BE-NEXT: lxv vs5, 32(r4) ; CHECK-BE-NEXT: lxv vs4, 48(r4) +; CHECK-BE-NEXT: xxmrghw v2, vs0, vs0 ; CHECK-BE-NEXT: xvcvuxwdp vs1, v2 ; CHECK-BE-NEXT: xxmrglw v2, vs0, vs0 -; CHECK-BE-NEXT: lxv vs5, 32(r4) ; CHECK-BE-NEXT: xvcvuxwdp vs0, v2 ; CHECK-BE-NEXT: xxmrghw v2, vs2, vs2 ; CHECK-BE-NEXT: xvcvuxwdp vs3, v2 @@ -360,8 +360,8 @@ ; CHECK-P9-LABEL: test8elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs1, 0(r4) -; CHECK-P9-NEXT: xxmrglw v2, vs1, vs1 ; CHECK-P9-NEXT: lxv vs0, 16(r4) +; CHECK-P9-NEXT: xxmrglw v2, vs1, vs1 ; CHECK-P9-NEXT: xvcvsxwdp vs2, v2 ; CHECK-P9-NEXT: xxmrghw v2, vs1, vs1 ; CHECK-P9-NEXT: xvcvsxwdp vs1, v2 @@ -378,8 +378,8 @@ ; CHECK-BE-LABEL: test8elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs1, 0(r4) -; CHECK-BE-NEXT: xxmrghw v2, vs1, vs1 ; CHECK-BE-NEXT: lxv vs0, 16(r4) +; CHECK-BE-NEXT: xxmrghw v2, vs1, vs1 ; CHECK-BE-NEXT: xvcvsxwdp vs2, v2 ; CHECK-BE-NEXT: xxmrglw v2, vs1, vs1 ; CHECK-BE-NEXT: xvcvsxwdp vs1, v2 @@ -450,12 +450,12 @@ ; CHECK-P9-LABEL: test16elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv vs0, 0(r4) -; CHECK-P9-NEXT: xxmrglw v2, vs0, vs0 ; CHECK-P9-NEXT: lxv vs2, 16(r4) +; CHECK-P9-NEXT: lxv vs5, 32(r4) ; CHECK-P9-NEXT: lxv vs4, 48(r4) +; CHECK-P9-NEXT: xxmrglw v2, vs0, vs0 ; CHECK-P9-NEXT: xvcvsxwdp vs1, v2 ; CHECK-P9-NEXT: xxmrghw v2, vs0, vs0 -; CHECK-P9-NEXT: lxv vs5, 32(r4) ; CHECK-P9-NEXT: xvcvsxwdp vs0, v2 ; CHECK-P9-NEXT: xxmrglw v2, vs2, vs2 ; CHECK-P9-NEXT: xvcvsxwdp vs3, v2 @@ -482,12 +482,12 @@ ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs0, 0(r4) -; CHECK-BE-NEXT: xxmrghw v2, vs0, vs0 ; CHECK-BE-NEXT: lxv vs2, 16(r4) +; CHECK-BE-NEXT: lxv vs5, 32(r4) ; CHECK-BE-NEXT: lxv vs4, 48(r4) +; CHECK-BE-NEXT: xxmrghw v2, vs0, vs0 ; CHECK-BE-NEXT: xvcvsxwdp vs1, v2 ; CHECK-BE-NEXT: xxmrglw v2, vs0, vs0 -; CHECK-BE-NEXT: lxv vs5, 32(r4) ; CHECK-BE-NEXT: xvcvsxwdp vs0, v2 ; CHECK-BE-NEXT: xxmrghw v2, vs2, vs2 ; CHECK-BE-NEXT: xvcvsxwdp vs3, v2 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i64_to_fp32_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i64_to_fp32_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_i64_to_fp32_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_i64_to_fp32_elts.ll @@ -74,8 +74,8 @@ ; CHECK-P9-LABEL: test4elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv v3, 0(r3) -; CHECK-P9-NEXT: xvcvuxdsp vs0, v3 ; CHECK-P9-NEXT: lxv v2, 16(r3) +; CHECK-P9-NEXT: xvcvuxdsp vs0, v3 ; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvuxdsp vs0, v2 ; CHECK-P9-NEXT: xxsldwi v2, vs0, vs0, 3 @@ -85,8 +85,8 @@ ; CHECK-BE-LABEL: test4elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv v3, 16(r3) -; CHECK-BE-NEXT: xvcvuxdsp vs0, v3 ; CHECK-BE-NEXT: lxv v2, 0(r3) +; CHECK-BE-NEXT: xvcvuxdsp vs0, v3 ; CHECK-BE-NEXT: xxsldwi v3, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvuxdsp vs0, v2 ; CHECK-BE-NEXT: xxsldwi v2, vs0, vs0, 3 @@ -129,14 +129,14 @@ ; CHECK-P9-LABEL: test8elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv v5, 0(r4) -; CHECK-P9-NEXT: xvcvuxdsp vs0, v5 ; CHECK-P9-NEXT: lxv v4, 16(r4) +; CHECK-P9-NEXT: lxv v3, 32(r4) +; CHECK-P9-NEXT: lxv v2, 48(r4) +; CHECK-P9-NEXT: xvcvuxdsp vs0, v5 ; CHECK-P9-NEXT: xxsldwi v5, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvuxdsp vs0, v4 -; CHECK-P9-NEXT: lxv v3, 32(r4) ; CHECK-P9-NEXT: xxsldwi v4, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvuxdsp vs0, v3 -; CHECK-P9-NEXT: lxv v2, 48(r4) ; CHECK-P9-NEXT: vpkudum v3, v4, v5 ; CHECK-P9-NEXT: stxv v3, 0(r3) ; CHECK-P9-NEXT: xxsldwi v4, vs0, vs0, 3 @@ -149,14 +149,14 @@ ; CHECK-BE-LABEL: test8elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv v5, 16(r4) -; CHECK-BE-NEXT: xvcvuxdsp vs0, v5 ; CHECK-BE-NEXT: lxv v4, 0(r4) +; CHECK-BE-NEXT: lxv v3, 48(r4) +; CHECK-BE-NEXT: lxv v2, 32(r4) +; CHECK-BE-NEXT: xvcvuxdsp vs0, v5 ; CHECK-BE-NEXT: xxsldwi v5, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvuxdsp vs0, v4 -; CHECK-BE-NEXT: lxv v3, 48(r4) ; CHECK-BE-NEXT: xxsldwi v4, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvuxdsp vs0, v3 -; CHECK-BE-NEXT: lxv v2, 32(r4) ; CHECK-BE-NEXT: vpkudum v3, v4, v5 ; CHECK-BE-NEXT: stxv v3, 0(r3) ; CHECK-BE-NEXT: xxsldwi v4, vs0, vs0, 3 @@ -227,30 +227,30 @@ ; CHECK-P9-LABEL: test16elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv v7, 0(r4) -; CHECK-P9-NEXT: xvcvuxdsp vs0, v7 ; CHECK-P9-NEXT: lxv v6, 16(r4) +; CHECK-P9-NEXT: lxv v1, 32(r4) +; CHECK-P9-NEXT: lxv v0, 48(r4) +; CHECK-P9-NEXT: xvcvuxdsp vs0, v7 +; CHECK-P9-NEXT: lxv v5, 64(r4) +; CHECK-P9-NEXT: lxv v4, 80(r4) +; CHECK-P9-NEXT: lxv v3, 96(r4) +; CHECK-P9-NEXT: lxv v2, 112(r4) ; CHECK-P9-NEXT: xxsldwi v7, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvuxdsp vs0, v6 -; CHECK-P9-NEXT: lxv v1, 32(r4) ; CHECK-P9-NEXT: xxsldwi v6, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvuxdsp vs0, v1 -; CHECK-P9-NEXT: lxv v0, 48(r4) ; CHECK-P9-NEXT: vpkudum v1, v6, v7 +; CHECK-P9-NEXT: stxv v1, 0(r3) ; CHECK-P9-NEXT: xxsldwi v6, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvuxdsp vs0, v0 -; CHECK-P9-NEXT: lxv v5, 64(r4) -; CHECK-P9-NEXT: stxv v1, 0(r3) ; CHECK-P9-NEXT: xxsldwi v0, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvuxdsp vs0, v5 -; CHECK-P9-NEXT: lxv v4, 80(r4) ; CHECK-P9-NEXT: vpkudum v0, v0, v6 ; CHECK-P9-NEXT: stxv v0, 16(r3) ; CHECK-P9-NEXT: xxsldwi v5, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvuxdsp vs0, v4 -; CHECK-P9-NEXT: lxv v3, 96(r4) ; CHECK-P9-NEXT: xxsldwi v4, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvuxdsp vs0, v3 -; CHECK-P9-NEXT: lxv v2, 112(r4) ; CHECK-P9-NEXT: vpkudum v4, v4, v5 ; CHECK-P9-NEXT: stxv v4, 32(r3) ; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 3 @@ -263,30 +263,30 @@ ; CHECK-BE-LABEL: test16elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv v7, 16(r4) -; CHECK-BE-NEXT: xvcvuxdsp vs0, v7 ; CHECK-BE-NEXT: lxv v6, 0(r4) +; CHECK-BE-NEXT: lxv v1, 48(r4) +; CHECK-BE-NEXT: lxv v0, 32(r4) +; CHECK-BE-NEXT: xvcvuxdsp vs0, v7 +; CHECK-BE-NEXT: lxv v5, 80(r4) +; CHECK-BE-NEXT: lxv v4, 64(r4) +; CHECK-BE-NEXT: lxv v3, 112(r4) +; CHECK-BE-NEXT: lxv v2, 96(r4) ; CHECK-BE-NEXT: xxsldwi v7, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvuxdsp vs0, v6 -; CHECK-BE-NEXT: lxv v1, 48(r4) ; CHECK-BE-NEXT: xxsldwi v6, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvuxdsp vs0, v1 -; CHECK-BE-NEXT: lxv v0, 32(r4) ; CHECK-BE-NEXT: vpkudum v1, v6, v7 +; CHECK-BE-NEXT: stxv v1, 0(r3) ; CHECK-BE-NEXT: xxsldwi v6, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvuxdsp vs0, v0 -; CHECK-BE-NEXT: lxv v5, 80(r4) -; CHECK-BE-NEXT: stxv v1, 0(r3) ; CHECK-BE-NEXT: xxsldwi v0, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvuxdsp vs0, v5 -; CHECK-BE-NEXT: lxv v4, 64(r4) ; CHECK-BE-NEXT: vpkudum v0, v0, v6 ; CHECK-BE-NEXT: stxv v0, 16(r3) ; CHECK-BE-NEXT: xxsldwi v5, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvuxdsp vs0, v4 -; CHECK-BE-NEXT: lxv v3, 112(r4) ; CHECK-BE-NEXT: xxsldwi v4, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvuxdsp vs0, v3 -; CHECK-BE-NEXT: lxv v2, 96(r4) ; CHECK-BE-NEXT: vpkudum v4, v4, v5 ; CHECK-BE-NEXT: stxv v4, 32(r3) ; CHECK-BE-NEXT: xxsldwi v3, vs0, vs0, 3 @@ -367,8 +367,8 @@ ; CHECK-P9-LABEL: test4elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv v3, 0(r3) -; CHECK-P9-NEXT: xvcvsxdsp vs0, v3 ; CHECK-P9-NEXT: lxv v2, 16(r3) +; CHECK-P9-NEXT: xvcvsxdsp vs0, v3 ; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvsxdsp vs0, v2 ; CHECK-P9-NEXT: xxsldwi v2, vs0, vs0, 3 @@ -378,8 +378,8 @@ ; CHECK-BE-LABEL: test4elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv v3, 16(r3) -; CHECK-BE-NEXT: xvcvsxdsp vs0, v3 ; CHECK-BE-NEXT: lxv v2, 0(r3) +; CHECK-BE-NEXT: xvcvsxdsp vs0, v3 ; CHECK-BE-NEXT: xxsldwi v3, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvsxdsp vs0, v2 ; CHECK-BE-NEXT: xxsldwi v2, vs0, vs0, 3 @@ -422,14 +422,14 @@ ; CHECK-P9-LABEL: test8elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv v5, 0(r4) -; CHECK-P9-NEXT: xvcvsxdsp vs0, v5 ; CHECK-P9-NEXT: lxv v4, 16(r4) +; CHECK-P9-NEXT: lxv v3, 32(r4) +; CHECK-P9-NEXT: lxv v2, 48(r4) +; CHECK-P9-NEXT: xvcvsxdsp vs0, v5 ; CHECK-P9-NEXT: xxsldwi v5, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvsxdsp vs0, v4 -; CHECK-P9-NEXT: lxv v3, 32(r4) ; CHECK-P9-NEXT: xxsldwi v4, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvsxdsp vs0, v3 -; CHECK-P9-NEXT: lxv v2, 48(r4) ; CHECK-P9-NEXT: vpkudum v3, v4, v5 ; CHECK-P9-NEXT: stxv v3, 0(r3) ; CHECK-P9-NEXT: xxsldwi v4, vs0, vs0, 3 @@ -442,14 +442,14 @@ ; CHECK-BE-LABEL: test8elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv v5, 16(r4) -; CHECK-BE-NEXT: xvcvsxdsp vs0, v5 ; CHECK-BE-NEXT: lxv v4, 0(r4) +; CHECK-BE-NEXT: lxv v3, 48(r4) +; CHECK-BE-NEXT: lxv v2, 32(r4) +; CHECK-BE-NEXT: xvcvsxdsp vs0, v5 ; CHECK-BE-NEXT: xxsldwi v5, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvsxdsp vs0, v4 -; CHECK-BE-NEXT: lxv v3, 48(r4) ; CHECK-BE-NEXT: xxsldwi v4, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvsxdsp vs0, v3 -; CHECK-BE-NEXT: lxv v2, 32(r4) ; CHECK-BE-NEXT: vpkudum v3, v4, v5 ; CHECK-BE-NEXT: stxv v3, 0(r3) ; CHECK-BE-NEXT: xxsldwi v4, vs0, vs0, 3 @@ -520,30 +520,30 @@ ; CHECK-P9-LABEL: test16elt_signed: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lxv v7, 0(r4) -; CHECK-P9-NEXT: xvcvsxdsp vs0, v7 ; CHECK-P9-NEXT: lxv v6, 16(r4) +; CHECK-P9-NEXT: lxv v1, 32(r4) +; CHECK-P9-NEXT: lxv v0, 48(r4) +; CHECK-P9-NEXT: xvcvsxdsp vs0, v7 +; CHECK-P9-NEXT: lxv v5, 64(r4) +; CHECK-P9-NEXT: lxv v4, 80(r4) +; CHECK-P9-NEXT: lxv v3, 96(r4) +; CHECK-P9-NEXT: lxv v2, 112(r4) ; CHECK-P9-NEXT: xxsldwi v7, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvsxdsp vs0, v6 -; CHECK-P9-NEXT: lxv v1, 32(r4) ; CHECK-P9-NEXT: xxsldwi v6, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvsxdsp vs0, v1 -; CHECK-P9-NEXT: lxv v0, 48(r4) ; CHECK-P9-NEXT: vpkudum v1, v6, v7 +; CHECK-P9-NEXT: stxv v1, 0(r3) ; CHECK-P9-NEXT: xxsldwi v6, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvsxdsp vs0, v0 -; CHECK-P9-NEXT: lxv v5, 64(r4) -; CHECK-P9-NEXT: stxv v1, 0(r3) ; CHECK-P9-NEXT: xxsldwi v0, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvsxdsp vs0, v5 -; CHECK-P9-NEXT: lxv v4, 80(r4) ; CHECK-P9-NEXT: vpkudum v0, v0, v6 ; CHECK-P9-NEXT: stxv v0, 16(r3) ; CHECK-P9-NEXT: xxsldwi v5, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvsxdsp vs0, v4 -; CHECK-P9-NEXT: lxv v3, 96(r4) ; CHECK-P9-NEXT: xxsldwi v4, vs0, vs0, 3 ; CHECK-P9-NEXT: xvcvsxdsp vs0, v3 -; CHECK-P9-NEXT: lxv v2, 112(r4) ; CHECK-P9-NEXT: vpkudum v4, v4, v5 ; CHECK-P9-NEXT: stxv v4, 32(r3) ; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 3 @@ -556,30 +556,30 @@ ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv v7, 16(r4) -; CHECK-BE-NEXT: xvcvsxdsp vs0, v7 ; CHECK-BE-NEXT: lxv v6, 0(r4) +; CHECK-BE-NEXT: lxv v1, 48(r4) +; CHECK-BE-NEXT: lxv v0, 32(r4) +; CHECK-BE-NEXT: xvcvsxdsp vs0, v7 +; CHECK-BE-NEXT: lxv v5, 80(r4) +; CHECK-BE-NEXT: lxv v4, 64(r4) +; CHECK-BE-NEXT: lxv v3, 112(r4) +; CHECK-BE-NEXT: lxv v2, 96(r4) ; CHECK-BE-NEXT: xxsldwi v7, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvsxdsp vs0, v6 -; CHECK-BE-NEXT: lxv v1, 48(r4) ; CHECK-BE-NEXT: xxsldwi v6, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvsxdsp vs0, v1 -; CHECK-BE-NEXT: lxv v0, 32(r4) ; CHECK-BE-NEXT: vpkudum v1, v6, v7 +; CHECK-BE-NEXT: stxv v1, 0(r3) ; CHECK-BE-NEXT: xxsldwi v6, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvsxdsp vs0, v0 -; CHECK-BE-NEXT: lxv v5, 80(r4) -; CHECK-BE-NEXT: stxv v1, 0(r3) ; CHECK-BE-NEXT: xxsldwi v0, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvsxdsp vs0, v5 -; CHECK-BE-NEXT: lxv v4, 64(r4) ; CHECK-BE-NEXT: vpkudum v0, v0, v6 ; CHECK-BE-NEXT: stxv v0, 16(r3) ; CHECK-BE-NEXT: xxsldwi v5, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvsxdsp vs0, v4 -; CHECK-BE-NEXT: lxv v3, 112(r4) ; CHECK-BE-NEXT: xxsldwi v4, vs0, vs0, 3 ; CHECK-BE-NEXT: xvcvsxdsp vs0, v3 -; CHECK-BE-NEXT: lxv v2, 96(r4) ; CHECK-BE-NEXT: vpkudum v4, v4, v5 ; CHECK-BE-NEXT: stxv v4, 32(r3) ; CHECK-BE-NEXT: xxsldwi v3, vs0, vs0, 3 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll @@ -40,9 +40,9 @@ ; CHECK-P9-NEXT: mtfprwz f0, r3 ; CHECK-P9-NEXT: li r3, 1 ; CHECK-P9-NEXT: xscvuxdsp f0, f0 -; CHECK-P9-NEXT: xscvdpspn vs0, f0 ; CHECK-P9-NEXT: vextubrx r3, r3, v2 ; CHECK-P9-NEXT: clrlwi r3, r3, 24 +; CHECK-P9-NEXT: xscvdpspn vs0, f0 ; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 3 ; CHECK-P9-NEXT: mtfprwz f0, r3 ; CHECK-P9-NEXT: xscvuxdsp f0, f0 @@ -93,9 +93,9 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mtvsrws v2, r3 ; CHECK-P9-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addi r3, r3, .LCPI1_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r3 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: vperm v2, v4, v2, v3 ; CHECK-P9-NEXT: xvcvuxwsp v2, v2 ; CHECK-P9-NEXT: blr @@ -104,9 +104,9 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrws v2, r3 ; CHECK-BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r3, r3, .LCPI1_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r3 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: vperm v2, v2, v4, v3 ; CHECK-BE-NEXT: xvcvuxwsp v2, v2 ; CHECK-BE-NEXT: blr @@ -140,9 +140,9 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mtvsrd v2, r4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addi r4, r4, .LCPI2_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r4 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI2_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI2_1@toc@l ; CHECK-P9-NEXT: vperm v3, v4, v2, v3 @@ -158,9 +158,9 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrd v2, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI2_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI2_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI2_1@toc@l ; CHECK-BE-NEXT: vperm v3, v2, v4, v3 @@ -214,9 +214,9 @@ ; CHECK-P9-LABEL: test16elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: addis r4, r2, .LCPI3_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addi r4, r4, .LCPI3_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r4 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI3_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI3_1@toc@l ; CHECK-P9-NEXT: vperm v3, v4, v2, v3 @@ -243,9 +243,9 @@ ; CHECK-BE-LABEL: test16elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r4, r2, .LCPI3_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI3_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI3_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI3_1@toc@l ; CHECK-BE-NEXT: vperm v3, v2, v4, v3 @@ -305,9 +305,9 @@ ; CHECK-P9-NEXT: mtfprwa f0, r3 ; CHECK-P9-NEXT: li r3, 1 ; CHECK-P9-NEXT: xscvsxdsp f0, f0 -; CHECK-P9-NEXT: xscvdpspn vs0, f0 ; CHECK-P9-NEXT: vextubrx r3, r3, v2 ; CHECK-P9-NEXT: extsb r3, r3 +; CHECK-P9-NEXT: xscvdpspn vs0, f0 ; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 3 ; CHECK-P9-NEXT: mtfprwa f0, r3 ; CHECK-P9-NEXT: xscvsxdsp f0, f0 @@ -432,13 +432,13 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrd v2, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_0@toc@ha +; CHECK-BE-NEXT: xxlxor v3, v3, v3 ; CHECK-BE-NEXT: addi r4, r4, .LCPI6_0@toc@l ; CHECK-BE-NEXT: lxvx v4, 0, r4 -; CHECK-BE-NEXT: xxlxor v3, v3, v3 -; CHECK-BE-NEXT: vperm v3, v3, v2, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_1@toc@ha -; CHECK-BE-NEXT: vextsb2w v3, v3 ; CHECK-BE-NEXT: addi r4, r4, .LCPI6_1@toc@l +; CHECK-BE-NEXT: vperm v3, v3, v2, v4 +; CHECK-BE-NEXT: vextsb2w v3, v3 ; CHECK-BE-NEXT: xvcvsxwsp vs0, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 ; CHECK-BE-NEXT: vperm v2, v2, v2, v3 @@ -531,12 +531,12 @@ ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r4, r2, .LCPI7_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI7_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 -; CHECK-BE-NEXT: vperm v3, v4, v2, v3 ; CHECK-BE-NEXT: addis r4, r2, .LCPI7_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI7_1@toc@l +; CHECK-BE-NEXT: vperm v3, v4, v2, v3 ; CHECK-BE-NEXT: vextsb2w v3, v3 ; CHECK-BE-NEXT: xvcvsxwsp vs0, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll @@ -25,9 +25,9 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mtvsrws v2, r3 ; CHECK-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r3 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: vperm v2, v4, v2, v3 ; CHECK-P9-NEXT: xvcvuxddp v2, v2 ; CHECK-P9-NEXT: blr @@ -36,9 +36,9 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrws v2, r3 ; CHECK-BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r3, r3, .LCPI0_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r3 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: vperm v2, v2, v4, v3 ; CHECK-BE-NEXT: xvcvuxddp v2, v2 ; CHECK-BE-NEXT: blr @@ -74,9 +74,9 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mtvsrws v2, r4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI1_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addi r4, r4, .LCPI1_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r4 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI1_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI1_1@toc@l ; CHECK-P9-NEXT: vperm v3, v4, v2, v3 @@ -92,9 +92,9 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrws v2, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI1_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI1_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI1_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI1_1@toc@l ; CHECK-BE-NEXT: vperm v3, v2, v4, v3 @@ -154,9 +154,9 @@ ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mtvsrd v2, r4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addi r4, r4, .LCPI2_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r4 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI2_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI2_1@toc@l ; CHECK-P9-NEXT: vperm v3, v4, v2, v3 @@ -184,9 +184,9 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrd v2, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI2_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI2_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI2_1@toc@l ; CHECK-BE-NEXT: vperm v3, v2, v4, v3 @@ -288,9 +288,9 @@ ; CHECK-P9-LABEL: test16elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: addis r4, r2, .LCPI3_0@toc@ha +; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addi r4, r4, .LCPI3_0@toc@l ; CHECK-P9-NEXT: lxvx v3, 0, r4 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 ; CHECK-P9-NEXT: addis r4, r2, .LCPI3_1@toc@ha ; CHECK-P9-NEXT: addi r4, r4, .LCPI3_1@toc@l ; CHECK-P9-NEXT: vperm v3, v4, v2, v3 @@ -341,9 +341,9 @@ ; CHECK-BE-LABEL: test16elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r4, r2, .LCPI3_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI3_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI3_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI3_1@toc@l ; CHECK-BE-NEXT: vperm v3, v2, v4, v3 @@ -492,13 +492,13 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrws v2, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI5_0@toc@ha +; CHECK-BE-NEXT: xxlxor v3, v3, v3 ; CHECK-BE-NEXT: addi r4, r4, .LCPI5_0@toc@l ; CHECK-BE-NEXT: lxvx v4, 0, r4 -; CHECK-BE-NEXT: xxlxor v3, v3, v3 -; CHECK-BE-NEXT: vperm v3, v3, v2, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI5_1@toc@ha -; CHECK-BE-NEXT: vextsb2d v3, v3 ; CHECK-BE-NEXT: addi r4, r4, .LCPI5_1@toc@l +; CHECK-BE-NEXT: vperm v3, v3, v2, v4 +; CHECK-BE-NEXT: vextsb2d v3, v3 ; CHECK-BE-NEXT: xvcvsxddp vs0, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 ; CHECK-BE-NEXT: vperm v2, v2, v2, v3 @@ -600,13 +600,13 @@ ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrd v2, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_0@toc@ha +; CHECK-BE-NEXT: xxlxor v4, v4, v4 ; CHECK-BE-NEXT: addi r4, r4, .LCPI6_0@toc@l ; CHECK-BE-NEXT: lxvx v3, 0, r4 -; CHECK-BE-NEXT: xxlxor v4, v4, v4 -; CHECK-BE-NEXT: vperm v3, v4, v2, v3 ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_1@toc@ha -; CHECK-BE-NEXT: vextsb2d v3, v3 ; CHECK-BE-NEXT: addi r4, r4, .LCPI6_1@toc@l +; CHECK-BE-NEXT: vperm v3, v4, v2, v3 +; CHECK-BE-NEXT: vextsb2d v3, v3 ; CHECK-BE-NEXT: xvcvsxddp vs0, v3 ; CHECK-BE-NEXT: lxvx v3, 0, r4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI6_2@toc@ha @@ -787,12 +787,12 @@ ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: addis r4, r2, .LCPI7_0@toc@ha +; CHECK-BE-NEXT: xxlxor v3, v3, v3 ; CHECK-BE-NEXT: addi r4, r4, .LCPI7_0@toc@l ; CHECK-BE-NEXT: lxvx v4, 0, r4 -; CHECK-BE-NEXT: xxlxor v3, v3, v3 -; CHECK-BE-NEXT: vperm v4, v3, v2, v4 ; CHECK-BE-NEXT: addis r4, r2, .LCPI7_1@toc@ha ; CHECK-BE-NEXT: addi r4, r4, .LCPI7_1@toc@l +; CHECK-BE-NEXT: vperm v4, v3, v2, v4 ; CHECK-BE-NEXT: vextsb2d v4, v4 ; CHECK-BE-NEXT: xvcvsxddp vs0, v4 ; CHECK-BE-NEXT: lxvx v4, 0, r4 diff --git a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll @@ -305,8 +305,8 @@ ; PC64LE9-NEXT: addis 3, 2, .LCPI6_2@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfs 1, .LCPI6_2@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfs 1, .LCPI6_2@toc@l(3) ; PC64LE9-NEXT: bl fmod ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload @@ -390,24 +390,24 @@ ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI7_2@toc@ha ; PC64LE9-NEXT: fmr 30, 1 -; PC64LE9-NEXT: lfs 1, .LCPI7_2@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfs 1, .LCPI7_2@toc@l(3) ; PC64LE9-NEXT: bl fmodf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI7_3@toc@ha ; PC64LE9-NEXT: fmr 29, 1 -; PC64LE9-NEXT: lfs 1, .LCPI7_3@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfs 1, .LCPI7_3@toc@l(3) ; PC64LE9-NEXT: bl fmodf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI7_4@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI7_4@toc@l +; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 29 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 30 -; PC64LE9-NEXT: addis 3, 2, .LCPI7_4@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI7_4@toc@l -; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: vmrghw 2, 3, 2 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: vperm 2, 3, 2, 4 @@ -478,27 +478,27 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -80(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI8_0@toc@ha +; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill +; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: lfs 1, .LCPI8_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI8_1@toc@ha -; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill ; PC64LE9-NEXT: lfs 31, .LCPI8_1@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 -; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: bl fmod ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI8_2@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfs 1, .LCPI8_2@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfs 1, .LCPI8_2@toc@l(3) ; PC64LE9-NEXT: bl fmod ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload ; PC64LE9-NEXT: addis 3, 2, .LCPI8_3@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: fmr 2, 31 ; PC64LE9-NEXT: xxmrghd 63, 1, 0 ; PC64LE9-NEXT: lfs 1, .LCPI8_3@toc@l(3) -; PC64LE9-NEXT: fmr 2, 31 ; PC64LE9-NEXT: bl fmod ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: fmr 3, 1 @@ -580,34 +580,34 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -80(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI9_0@toc@ha +; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill +; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: lfs 1, .LCPI9_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI9_1@toc@ha -; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill ; PC64LE9-NEXT: lfs 31, .LCPI9_1@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 -; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: bl fmod ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI9_2@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfs 1, .LCPI9_2@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfs 1, .LCPI9_2@toc@l(3) ; PC64LE9-NEXT: bl fmod ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload ; PC64LE9-NEXT: addis 3, 2, .LCPI9_3@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: fmr 2, 31 ; PC64LE9-NEXT: xxmrghd 63, 1, 0 ; PC64LE9-NEXT: lfs 1, .LCPI9_3@toc@l(3) -; PC64LE9-NEXT: fmr 2, 31 ; PC64LE9-NEXT: bl fmod ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI9_4@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfs 1, .LCPI9_4@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfs 1, .LCPI9_4@toc@l(3) ; PC64LE9-NEXT: bl fmod ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload @@ -935,23 +935,23 @@ ; PC64LE9-LABEL: constrained_vector_fadd_v3f32: ; PC64LE9: # %bb.0: # %entry ; PC64LE9-NEXT: addis 3, 2, .LCPI17_0@toc@ha +; PC64LE9-NEXT: xxlxor 1, 1, 1 ; PC64LE9-NEXT: lfs 0, .LCPI17_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI17_1@toc@ha ; PC64LE9-NEXT: lfs 2, .LCPI17_1@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI17_2@toc@ha -; PC64LE9-NEXT: xsaddsp 2, 0, 2 ; PC64LE9-NEXT: lfs 3, .LCPI17_2@toc@l(3) -; PC64LE9-NEXT: xxlxor 1, 1, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI17_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI17_3@toc@l ; PC64LE9-NEXT: xsaddsp 1, 0, 1 +; PC64LE9-NEXT: lxvx 36, 0, 3 +; PC64LE9-NEXT: xsaddsp 2, 0, 2 ; PC64LE9-NEXT: xsaddsp 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 0 ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 2 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 1 -; PC64LE9-NEXT: addis 3, 2, .LCPI17_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI17_3@toc@l -; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: vmrghw 2, 3, 2 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: vperm 2, 3, 2, 4 @@ -990,9 +990,9 @@ ; PC64LE9-LABEL: constrained_vector_fadd_v3f64: ; PC64LE9: # %bb.0: # %entry ; PC64LE9-NEXT: addis 3, 2, .LCPI18_0@toc@ha +; PC64LE9-NEXT: xxlxor 1, 1, 1 ; PC64LE9-NEXT: lfd 0, .LCPI18_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI18_1@toc@ha -; PC64LE9-NEXT: xxlxor 1, 1, 1 ; PC64LE9-NEXT: addi 3, 3, .LCPI18_1@toc@l ; PC64LE9-NEXT: xsadddp 3, 0, 1 ; PC64LE9-NEXT: lxvx 0, 0, 3 @@ -1147,23 +1147,23 @@ ; PC64LE9-LABEL: constrained_vector_fsub_v3f32: ; PC64LE9: # %bb.0: # %entry ; PC64LE9-NEXT: addis 3, 2, .LCPI22_0@toc@ha +; PC64LE9-NEXT: xxlxor 1, 1, 1 ; PC64LE9-NEXT: lfs 0, .LCPI22_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI22_1@toc@ha ; PC64LE9-NEXT: lfs 2, .LCPI22_1@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI22_2@toc@ha -; PC64LE9-NEXT: xssubsp 2, 0, 2 ; PC64LE9-NEXT: lfs 3, .LCPI22_2@toc@l(3) -; PC64LE9-NEXT: xxlxor 1, 1, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI22_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI22_3@toc@l ; PC64LE9-NEXT: xssubsp 1, 0, 1 +; PC64LE9-NEXT: lxvx 36, 0, 3 +; PC64LE9-NEXT: xssubsp 2, 0, 2 ; PC64LE9-NEXT: xssubsp 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 0 ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 2 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 1 -; PC64LE9-NEXT: addis 3, 2, .LCPI22_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI22_3@toc@l -; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: vmrghw 2, 3, 2 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: vperm 2, 3, 2, 4 @@ -1202,9 +1202,9 @@ ; PC64LE9-LABEL: constrained_vector_fsub_v3f64: ; PC64LE9: # %bb.0: # %entry ; PC64LE9-NEXT: addis 3, 2, .LCPI23_0@toc@ha +; PC64LE9-NEXT: xxlxor 1, 1, 1 ; PC64LE9-NEXT: lfd 0, .LCPI23_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI23_1@toc@ha -; PC64LE9-NEXT: xxlxor 1, 1, 1 ; PC64LE9-NEXT: addi 3, 3, .LCPI23_1@toc@l ; PC64LE9-NEXT: xssubdp 3, 0, 1 ; PC64LE9-NEXT: lxvx 0, 0, 3 @@ -1534,8 +1534,8 @@ ; PC64LE9-NEXT: addis 3, 2, .LCPI31_2@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfd 1, .LCPI31_2@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfd 1, .LCPI31_2@toc@l(3) ; PC64LE9-NEXT: bl pow ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload @@ -1619,24 +1619,24 @@ ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI32_2@toc@ha ; PC64LE9-NEXT: fmr 30, 1 -; PC64LE9-NEXT: lfs 1, .LCPI32_2@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfs 1, .LCPI32_2@toc@l(3) ; PC64LE9-NEXT: bl powf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI32_3@toc@ha ; PC64LE9-NEXT: fmr 29, 1 -; PC64LE9-NEXT: lfs 1, .LCPI32_3@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfs 1, .LCPI32_3@toc@l(3) ; PC64LE9-NEXT: bl powf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI32_4@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI32_4@toc@l +; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 29 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 30 -; PC64LE9-NEXT: addis 3, 2, .LCPI32_4@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI32_4@toc@l -; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: vmrghw 2, 3, 2 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: vperm 2, 3, 2, 4 @@ -1707,27 +1707,27 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -80(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI33_0@toc@ha +; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill +; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: lfs 1, .LCPI33_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI33_1@toc@ha -; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill ; PC64LE9-NEXT: lfs 31, .LCPI33_1@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 -; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: bl pow ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI33_2@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfd 1, .LCPI33_2@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfd 1, .LCPI33_2@toc@l(3) ; PC64LE9-NEXT: bl pow ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload ; PC64LE9-NEXT: addis 3, 2, .LCPI33_3@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: fmr 2, 31 ; PC64LE9-NEXT: xxmrghd 63, 1, 0 ; PC64LE9-NEXT: lfd 1, .LCPI33_3@toc@l(3) -; PC64LE9-NEXT: fmr 2, 31 ; PC64LE9-NEXT: bl pow ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: fmr 3, 1 @@ -1809,34 +1809,34 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -80(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI34_0@toc@ha +; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill +; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: lfd 1, .LCPI34_0@toc@l(3) ; PC64LE9-NEXT: addis 3, 2, .LCPI34_1@toc@ha -; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill ; PC64LE9-NEXT: lfs 31, .LCPI34_1@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 -; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill ; PC64LE9-NEXT: bl pow ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI34_2@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfd 1, .LCPI34_2@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfd 1, .LCPI34_2@toc@l(3) ; PC64LE9-NEXT: bl pow ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload ; PC64LE9-NEXT: addis 3, 2, .LCPI34_3@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: fmr 2, 31 ; PC64LE9-NEXT: xxmrghd 63, 1, 0 ; PC64LE9-NEXT: lfd 1, .LCPI34_3@toc@l(3) -; PC64LE9-NEXT: fmr 2, 31 ; PC64LE9-NEXT: bl pow ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI34_4@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfd 1, .LCPI34_4@toc@l(3) ; PC64LE9-NEXT: fmr 2, 31 +; PC64LE9-NEXT: lfd 1, .LCPI34_4@toc@l(3) ; PC64LE9-NEXT: bl pow ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload @@ -1882,8 +1882,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -32(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI35_0@toc@ha -; PC64LE9-NEXT: lfs 1, .LCPI35_0@toc@l(3) ; PC64LE9-NEXT: li 4, 3 +; PC64LE9-NEXT: lfs 1, .LCPI35_0@toc@l(3) ; PC64LE9-NEXT: bl __powisf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addi 1, 1, 32 @@ -1933,15 +1933,15 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -48(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI36_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI36_0@toc@l(3) ; PC64LE9-NEXT: li 4, 3 +; PC64LE9-NEXT: lfd 1, .LCPI36_0@toc@l(3) ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI36_1@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfd 1, .LCPI36_1@toc@l(3) ; PC64LE9-NEXT: li 4, 3 +; PC64LE9-NEXT: lfd 1, .LCPI36_1@toc@l(3) ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload @@ -2013,30 +2013,30 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -48(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI37_0@toc@ha -; PC64LE9-NEXT: lfs 1, .LCPI37_0@toc@l(3) ; PC64LE9-NEXT: li 4, 3 +; PC64LE9-NEXT: lfs 1, .LCPI37_0@toc@l(3) ; PC64LE9-NEXT: bl __powisf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI37_1@toc@ha ; PC64LE9-NEXT: fmr 31, 1 -; PC64LE9-NEXT: lfs 1, .LCPI37_1@toc@l(3) ; PC64LE9-NEXT: li 4, 3 +; PC64LE9-NEXT: lfs 1, .LCPI37_1@toc@l(3) ; PC64LE9-NEXT: bl __powisf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI37_2@toc@ha ; PC64LE9-NEXT: fmr 30, 1 -; PC64LE9-NEXT: lfs 1, .LCPI37_2@toc@l(3) ; PC64LE9-NEXT: li 4, 3 +; PC64LE9-NEXT: lfs 1, .LCPI37_2@toc@l(3) ; PC64LE9-NEXT: bl __powisf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI37_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI37_3@toc@l +; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 30 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 31 -; PC64LE9-NEXT: addis 3, 2, .LCPI37_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI37_3@toc@l -; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: vmrghw 2, 3, 2 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: vperm 2, 3, 2, 4 @@ -2102,24 +2102,24 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI38_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI38_0@toc@l(3) ; PC64LE9-NEXT: li 4, 3 ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI38_0@toc@l(3) ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI38_1@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfs 1, .LCPI38_1@toc@l(3) ; PC64LE9-NEXT: li 4, 3 +; PC64LE9-NEXT: lfs 1, .LCPI38_1@toc@l(3) ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload ; PC64LE9-NEXT: addis 3, 2, .LCPI38_2@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: li 4, 3 ; PC64LE9-NEXT: xxmrghd 63, 0, 1 ; PC64LE9-NEXT: lfd 1, .LCPI38_2@toc@l(3) -; PC64LE9-NEXT: li 4, 3 ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: fmr 3, 1 @@ -2196,31 +2196,31 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI39_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI39_0@toc@l(3) ; PC64LE9-NEXT: li 4, 3 ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI39_0@toc@l(3) ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI39_1@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfd 1, .LCPI39_1@toc@l(3) ; PC64LE9-NEXT: li 4, 3 +; PC64LE9-NEXT: lfd 1, .LCPI39_1@toc@l(3) ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload ; PC64LE9-NEXT: addis 3, 2, .LCPI39_2@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: li 4, 3 ; PC64LE9-NEXT: xxmrghd 63, 1, 0 ; PC64LE9-NEXT: lfd 1, .LCPI39_2@toc@l(3) -; PC64LE9-NEXT: li 4, 3 ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI39_3@toc@ha ; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill -; PC64LE9-NEXT: lfd 1, .LCPI39_3@toc@l(3) ; PC64LE9-NEXT: li 4, 3 +; PC64LE9-NEXT: lfd 1, .LCPI39_3@toc@l(3) ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload @@ -2396,12 +2396,12 @@ ; PC64LE9-NEXT: bl sinf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI42_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI42_3@toc@l ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 30 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 31 -; PC64LE9-NEXT: addis 3, 2, .LCPI42_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI42_3@toc@l ; PC64LE9-NEXT: vmrghw 2, 3, 2 ; PC64LE9-NEXT: lxvx 35, 0, 3 ; PC64LE9-NEXT: xxsldwi 36, 0, 0, 3 @@ -2464,8 +2464,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI43_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI43_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI43_0@toc@l(3) ; PC64LE9-NEXT: bl sin ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI43_1@toc@ha @@ -2550,8 +2550,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI44_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI44_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI44_0@toc@l(3) ; PC64LE9-NEXT: bl sin ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI44_1@toc@ha @@ -2745,12 +2745,12 @@ ; PC64LE9-NEXT: bl cosf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI47_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI47_3@toc@l ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 30 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 31 -; PC64LE9-NEXT: addis 3, 2, .LCPI47_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI47_3@toc@l ; PC64LE9-NEXT: vmrghw 2, 3, 2 ; PC64LE9-NEXT: lxvx 35, 0, 3 ; PC64LE9-NEXT: xxsldwi 36, 0, 0, 3 @@ -2813,8 +2813,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI48_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI48_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI48_0@toc@l(3) ; PC64LE9-NEXT: bl cos ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI48_1@toc@ha @@ -2899,8 +2899,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI49_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI49_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI49_0@toc@l(3) ; PC64LE9-NEXT: bl cos ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI49_1@toc@ha @@ -3094,12 +3094,12 @@ ; PC64LE9-NEXT: bl expf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI52_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI52_3@toc@l ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 30 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 31 -; PC64LE9-NEXT: addis 3, 2, .LCPI52_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI52_3@toc@l ; PC64LE9-NEXT: vmrghw 2, 3, 2 ; PC64LE9-NEXT: lxvx 35, 0, 3 ; PC64LE9-NEXT: xxsldwi 36, 0, 0, 3 @@ -3162,8 +3162,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI53_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI53_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI53_0@toc@l(3) ; PC64LE9-NEXT: bl exp ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI53_1@toc@ha @@ -3248,8 +3248,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI54_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI54_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI54_0@toc@l(3) ; PC64LE9-NEXT: bl exp ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI54_1@toc@ha @@ -3443,12 +3443,12 @@ ; PC64LE9-NEXT: bl exp2f ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI57_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI57_3@toc@l ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 30 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 31 -; PC64LE9-NEXT: addis 3, 2, .LCPI57_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI57_3@toc@l ; PC64LE9-NEXT: vmrghw 2, 3, 2 ; PC64LE9-NEXT: lxvx 35, 0, 3 ; PC64LE9-NEXT: xxsldwi 36, 0, 0, 3 @@ -3511,8 +3511,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI58_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI58_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI58_0@toc@l(3) ; PC64LE9-NEXT: bl exp2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI58_1@toc@ha @@ -3597,8 +3597,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI59_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI59_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI59_0@toc@l(3) ; PC64LE9-NEXT: bl exp2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI59_1@toc@ha @@ -3792,12 +3792,12 @@ ; PC64LE9-NEXT: bl logf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI62_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI62_3@toc@l ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 30 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 31 -; PC64LE9-NEXT: addis 3, 2, .LCPI62_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI62_3@toc@l ; PC64LE9-NEXT: vmrghw 2, 3, 2 ; PC64LE9-NEXT: lxvx 35, 0, 3 ; PC64LE9-NEXT: xxsldwi 36, 0, 0, 3 @@ -3860,8 +3860,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI63_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI63_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI63_0@toc@l(3) ; PC64LE9-NEXT: bl log ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI63_1@toc@ha @@ -3946,8 +3946,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI64_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI64_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI64_0@toc@l(3) ; PC64LE9-NEXT: bl log ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI64_1@toc@ha @@ -4141,12 +4141,12 @@ ; PC64LE9-NEXT: bl log10f ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI67_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI67_3@toc@l ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 30 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 31 -; PC64LE9-NEXT: addis 3, 2, .LCPI67_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI67_3@toc@l ; PC64LE9-NEXT: vmrghw 2, 3, 2 ; PC64LE9-NEXT: lxvx 35, 0, 3 ; PC64LE9-NEXT: xxsldwi 36, 0, 0, 3 @@ -4209,8 +4209,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI68_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI68_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI68_0@toc@l(3) ; PC64LE9-NEXT: bl log10 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI68_1@toc@ha @@ -4295,8 +4295,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI69_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI69_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI69_0@toc@l(3) ; PC64LE9-NEXT: bl log10 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI69_1@toc@ha @@ -4490,12 +4490,12 @@ ; PC64LE9-NEXT: bl log2f ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI72_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI72_3@toc@l ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 30 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 31 -; PC64LE9-NEXT: addis 3, 2, .LCPI72_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI72_3@toc@l ; PC64LE9-NEXT: vmrghw 2, 3, 2 ; PC64LE9-NEXT: lxvx 35, 0, 3 ; PC64LE9-NEXT: xxsldwi 36, 0, 0, 3 @@ -4558,8 +4558,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI73_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI73_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI73_0@toc@l(3) ; PC64LE9-NEXT: bl log2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI73_1@toc@ha @@ -4644,8 +4644,8 @@ ; PC64LE9-NEXT: std 0, 16(1) ; PC64LE9-NEXT: stdu 1, -64(1) ; PC64LE9-NEXT: addis 3, 2, .LCPI74_0@toc@ha -; PC64LE9-NEXT: lfd 1, .LCPI74_0@toc@l(3) ; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: lfd 1, .LCPI74_0@toc@l(3) ; PC64LE9-NEXT: bl log2 ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI74_1@toc@ha @@ -4983,12 +4983,12 @@ ; PC64LE9-NEXT: bl nearbyintf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI82_3@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI82_3@toc@l ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 30 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 31 -; PC64LE9-NEXT: addis 3, 2, .LCPI82_3@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI82_3@toc@l ; PC64LE9-NEXT: vmrghw 2, 3, 2 ; PC64LE9-NEXT: lxvx 35, 0, 3 ; PC64LE9-NEXT: xxsldwi 36, 0, 0, 3 @@ -5221,19 +5221,19 @@ ; PC64LE9-NEXT: bl fmaxf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI87_4@toc@ha -; PC64LE9-NEXT: lfs 2, .LCPI87_4@toc@l(3) ; PC64LE9-NEXT: fmr 29, 1 ; PC64LE9-NEXT: fmr 1, 31 +; PC64LE9-NEXT: lfs 2, .LCPI87_4@toc@l(3) ; PC64LE9-NEXT: bl fmaxf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI87_5@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI87_5@toc@l +; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 29 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 30 -; PC64LE9-NEXT: addis 3, 2, .LCPI87_5@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI87_5@toc@l -; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: vmrghw 2, 3, 2 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: vperm 2, 3, 2, 4 @@ -5294,11 +5294,11 @@ ; PC64LE9-NEXT: bl fmax ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI88_2@toc@ha +; PC64LE9-NEXT: fmr 3, 1 ; PC64LE9-NEXT: addi 3, 3, .LCPI88_2@toc@l ; PC64LE9-NEXT: lxvx 0, 0, 3 ; PC64LE9-NEXT: addis 3, 2, .LCPI88_3@toc@ha ; PC64LE9-NEXT: addi 3, 3, .LCPI88_3@toc@l -; PC64LE9-NEXT: fmr 3, 1 ; PC64LE9-NEXT: lxvx 1, 0, 3 ; PC64LE9-NEXT: xvmaxdp 2, 1, 0 ; PC64LE9-NEXT: xxswapd 1, 2 @@ -5508,19 +5508,19 @@ ; PC64LE9-NEXT: bl fminf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI92_4@toc@ha -; PC64LE9-NEXT: lfs 2, .LCPI92_4@toc@l(3) ; PC64LE9-NEXT: fmr 29, 1 ; PC64LE9-NEXT: fmr 1, 31 +; PC64LE9-NEXT: lfs 2, .LCPI92_4@toc@l(3) ; PC64LE9-NEXT: bl fminf ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: xscvdpspn 0, 1 +; PC64LE9-NEXT: addis 3, 2, .LCPI92_5@toc@ha +; PC64LE9-NEXT: addi 3, 3, .LCPI92_5@toc@l +; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 29 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: xscvdpspn 0, 30 -; PC64LE9-NEXT: addis 3, 2, .LCPI92_5@toc@ha -; PC64LE9-NEXT: addi 3, 3, .LCPI92_5@toc@l -; PC64LE9-NEXT: lxvx 36, 0, 3 ; PC64LE9-NEXT: vmrghw 2, 3, 2 ; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3 ; PC64LE9-NEXT: vperm 2, 3, 2, 4 @@ -5581,11 +5581,11 @@ ; PC64LE9-NEXT: bl fmin ; PC64LE9-NEXT: nop ; PC64LE9-NEXT: addis 3, 2, .LCPI93_2@toc@ha +; PC64LE9-NEXT: fmr 3, 1 ; PC64LE9-NEXT: addi 3, 3, .LCPI93_2@toc@l ; PC64LE9-NEXT: lxvx 0, 0, 3 ; PC64LE9-NEXT: addis 3, 2, .LCPI93_3@toc@ha ; PC64LE9-NEXT: addi 3, 3, .LCPI93_3@toc@l -; PC64LE9-NEXT: fmr 3, 1 ; PC64LE9-NEXT: lxvx 1, 0, 3 ; PC64LE9-NEXT: xvmindp 2, 1, 0 ; PC64LE9-NEXT: xxswapd 1, 2 diff --git a/llvm/test/CodeGen/X86/testb-je-fusion.ll b/llvm/test/CodeGen/X86/testb-je-fusion.ll --- a/llvm/test/CodeGen/X86/testb-je-fusion.ll +++ b/llvm/test/CodeGen/X86/testb-je-fusion.ll @@ -238,8 +238,8 @@ ; NOFUSION_MISCHEDPOSTRA-LABEL: macrofuse_alu_je: ; NOFUSION_MISCHEDPOSTRA: # %bb.0: # %entry ; NOFUSION_MISCHEDPOSTRA-NEXT: movl %edi, %eax -; NOFUSION_MISCHEDPOSTRA-NEXT: addl $-512, %eax # imm = 0xFE00 ; NOFUSION_MISCHEDPOSTRA-NEXT: movb $1, (%rsi) +; NOFUSION_MISCHEDPOSTRA-NEXT: addl $-512, %eax # imm = 0xFE00 ; NOFUSION_MISCHEDPOSTRA-NEXT: je .LBB2_2 ; NOFUSION_MISCHEDPOSTRA-NEXT: # %bb.1: # %if.then ; NOFUSION_MISCHEDPOSTRA-NEXT: movl $1, %eax @@ -249,8 +249,8 @@ ; BRANCHFUSIONONLY_MISCHEDPOSTRA-LABEL: macrofuse_alu_je: ; BRANCHFUSIONONLY_MISCHEDPOSTRA: # %bb.0: # %entry ; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: movl %edi, %eax -; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: addl $-512, %eax # imm = 0xFE00 ; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: movb $1, (%rsi) +; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: addl $-512, %eax # imm = 0xFE00 ; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: je .LBB2_2 ; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: # %bb.1: # %if.then ; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: movl $1, %eax @@ -340,8 +340,8 @@ ; NOFUSION_MISCHEDPOSTRA-LABEL: macrofuse_dec_je: ; NOFUSION_MISCHEDPOSTRA: # %bb.0: # %entry ; NOFUSION_MISCHEDPOSTRA-NEXT: movl %edi, %eax -; NOFUSION_MISCHEDPOSTRA-NEXT: decl %eax ; NOFUSION_MISCHEDPOSTRA-NEXT: movb $1, (%rsi) +; NOFUSION_MISCHEDPOSTRA-NEXT: decl %eax ; NOFUSION_MISCHEDPOSTRA-NEXT: je .LBB3_2 ; NOFUSION_MISCHEDPOSTRA-NEXT: # %bb.1: # %if.then ; NOFUSION_MISCHEDPOSTRA-NEXT: movl $1, %eax @@ -351,8 +351,8 @@ ; BRANCHFUSIONONLY_MISCHEDPOSTRA-LABEL: macrofuse_dec_je: ; BRANCHFUSIONONLY_MISCHEDPOSTRA: # %bb.0: # %entry ; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: movl %edi, %eax -; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: decl %eax ; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: movb $1, (%rsi) +; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: decl %eax ; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: je .LBB3_2 ; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: # %bb.1: # %if.then ; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: movl $1, %eax diff --git a/llvm/test/CodeGen/X86/topdepthreduce-postra.mir b/llvm/test/CodeGen/X86/topdepthreduce-postra.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/topdepthreduce-postra.mir @@ -0,0 +1,16 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=x86_64 -enable-post-misched -run-pass=postmisched -o - %s | FileCheck %s +--- +# Check that postmisched's TopDepthReduce heuristic moves the DEC32r later +# because of the dependency on eax +name: test +body: | + bb.0: + ; CHECK-LABEL: name: test + ; CHECK: $eax = MOV32rr killed $edi + ; CHECK: MOV8mi killed renamable $rsi, 1, $noreg, 0, $noreg, 1 :: (store 1) + ; CHECK: renamable $eax = DEC32r killed renamable $eax, implicit-def $eflags + $eax = MOV32rr $edi + renamable $eax = DEC32r killed renamable $eax, implicit-def $eflags + MOV8mi killed renamable $rsi, 1, $noreg, 0, $noreg, 1 :: (store 1) +...