diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -52,6 +52,12 @@
     cl::desc("Break large PHI nodes for DAGISel"),
     cl::ReallyHidden, cl::init(true));
 
+static cl::opt<bool>
+    ForceScalarizeLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
+                            cl::desc("For testing purposes, always break large "
+                                     "PHIs even if it isn't profitable."),
+                            cl::ReallyHidden, cl::init(false));
+
 static cl::opt<unsigned> ScalarizeLargePHIsThreshold(
     "amdgpu-codegenprepare-break-large-phis-threshold",
     cl::desc("Minimum type size in bits for breaking large PHI nodes"),
@@ -1394,6 +1400,50 @@
   return Changed;
 }
 
+/// Helper for breaking large PHIs: returns true when an extractelement on \p V
+/// is likely to be combined away (and is thus free or even beneficial).
+///
+/// The visible caller passes each incoming value of the PHI as \p V. \p FVT
+/// supplies the number of elements an insertelement chain has to cover.
+/// Constants always qualify; an insertelement qualifies only if its chain of
+/// constant-index inserts, all in one basic block, covers every element.
+static bool isInterestingPHIIncomingValue(Value *V, FixedVectorType *FVT) {
+  InsertElementInst *IE = dyn_cast<InsertElementInst>(V);
+
+  // Constants & InsertElements are interesting.
+  if (!IE)
+    return isa<Constant>(V);
+
+  // Check if this is a simple chain of insertelement that fills the vector. If
+  // that's the case, we can break up this PHI node profitably because the
+  // extractelement we will insert will get folded out.
+  BasicBlock *BB = IE->getParent();
+  const unsigned NumElts = FVT->getNumElements();
+  BitVector EltsCovered(NumElts);
+  InsertElementInst *Next = IE;
+  while (Next && !EltsCovered.all()) {
+    ConstantInt *Idx = dyn_cast<ConstantInt>(Next->getOperand(2));
+
+    // Non constant index/out of bounds index -> folding is unlikely.
+    // insertelement indices are unsigned, so compare them as unsigned: a
+    // signed read would let a constant such as i32 -1 slip past the bound
+    // check and then index EltsCovered out of range.
+    if (!Idx || Idx->getValue().uge(NumElts))
+      return false;
+
+    // In range, so the value fits in 64 bits whatever the index type's width.
+    EltsCovered.set(Idx->getZExtValue());
+    Next = dyn_cast<InsertElementInst>(Next->getOperand(0));
+
+    // If the chain is spread across basic blocks -> folding is unlikely.
+    if (Next && Next->getParent() != BB)
+      return false;
+  }
+
+  // All elements covered: folding is likely.
+  return EltsCovered.all();
+}
+
 bool AMDGPUCodeGenPrepare::visitPHINode(PHINode &I) {
   // Break-up fixed-vector PHIs into smaller pieces.
   // Default threshold is 32, so it breaks up any vector that's >32 bits into
@@ -1412,6 +1462,22 @@
   if (!FVT || DL->getTypeSizeInBits(FVT) <= ScalarizeLargePHIsThreshold)
     return false;
 
+  // Try to avoid unprofitable cases:
+  // - Don't break PHIs that have no interesting incoming values. That is, where
+  // there is no clear opportunity to fold the "extractelement" instructions we
+  // would add.
+  // - For simplicity, don't break PHIs that are used by other PHIs because it'd
+  // require us to determine if the whole "chain" can be converted or not. e.g.
+  // if we broke this PHI but not its user, we would actually make things worse.
+  if (!ForceScalarizeLargePHIs) {
+    if (none_of(
+            I.incoming_values(),
+            [&](Value *V) { return isInterestingPHIIncomingValue(V, FVT); }) ||
+        any_of(I.users(), [&](User *U) { return isa<PHINode>(U); })) {
+      return false;
+    }
+  }
+
   struct VectorSlice {
     Type *Ty = nullptr;
     unsigned Idx = 0;
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -514,112 +514,114 @@
 define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg2, i64 %arg3, <2 x half> %arg4, <2 x half> %arg5) #3 {
 ; GFX908-LABEL: introduced_copy_to_sgpr:
 ; GFX908: ; %bb.0: ; %bb
-; GFX908-NEXT: global_load_ushort v8, v[0:1], off glc
+; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc
 ; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX908-NEXT: s_mov_b32 s9, 0
-; GFX908-NEXT: s_load_dword s4, s[4:5], 0x18
-; GFX908-NEXT: v_mov_b32_e32 v11, 0
+; GFX908-NEXT: s_load_dword s9, s[4:5], 0x18
+; GFX908-NEXT: s_mov_b32 s8, 0
+; GFX908-NEXT: s_mov_b32 s5, s8
 ; GFX908-NEXT:
s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX908-NEXT: s_sub_i32 s5, 0, s3 -; GFX908-NEXT: v_cvt_f32_f16_e32 v9, s4 -; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v0 +; GFX908-NEXT: s_sub_i32 s4, 0, s3 +; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s9 +; GFX908-NEXT: v_mov_b32_e32 v19, 0 +; GFX908-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 -; GFX908-NEXT: v_readfirstlane_b32 s8, v2 -; GFX908-NEXT: s_mul_i32 s5, s5, s8 -; GFX908-NEXT: s_mul_hi_u32 s5, s8, s5 -; GFX908-NEXT: s_add_i32 s8, s8, s5 -; GFX908-NEXT: s_mul_hi_u32 s5, s2, s8 -; GFX908-NEXT: s_mul_i32 s8, s5, s3 -; GFX908-NEXT: s_sub_i32 s2, s2, s8 -; GFX908-NEXT: s_add_i32 s10, s5, 1 -; GFX908-NEXT: s_sub_i32 s8, s2, s3 +; GFX908-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX908-NEXT: v_readfirstlane_b32 s10, v2 +; GFX908-NEXT: s_mul_i32 s4, s4, s10 +; GFX908-NEXT: s_mul_hi_u32 s4, s10, s4 +; GFX908-NEXT: s_add_i32 s10, s10, s4 +; GFX908-NEXT: s_mul_hi_u32 s4, s2, s10 +; GFX908-NEXT: s_mul_i32 s10, s4, s3 +; GFX908-NEXT: s_sub_i32 s2, s2, s10 +; GFX908-NEXT: s_add_i32 s11, s4, 1 +; GFX908-NEXT: s_sub_i32 s10, s2, s3 ; GFX908-NEXT: s_cmp_ge_u32 s2, s3 -; GFX908-NEXT: s_cselect_b32 s5, s10, s5 -; GFX908-NEXT: s_cselect_b32 s2, s8, s2 -; GFX908-NEXT: s_add_i32 s8, s5, 1 +; GFX908-NEXT: s_cselect_b32 s4, s11, s4 +; GFX908-NEXT: s_cselect_b32 s2, s10, s2 +; GFX908-NEXT: s_add_i32 s10, s4, 1 ; GFX908-NEXT: s_cmp_ge_u32 s2, s3 -; GFX908-NEXT: s_cselect_b32 s8, s8, s5 -; GFX908-NEXT: s_lshr_b32 s10, s4, 16 -; GFX908-NEXT: v_cvt_f32_f16_e32 v10, s10 -; GFX908-NEXT: s_lshl_b64 s[10:11], s[8:9], 5 +; GFX908-NEXT: s_cselect_b32 s4, s10, s4 +; GFX908-NEXT: s_lshr_b32 s9, s9, 16 +; GFX908-NEXT: s_lshl_b64 s[12:13], s[4:5], 5 +; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s9 ; GFX908-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 -; GFX908-NEXT: 
s_lshl_b64 s[4:5], s[6:7], 5 -; GFX908-NEXT: s_or_b32 s4, s4, 28 +; GFX908-NEXT: s_lshl_b64 s[10:11], s[6:7], 5 +; GFX908-NEXT: s_or_b32 s10, s10, 28 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s9, v8 -; GFX908-NEXT: s_and_b32 s9, 0xffff, s9 -; GFX908-NEXT: s_mul_i32 s1, s1, s9 -; GFX908-NEXT: s_mul_hi_u32 s12, s0, s9 -; GFX908-NEXT: s_mul_i32 s0, s0, s9 -; GFX908-NEXT: s_add_i32 s1, s12, s1 +; GFX908-NEXT: v_readfirstlane_b32 s5, v16 +; GFX908-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX908-NEXT: s_mul_i32 s1, s1, s5 +; GFX908-NEXT: s_mul_hi_u32 s9, s0, s5 +; GFX908-NEXT: s_mul_i32 s0, s0, s5 +; GFX908-NEXT: s_add_i32 s1, s9, s1 ; GFX908-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 ; GFX908-NEXT: s_branch .LBB3_2 -; GFX908-NEXT: .LBB3_1: ; %Flow56 +; GFX908-NEXT: .LBB3_1: ; %Flow20 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[12:13] +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[14:15] ; GFX908-NEXT: s_cbranch_vccz .LBB3_12 ; GFX908-NEXT: .LBB3_2: ; %bb9 ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB3_5 Depth 2 -; GFX908-NEXT: s_mov_b64 s[14:15], -1 +; GFX908-NEXT: s_mov_b64 s[16:17], -1 ; GFX908-NEXT: s_cbranch_scc0 .LBB3_10 ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX908-NEXT: v_cmp_lt_i64_e64 s[12:13], s[6:7], 0 -; GFX908-NEXT: v_cmp_gt_i64_e64 s[14:15], s[6:7], -1 -; GFX908-NEXT: v_mov_b32_e32 v12, 0 -; GFX908-NEXT: s_mov_b64 s[18:19], s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v18, 0 -; GFX908-NEXT: v_mov_b32_e32 v17, 0 -; GFX908-NEXT: v_mov_b32_e32 v16, 0 -; GFX908-NEXT: v_mov_b32_e32 v15, 0 -; GFX908-NEXT: v_mov_b32_e32 v14, 0 -; GFX908-NEXT: v_mov_b32_e32 v13, 0 -; GFX908-NEXT: v_mov_b32_e32 v19, 0 +; GFX908-NEXT: s_mov_b32 s9, s8 +; GFX908-NEXT: v_mov_b32_e32 v4, s8 +; GFX908-NEXT: v_mov_b32_e32 v8, s8 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: v_mov_b32_e32 v5, s9 +; GFX908-NEXT: 
v_mov_b32_e32 v9, s9 +; GFX908-NEXT: v_mov_b32_e32 v7, s9 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0 +; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1 +; GFX908-NEXT: v_mov_b32_e32 v11, v5 +; GFX908-NEXT: s_mov_b64 s[20:21], s[10:11] +; GFX908-NEXT: v_mov_b32_e32 v10, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s9, v2 -; GFX908-NEXT: v_readfirstlane_b32 s16, v3 -; GFX908-NEXT: s_add_u32 s9, s9, 1 -; GFX908-NEXT: s_addc_u32 s17, s16, 0 -; GFX908-NEXT: s_mul_hi_u32 s20, s2, s9 -; GFX908-NEXT: s_mul_i32 s21, s3, s9 -; GFX908-NEXT: s_mul_i32 s16, s2, s9 -; GFX908-NEXT: s_mul_i32 s9, s2, s17 -; GFX908-NEXT: s_add_i32 s9, s20, s9 -; GFX908-NEXT: s_add_i32 s9, s9, s21 +; GFX908-NEXT: v_readfirstlane_b32 s5, v2 +; GFX908-NEXT: v_readfirstlane_b32 s9, v3 +; GFX908-NEXT: s_add_u32 s5, s5, 1 +; GFX908-NEXT: s_addc_u32 s9, s9, 0 +; GFX908-NEXT: s_mul_hi_u32 s19, s2, s5 +; GFX908-NEXT: s_mul_i32 s22, s3, s5 +; GFX908-NEXT: s_mul_i32 s18, s2, s5 +; GFX908-NEXT: s_mul_i32 s5, s2, s9 +; GFX908-NEXT: s_add_i32 s5, s19, s5 +; GFX908-NEXT: s_add_i32 s5, s5, s22 ; GFX908-NEXT: s_branch .LBB3_5 ; GFX908-NEXT: .LBB3_4: ; %bb58 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX908-NEXT: s_add_u32 s18, s18, s0 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[2:3] -; GFX908-NEXT: s_addc_u32 s19, s19, s1 -; GFX908-NEXT: s_mov_b64 s[20:21], 0 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23] +; GFX908-NEXT: s_add_u32 s20, s20, s0 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3] +; GFX908-NEXT: s_addc_u32 s21, s21, s1 +; GFX908-NEXT: s_mov_b64 s[22:23], 0 +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25] ; GFX908-NEXT: s_cbranch_vccz .LBB3_9 ; GFX908-NEXT: 
.LBB3_5: ; %bb16 ; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: s_add_u32 s20, s18, s16 -; GFX908-NEXT: s_addc_u32 s21, s19, s9 -; GFX908-NEXT: global_load_dword v21, v11, s[20:21] offset:-12 glc +; GFX908-NEXT: s_add_u32 s22, s20, s18 +; GFX908-NEXT: s_addc_u32 s23, s21, s5 +; GFX908-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v20, v11, s[20:21] offset:-8 glc +; GFX908-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v4, v11, s[20:21] offset:-4 glc +; GFX908-NEXT: global_load_dword v12, v19, s[22:23] offset:-4 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v4, v11, s[20:21] glc +; GFX908-NEXT: global_load_dword v12, v19, s[22:23] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: ds_read_b64 v[4:5], v11 -; GFX908-NEXT: ds_read_b64 v[6:7], v0 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[14:15] +; GFX908-NEXT: ds_read_b64 v[12:13], v19 +; GFX908-NEXT: ds_read_b64 v[14:15], v0 +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_cbranch_vccnz .LBB3_7 ; GFX908-NEXT: ; %bb.6: ; %bb51 @@ -628,54 +630,50 @@ ; GFX908-NEXT: v_cvt_f32_f16_e32 v21, v21 ; GFX908-NEXT: v_cvt_f32_f16_sdwa v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX908-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GFX908-NEXT: v_add_f32_e32 v24, v10, v5 -; GFX908-NEXT: v_add_f32_e32 v25, v9, v4 -; GFX908-NEXT: v_add_f32_e32 v26, 0, v5 -; GFX908-NEXT: v_add_f32_e32 v27, 0, v4 -; GFX908-NEXT: v_add_f32_e32 v7, v22, v7 -; GFX908-NEXT: v_add_f32_e32 v6, v21, v6 -; GFX908-NEXT: v_add_f32_e32 v5, v23, v5 -; GFX908-NEXT: v_add_f32_e32 v4, v20, v4 -; GFX908-NEXT: v_add_f32_e32 v12, v12, v25 -; GFX908-NEXT: v_add_f32_e32 v18, v18, v24 -; GFX908-NEXT: v_add_f32_e32 v17, v17, v27 -; GFX908-NEXT: v_add_f32_e32 v16, v16, 
v26 -; GFX908-NEXT: v_add_f32_e32 v15, v15, v6 -; GFX908-NEXT: v_add_f32_e32 v14, v14, v7 -; GFX908-NEXT: v_add_f32_e32 v13, v13, v4 -; GFX908-NEXT: v_add_f32_e32 v19, v19, v5 -; GFX908-NEXT: s_mov_b64 s[20:21], -1 +; GFX908-NEXT: v_add_f32_e32 v24, v17, v12 +; GFX908-NEXT: v_add_f32_e32 v25, v18, v13 +; GFX908-NEXT: v_add_f32_e32 v26, 0, v12 +; GFX908-NEXT: v_add_f32_e32 v27, 0, v13 +; GFX908-NEXT: v_add_f32_e32 v15, v22, v15 +; GFX908-NEXT: v_add_f32_e32 v14, v21, v14 +; GFX908-NEXT: v_add_f32_e32 v13, v23, v13 +; GFX908-NEXT: v_add_f32_e32 v12, v20, v12 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v25 +; GFX908-NEXT: v_add_f32_e32 v4, v4, v24 +; GFX908-NEXT: v_add_f32_e32 v9, v9, v27 +; GFX908-NEXT: v_add_f32_e32 v8, v8, v26 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX908-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX908-NEXT: v_add_f32_e32 v10, v10, v12 +; GFX908-NEXT: v_add_f32_e32 v11, v11, v13 +; GFX908-NEXT: s_mov_b64 s[22:23], -1 ; GFX908-NEXT: s_branch .LBB3_4 ; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: s_mov_b64 s[20:21], s[12:13] -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[20:21] +; GFX908-NEXT: s_mov_b64 s[22:23], s[14:15] +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23] ; GFX908-NEXT: s_cbranch_vccz .LBB3_4 ; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: ; implicit-def: $vgpr19 -; GFX908-NEXT: ; implicit-def: $vgpr13 -; GFX908-NEXT: ; implicit-def: $vgpr14 -; GFX908-NEXT: ; implicit-def: $vgpr15 -; GFX908-NEXT: ; implicit-def: $vgpr16 -; GFX908-NEXT: ; implicit-def: $vgpr17 -; GFX908-NEXT: ; implicit-def: $vgpr18 -; GFX908-NEXT: ; implicit-def: $vgpr12 +; GFX908-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX908-NEXT: ; implicit-def: $sgpr18_sgpr19 +; GFX908-NEXT: ; implicit-def: $sgpr20_sgpr21 ; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard ; 
GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_xor_b64 s[14:15], s[20:21], -1 -; GFX908-NEXT: .LBB3_10: ; %Flow55 +; GFX908-NEXT: s_xor_b64 s[16:17], s[22:23], -1 +; GFX908-NEXT: .LBB3_10: ; %Flow19 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_mov_b64 s[12:13], -1 -; GFX908-NEXT: s_and_b64 vcc, exec, s[14:15] +; GFX908-NEXT: s_mov_b64 s[14:15], -1 +; GFX908-NEXT: s_and_b64 vcc, exec, s[16:17] ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; GFX908-NEXT: ; %bb.11: ; %bb12 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_add_u32 s6, s6, s8 +; GFX908-NEXT: s_add_u32 s6, s6, s4 ; GFX908-NEXT: s_addc_u32 s7, s7, 0 -; GFX908-NEXT: s_add_u32 s4, s4, s10 -; GFX908-NEXT: s_addc_u32 s5, s5, s11 -; GFX908-NEXT: s_mov_b64 s[12:13], 0 +; GFX908-NEXT: s_add_u32 s10, s10, s12 +; GFX908-NEXT: s_addc_u32 s11, s11, s13 +; GFX908-NEXT: s_mov_b64 s[14:15], 0 ; GFX908-NEXT: s_branch .LBB3_1 ; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock ; GFX908-NEXT: s_endpgm @@ -685,109 +683,107 @@ ; GFX90A-NEXT: global_load_ushort v18, v[0:1], off glc ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX90A-NEXT: s_mov_b32 s9, 0 -; GFX90A-NEXT: s_load_dword s4, s[4:5], 0x18 -; GFX90A-NEXT: v_mov_b32_e32 v19, 0 +; GFX90A-NEXT: s_load_dword s9, s[4:5], 0x18 +; GFX90A-NEXT: s_mov_b32 s8, 0 +; GFX90A-NEXT: s_mov_b32 s5, s8 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX90A-NEXT: s_sub_i32 s5, 0, s3 +; GFX90A-NEXT: s_sub_i32 s4, 0, s3 +; GFX90A-NEXT: v_mov_b32_e32 v19, 0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v0 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 -; GFX90A-NEXT: s_mul_i32 s5, s5, s8 -; GFX90A-NEXT: s_mul_hi_u32 s5, s8, s5 -; GFX90A-NEXT: s_add_i32 s8, s8, s5 -; GFX90A-NEXT: s_mul_hi_u32 s5, s2, s8 -; 
GFX90A-NEXT: s_mul_i32 s8, s5, s3 -; GFX90A-NEXT: s_sub_i32 s2, s2, s8 -; GFX90A-NEXT: s_add_i32 s10, s5, 1 -; GFX90A-NEXT: s_sub_i32 s8, s2, s3 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s9 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v1 +; GFX90A-NEXT: s_mul_i32 s4, s4, s10 +; GFX90A-NEXT: s_mul_hi_u32 s4, s10, s4 +; GFX90A-NEXT: s_add_i32 s10, s10, s4 +; GFX90A-NEXT: s_mul_hi_u32 s4, s2, s10 +; GFX90A-NEXT: s_mul_i32 s10, s4, s3 +; GFX90A-NEXT: s_sub_i32 s2, s2, s10 +; GFX90A-NEXT: s_add_i32 s11, s4, 1 +; GFX90A-NEXT: s_sub_i32 s10, s2, s3 ; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s5, s10, s5 -; GFX90A-NEXT: s_cselect_b32 s2, s8, s2 -; GFX90A-NEXT: s_add_i32 s8, s5, 1 +; GFX90A-NEXT: s_cselect_b32 s4, s11, s4 +; GFX90A-NEXT: s_cselect_b32 s2, s10, s2 +; GFX90A-NEXT: s_add_i32 s10, s4, 1 ; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s8, s8, s5 -; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s10 -; GFX90A-NEXT: s_lshl_b64 s[10:11], s[8:9], 5 +; GFX90A-NEXT: s_cselect_b32 s4, s10, s4 +; GFX90A-NEXT: s_lshr_b32 s9, s9, 16 +; GFX90A-NEXT: s_lshl_b64 s[12:13], s[4:5], 5 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s9 ; GFX90A-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 -; GFX90A-NEXT: s_lshl_b64 s[4:5], s[6:7], 5 -; GFX90A-NEXT: s_or_b32 s4, s4, 28 +; GFX90A-NEXT: s_lshl_b64 s[10:11], s[6:7], 5 +; GFX90A-NEXT: s_or_b32 s10, s10, 28 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s9, v18 -; GFX90A-NEXT: s_and_b32 s9, 0xffff, s9 -; GFX90A-NEXT: s_mul_i32 s1, s1, s9 -; GFX90A-NEXT: s_mul_hi_u32 s12, s0, s9 -; GFX90A-NEXT: s_mul_i32 s0, s0, s9 -; GFX90A-NEXT: s_add_i32 s1, s12, s1 +; GFX90A-NEXT: v_readfirstlane_b32 s5, v18 +; GFX90A-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX90A-NEXT: s_mul_i32 s1, s1, s5 +; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s5 +; GFX90A-NEXT: s_mul_i32 s0, s0, s5 +; GFX90A-NEXT: s_add_i32 s1, s9, s1 ; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 ; GFX90A-NEXT: s_branch .LBB3_2 -; GFX90A-NEXT: 
.LBB3_1: ; %Flow56 +; GFX90A-NEXT: .LBB3_1: ; %Flow20 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[12:13] +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[14:15] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_12 ; GFX90A-NEXT: .LBB3_2: ; %bb9 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB3_5 Depth 2 -; GFX90A-NEXT: s_mov_b64 s[14:15], -1 +; GFX90A-NEXT: s_mov_b64 s[16:17], -1 ; GFX90A-NEXT: s_cbranch_scc0 .LBB3_10 ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[12:13], s[6:7], 0 -; GFX90A-NEXT: v_cmp_gt_i64_e64 s[14:15], s[6:7], -1 -; GFX90A-NEXT: s_mov_b64 s[18:19], s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v7, v6 -; GFX90A-NEXT: v_mov_b32_e32 v12, v6 -; GFX90A-NEXT: v_mov_b32_e32 v13, v6 -; GFX90A-NEXT: v_mov_b32_e32 v10, v6 -; GFX90A-NEXT: v_mov_b32_e32 v11, v6 -; GFX90A-NEXT: v_mov_b32_e32 v8, v6 -; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0 +; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1 +; GFX90A-NEXT: s_mov_b64 s[20:21], s[10:11] +; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s9, v4 -; GFX90A-NEXT: v_readfirstlane_b32 s16, v5 -; GFX90A-NEXT: s_add_u32 s9, s9, 1 -; GFX90A-NEXT: s_addc_u32 s17, s16, 0 -; GFX90A-NEXT: s_mul_hi_u32 s20, s2, s9 -; GFX90A-NEXT: s_mul_i32 s21, s3, s9 -; GFX90A-NEXT: s_mul_i32 s16, s2, s9 -; GFX90A-NEXT: s_mul_i32 s9, s2, s17 -; GFX90A-NEXT: s_add_i32 s9, s20, s9 -; GFX90A-NEXT: s_add_i32 s9, s9, s21 +; GFX90A-NEXT: v_readfirstlane_b32 s5, v4 +; GFX90A-NEXT: 
v_readfirstlane_b32 s9, v5 +; GFX90A-NEXT: s_add_u32 s5, s5, 1 +; GFX90A-NEXT: s_addc_u32 s9, s9, 0 +; GFX90A-NEXT: s_mul_hi_u32 s19, s2, s5 +; GFX90A-NEXT: s_mul_i32 s22, s3, s5 +; GFX90A-NEXT: s_mul_i32 s18, s2, s5 +; GFX90A-NEXT: s_mul_i32 s5, s2, s9 +; GFX90A-NEXT: s_add_i32 s5, s19, s5 +; GFX90A-NEXT: s_add_i32 s5, s5, s22 ; GFX90A-NEXT: s_branch .LBB3_5 ; GFX90A-NEXT: .LBB3_4: ; %bb58 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: s_add_u32 s18, s18, s0 -; GFX90A-NEXT: s_addc_u32 s19, s19, s1 -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[4:5] -; GFX90A-NEXT: s_mov_b64 s[20:21], 0 -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23] +; GFX90A-NEXT: s_add_u32 s20, s20, s0 +; GFX90A-NEXT: s_addc_u32 s21, s21, s1 +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5] +; GFX90A-NEXT: s_mov_b64 s[22:23], 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_9 ; GFX90A-NEXT: .LBB3_5: ; %bb16 ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: s_add_u32 s20, s18, s16 -; GFX90A-NEXT: s_addc_u32 s21, s19, s9 -; GFX90A-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc +; GFX90A-NEXT: s_add_u32 s22, s20, s18 +; GFX90A-NEXT: s_addc_u32 s23, s21, s5 +; GFX90A-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v20, v19, s[20:21] offset:-8 glc +; GFX90A-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v14, v19, s[20:21] offset:-4 glc +; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] offset:-4 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v14, v19, s[20:21] glc +; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] glc ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ds_read_b64 v[14:15], v19 ; GFX90A-NEXT: ds_read_b64 v[16:17], v0 -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[14:15] -; GFX90A-NEXT: ; kill: killed $sgpr20 killed $sgpr21 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GFX90A-NEXT: ; kill: killed $sgpr22 killed $sgpr23 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7 ; GFX90A-NEXT: ; %bb.6: ; %bb51 @@ -801,37 +797,37 @@ ; GFX90A-NEXT: v_pk_add_f32 v[16:17], v[22:23], v[16:17] ; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[20:21], v[14:15] ; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[24:25] -; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[26:27] -; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[16:17] -; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[14:15] -; GFX90A-NEXT: s_mov_b64 s[20:21], -1 +; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[26:27] +; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[16:17] +; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15] +; GFX90A-NEXT: s_mov_b64 s[22:23], -1 ; GFX90A-NEXT: s_branch .LBB3_4 ; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: s_mov_b64 s[20:21], s[12:13] -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[20:21] +; GFX90A-NEXT: s_mov_b64 s[22:23], s[14:15] +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_4 ; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: ; implicit-def: $vgpr9 -; GFX90A-NEXT: ; implicit-def: $vgpr11 -; GFX90A-NEXT: ; implicit-def: $vgpr13 -; GFX90A-NEXT: ; implicit-def: $vgpr7 +; GFX90A-NEXT: ; implicit-def: $vgpr12_vgpr13 +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX90A-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $sgpr18_sgpr19 +; GFX90A-NEXT: ; implicit-def: $sgpr20_sgpr21 ; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_xor_b64 
s[14:15], s[20:21], -1 -; GFX90A-NEXT: .LBB3_10: ; %Flow55 +; GFX90A-NEXT: s_xor_b64 s[16:17], s[22:23], -1 +; GFX90A-NEXT: .LBB3_10: ; %Flow19 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_mov_b64 s[12:13], -1 -; GFX90A-NEXT: s_and_b64 vcc, exec, s[14:15] +; GFX90A-NEXT: s_mov_b64 s[14:15], -1 +; GFX90A-NEXT: s_and_b64 vcc, exec, s[16:17] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: ; %bb.11: ; %bb12 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_add_u32 s6, s6, s8 +; GFX90A-NEXT: s_add_u32 s6, s6, s4 ; GFX90A-NEXT: s_addc_u32 s7, s7, 0 -; GFX90A-NEXT: s_add_u32 s4, s4, s10 -; GFX90A-NEXT: s_addc_u32 s5, s5, s11 -; GFX90A-NEXT: s_mov_b64 s[12:13], 0 +; GFX90A-NEXT: s_add_u32 s10, s10, s12 +; GFX90A-NEXT: s_addc_u32 s11, s11, s13 +; GFX90A-NEXT: s_mov_b64 s[14:15], 0 ; GFX90A-NEXT: s_branch .LBB3_1 ; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock ; GFX90A-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll @@ -0,0 +1,347 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck %s + +; This file contains simpler test cases for the break-large-PHIs transform +; and instead focuses on checking whether the profitability heuristics are +; respected. 
+ +; Ok - has interesting incoming value (zeroinit) +define amdgpu_kernel void @zeroinit_inc(<5 x double> %in, ptr %out, i1 %cond) { +; CHECK-LABEL: @zeroinit_inc( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[X:%.*]] = insertelement <5 x double> [[IN:%.*]], double 3.140000e+00, i32 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <5 x double> [[X]], i64 0 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[X]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[X]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X]], i64 4 +; CHECK-NEXT: br label [[FINALLY:%.*]] +; CHECK: else: +; CHECK-NEXT: br label [[FINALLY]] +; CHECK: finally: +; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE1]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE3]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ 0.000000e+00, [[ELSE]] ] +; CHECK-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <5 x double> poison, double [[TMP0]], i64 0 +; CHECK-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP1]], i64 1 +; CHECK-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP2]], i64 2 +; CHECK-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE2]], double [[TMP3]], i64 3 +; CHECK-NEXT: [[LARGEPHI_INSERTSLICE4:%.*]] = 
insertelement <5 x double> [[LARGEPHI_INSERTSLICE3]], double [[TMP4]], i64 4 +; CHECK-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE4]], ptr [[OUT:%.*]], align 1 +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + %x = insertelement <5 x double> %in, double 3.14, i32 3 + br label %finally +else: + br label %finally +finally: + %val = phi <5 x double> [%x, %then], [zeroinitializer, %else] + store <5 x double> %val, ptr %out, align 1 + ret void +} + +; Ok - has interesting incoming value (poison) +define amdgpu_kernel void @poison_inc(<5 x double> %in, ptr %out, i1 %cond) { +; CHECK-LABEL: @poison_inc( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[X:%.*]] = insertelement <5 x double> [[IN:%.*]], double 3.140000e+00, i32 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <5 x double> [[X]], i64 0 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[X]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[X]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X]], i64 4 +; CHECK-NEXT: br label [[FINALLY:%.*]] +; CHECK: else: +; CHECK-NEXT: br label [[FINALLY]] +; CHECK: finally: +; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ poison, [[ELSE]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE1]], [[THEN]] ], [ poison, [[ELSE]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ poison, [[ELSE]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE3]], [[THEN]] ], [ poison, [[ELSE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ poison, [[ELSE]] ] +; CHECK-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <5 x double> 
poison, double [[TMP0]], i64 0 +; CHECK-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP1]], i64 1 +; CHECK-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP2]], i64 2 +; CHECK-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE2]], double [[TMP3]], i64 3 +; CHECK-NEXT: [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE3]], double [[TMP4]], i64 4 +; CHECK-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE4]], ptr [[OUT:%.*]], align 1 +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + %x = insertelement <5 x double> %in, double 3.14, i32 3 + br label %finally +else: + br label %finally +finally: + %val = phi <5 x double> [%x, %then], [poison, %else] + store <5 x double> %val, ptr %out, align 1 + ret void +} + +; Ok - has interesting incoming value (trivial insertelement chain) +define amdgpu_kernel void @trivial_insertelt_chain(<5 x double> %in, ptr %out, i1 %cond, double %x) { +; CHECK-LABEL: @trivial_insertelt_chain( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[X_0:%.*]] = insertelement <5 x double> poison, double 3.140000e+00, i32 0 +; CHECK-NEXT: [[X_1:%.*]] = insertelement <5 x double> [[X_0]], double [[X:%.*]], i32 4 +; CHECK-NEXT: [[X_2:%.*]] = insertelement <5 x double> [[X_1]], double [[X]], i32 3 +; CHECK-NEXT: [[X_3:%.*]] = insertelement <5 x double> [[X_2]], double 6.140000e+00, i32 2 +; CHECK-NEXT: [[X_4:%.*]] = insertelement <5 x double> [[X_3]], double 9.900000e+00, i32 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <5 x double> [[X_4]], i64 0 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[X_4]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[X_4]], i64 2 +; CHECK-NEXT: 
[[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[X_4]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[X_4]], i64 4 +; CHECK-NEXT: br label [[FINALLY:%.*]] +; CHECK: else: +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE01:%.*]] = extractelement <5 x double> [[IN:%.*]], i64 0 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE12:%.*]] = extractelement <5 x double> [[IN]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE23:%.*]] = extractelement <5 x double> [[IN]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE34:%.*]] = extractelement <5 x double> [[IN]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE45:%.*]] = extractelement <5 x double> [[IN]], i64 4 +; CHECK-NEXT: br label [[FINALLY]] +; CHECK: finally: +; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE01]], [[ELSE]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE1]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE12]], [[ELSE]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE23]], [[ELSE]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE3]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE34]], [[ELSE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN]] ], [ [[LARGEPHI_EXTRACTSLICE45]], [[ELSE]] ] +; CHECK-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <5 x double> poison, double [[TMP0]], i64 0 +; CHECK-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP1]], i64 1 +; CHECK-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP2]], i64 2 +; CHECK-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE2]], double [[TMP3]], i64 3 +; CHECK-NEXT: [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE3]], double [[TMP4]], i64 4 +; CHECK-NEXT: store <5 x double> 
[[LARGEPHI_INSERTSLICE4]], ptr [[OUT:%.*]], align 1 +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + %x.0 = insertelement <5 x double> poison, double 3.14, i32 0 + %x.1 = insertelement <5 x double> %x.0, double %x, i32 4 + %x.2 = insertelement <5 x double> %x.1, double %x, i32 3 + %x.3 = insertelement <5 x double> %x.2, double 6.14, i32 2 + %x.4 = insertelement <5 x double> %x.3, double 9.9, i32 1 + br label %finally +else: + br label %finally +finally: + %val = phi <5 x double> [%x.4, %then], [%in, %else] + store <5 x double> %val, ptr %out, align 1 + ret void +} + + +; Not Ok - non trivial insertelement chain (non constant idx) +define amdgpu_kernel void @nontrivial_insertelt_chain(<5 x double> %in, ptr %out, i1 %cond, double %x, i32 %idx) { +; CHECK-LABEL: @nontrivial_insertelt_chain( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[X_0:%.*]] = insertelement <5 x double> poison, double 3.140000e+00, i32 0 +; CHECK-NEXT: [[X_1:%.*]] = insertelement <5 x double> [[X_0]], double [[X:%.*]], i32 [[IDX:%.*]] +; CHECK-NEXT: [[X_2:%.*]] = insertelement <5 x double> [[X_1]], double [[X]], i32 2 +; CHECK-NEXT: [[X_3:%.*]] = insertelement <5 x double> [[X_2]], double 6.140000e+00, i32 3 +; CHECK-NEXT: [[X_4:%.*]] = insertelement <5 x double> [[X_3]], double 9.900000e+00, i32 4 +; CHECK-NEXT: br label [[FINALLY:%.*]] +; CHECK: else: +; CHECK-NEXT: br label [[FINALLY]] +; CHECK: finally: +; CHECK-NEXT: [[VAL:%.*]] = phi <5 x double> [ [[X_4]], [[THEN]] ], [ [[IN:%.*]], [[ELSE]] ] +; CHECK-NEXT: store <5 x double> [[VAL]], ptr [[OUT:%.*]], align 1 +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + %x.0 = insertelement <5 x double> poison, double 3.14, i32 0 + %x.1 = insertelement <5 x double> %x.0, double %x, i32 %idx + %x.2 = insertelement <5 x double> %x.1, double %x, i32 2 + %x.3 = insertelement <5 x double> %x.2, double 6.14, 
i32 3 + %x.4 = insertelement <5 x double> %x.3, double 9.9, i32 4 + br label %finally +else: + br label %finally +finally: + %val = phi <5 x double> [%x.4, %then], [%in, %else] + store <5 x double> %val, ptr %out, align 1 + ret void +} + +; Not Ok - non trivial insertelement chain (one index is out of bounds) +define amdgpu_kernel void @nontrivial_insertelt_chain_out_of_bounds(<5 x double> %in, ptr %out, i1 %cond, double %x) { +; CHECK-LABEL: @nontrivial_insertelt_chain_out_of_bounds( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[X_0:%.*]] = insertelement <5 x double> poison, double 3.140000e+00, i32 0 +; CHECK-NEXT: [[X_1:%.*]] = insertelement <5 x double> [[X_0]], double [[X:%.*]], i32 1 +; CHECK-NEXT: [[X_2:%.*]] = insertelement <5 x double> [[X_1]], double [[X]], i32 2 +; CHECK-NEXT: [[X_3:%.*]] = insertelement <5 x double> [[X_2]], double 6.140000e+00, i32 3 +; CHECK-NEXT: [[X_4:%.*]] = insertelement <5 x double> [[X_3]], double 9.900000e+00, i32 4 +; CHECK-NEXT: [[X_5:%.*]] = insertelement <5 x double> [[X_4]], double 9.900000e+00, i32 5 +; CHECK-NEXT: br label [[FINALLY:%.*]] +; CHECK: else: +; CHECK-NEXT: br label [[FINALLY]] +; CHECK: finally: +; CHECK-NEXT: [[VAL:%.*]] = phi <5 x double> [ [[X_5]], [[THEN]] ], [ [[IN:%.*]], [[ELSE]] ] +; CHECK-NEXT: store <5 x double> [[VAL]], ptr [[OUT:%.*]], align 1 +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + %x.0 = insertelement <5 x double> poison, double 3.14, i32 0 + %x.1 = insertelement <5 x double> %x.0, double %x, i32 1 + %x.2 = insertelement <5 x double> %x.1, double %x, i32 2 + %x.3 = insertelement <5 x double> %x.2, double 6.14, i32 3 + %x.4 = insertelement <5 x double> %x.3, double 9.9, i32 4 + %x.5 = insertelement <5 x double> %x.4, double 9.9, i32 5 + br label %finally +else: + br label %finally +finally: + %val = phi <5 x double> [%x.5, %then], [%in, %else] + store <5 x double> %val, 
ptr %out, align 1 + ret void +} + +; Not Ok - non trivial insertelement chain (not all inserts in same bb) +define amdgpu_kernel void @nontrivial_insertelt_locality(<5 x double> %in, ptr %out, i1 %cond, double %x) { +; CHECK-LABEL: @nontrivial_insertelt_locality( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[X_0:%.*]] = insertelement <5 x double> poison, double 3.140000e+00, i32 0 +; CHECK-NEXT: [[X_1:%.*]] = insertelement <5 x double> [[X_0]], double [[X:%.*]], i32 1 +; CHECK-NEXT: [[X_2:%.*]] = insertelement <5 x double> [[X_1]], double [[X]], i32 2 +; CHECK-NEXT: br label [[THEN2:%.*]] +; CHECK: then2: +; CHECK-NEXT: [[X_3:%.*]] = insertelement <5 x double> [[X_2]], double 6.140000e+00, i32 3 +; CHECK-NEXT: [[X_4:%.*]] = insertelement <5 x double> [[X_3]], double 9.900000e+00, i32 4 +; CHECK-NEXT: br label [[FINALLY:%.*]] +; CHECK: else: +; CHECK-NEXT: br label [[FINALLY]] +; CHECK: finally: +; CHECK-NEXT: [[VAL:%.*]] = phi <5 x double> [ [[X_4]], [[THEN2]] ], [ [[IN:%.*]], [[ELSE]] ] +; CHECK-NEXT: store <5 x double> [[VAL]], ptr [[OUT:%.*]], align 1 +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + %x.0 = insertelement <5 x double> poison, double 3.14, i32 0 + %x.1 = insertelement <5 x double> %x.0, double %x, i32 1 + %x.2 = insertelement <5 x double> %x.1, double %x, i32 2 + br label %then2 +then2: + %x.3 = insertelement <5 x double> %x.2, double 6.14, i32 3 + %x.4 = insertelement <5 x double> %x.3, double 9.9, i32 4 + br label %finally +else: + br label %finally +finally: + %val = phi <5 x double> [%x.4, %then2], [%in, %else] + store <5 x double> %val, ptr %out, align 1 + ret void +} + +; Not Ok - non trivial insertelement chain (not all elts covered) +define amdgpu_kernel void @nontrivial_insertelt_coverage(<5 x double> %in, ptr %out, i1 %cond, double %x) { +; CHECK-LABEL: @nontrivial_insertelt_coverage( +; CHECK-NEXT: entry: +; CHECK-NEXT: 
br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[X_0:%.*]] = insertelement <5 x double> poison, double 3.140000e+00, i32 0 +; CHECK-NEXT: [[X_1:%.*]] = insertelement <5 x double> [[X_0]], double [[X:%.*]], i32 4 +; CHECK-NEXT: [[X_3:%.*]] = insertelement <5 x double> [[X_1]], double 6.140000e+00, i32 2 +; CHECK-NEXT: [[X_4:%.*]] = insertelement <5 x double> [[X_3]], double 9.900000e+00, i32 1 +; CHECK-NEXT: br label [[FINALLY:%.*]] +; CHECK: else: +; CHECK-NEXT: br label [[FINALLY]] +; CHECK: finally: +; CHECK-NEXT: [[VAL:%.*]] = phi <5 x double> [ [[X_4]], [[THEN]] ], [ [[IN:%.*]], [[ELSE]] ] +; CHECK-NEXT: store <5 x double> [[VAL]], ptr [[OUT:%.*]], align 1 +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + %x.0 = insertelement <5 x double> poison, double 3.14, i32 0 + %x.1 = insertelement <5 x double> %x.0, double %x, i32 4 + %x.3 = insertelement <5 x double> %x.1, double 6.14, i32 2 + %x.4 = insertelement <5 x double> %x.3, double 9.9, i32 1 + br label %finally +else: + br label %finally +finally: + %val = phi <5 x double> [%x.4, %then], [%in, %else] + store <5 x double> %val, ptr %out, align 1 + ret void +} + +; First Phi Not Ok - Used by second PHI. +; Second Phi is Ok. 
+define amdgpu_kernel void @used_by_phi(<5 x double> %in, ptr %out, i1 %cond, i1 %cond2) { +; CHECK-LABEL: @used_by_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[X:%.*]] = insertelement <5 x double> [[IN:%.*]], double 3.140000e+00, i32 3 +; CHECK-NEXT: br label [[FINALLY:%.*]] +; CHECK: else: +; CHECK-NEXT: br label [[FINALLY]] +; CHECK: finally: +; CHECK-NEXT: [[VAL:%.*]] = phi <5 x double> [ [[X]], [[THEN]] ], [ zeroinitializer, [[ELSE]] ] +; CHECK-NEXT: store <5 x double> [[VAL]], ptr [[OUT:%.*]], align 1 +; CHECK-NEXT: br i1 [[COND2:%.*]], label [[THEN1:%.*]], label [[END:%.*]] +; CHECK: then1: +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE0:%.*]] = extractelement <5 x double> [[VAL]], i64 0 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE1:%.*]] = extractelement <5 x double> [[VAL]], i64 1 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE2:%.*]] = extractelement <5 x double> [[VAL]], i64 2 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE3:%.*]] = extractelement <5 x double> [[VAL]], i64 3 +; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE4:%.*]] = extractelement <5 x double> [[VAL]], i64 4 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE0]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE1]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE2]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE3]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE4]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <5 x double> poison, double [[TMP0]], i64 0 +; CHECK-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE0]], double [[TMP1]], i64 1 +; 
CHECK-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE1]], double [[TMP2]], i64 2 +; CHECK-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE2]], double [[TMP3]], i64 3 +; CHECK-NEXT: [[LARGEPHI_INSERTSLICE4:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE3]], double [[TMP4]], i64 4 +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + %x = insertelement <5 x double> %in, double 3.14, i32 3 + br label %finally +else: + br label %finally +finally: + %val = phi <5 x double> [%x, %then], [zeroinitializer, %else] + store <5 x double> %val, ptr %out, align 1 + br i1 %cond2, label %then1, label %end +then1: + br label %end +end: + %endval = phi <5 x double> [%val, %then1], [zeroinitializer, %finally] + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck %s --check-prefixes=OPT -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare -global-isel %s | FileCheck %s --check-prefixes=NOOPT +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare -amdgpu-codegenprepare-force-break-large-phis %s | FileCheck %s --check-prefixes=OPT +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare --global-isel %s | FileCheck %s --check-prefixes=NOOPT ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare -amdgpu-codegenprepare-break-large-phis=0 %s | FileCheck %s --check-prefixes=NOOPT define amdgpu_kernel void @phi_v5f64(<5 x double> %in, ptr %out, i1 %cond) { diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll 
b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -28,28 +28,28 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB0_3 ; SI-NEXT: s_branch .LBB0_4 ; SI-NEXT: .LBB0_2: ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB0_3: ; %T ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -63,28 +63,28 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v1 -; SI-NEXT: v_or_b32_e32 v3, v3, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: .LBB0_4: ; %exit ; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v1, v2, 0, 16 -; SI-NEXT: v_bfe_i32 v2, v4, 0, 16 
+; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 ; SI-NEXT: v_mov_b32_e32 v3, 0xffff ; SI-NEXT: v_mov_b32_e32 v4, 0x8000 ; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000 ; SI-NEXT: v_bfrev_b32_e32 v6, 1 ; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 ; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc -; SI-NEXT: v_or_b32_e32 v0, v1, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16 +; SI-NEXT: v_or_b32_e32 v2, v3, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_8xi16_extract_4xi16: @@ -97,7 +97,7 @@ ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: s_branch .LBB0_4 ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: .LBB0_3: ; %T ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -158,18 +158,18 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_or_b32_e32 v5, v6, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB1_3 ; SI-NEXT: s_branch .LBB1_4 ; SI-NEXT: .LBB1_2: ; SI-NEXT: ; implicit-def: $vgpr5 -; 
SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB1_3: ; %T ; SI-NEXT: s_mov_b32 s6, 0 @@ -184,39 +184,39 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: v_or_b32_e32 v5, v5, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v0 +; SI-NEXT: v_or_b32_e32 v5, v5, v1 ; SI-NEXT: .LBB1_4: ; %exit ; SI-NEXT: v_bfe_i32 v0, v5, 0, 16 -; SI-NEXT: v_bfe_i32 v1, v2, 0, 16 -; SI-NEXT: v_bfe_i32 v2, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v3, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v3, v3, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 ; SI-NEXT: v_mov_b32_e32 v4, 0xffff ; SI-NEXT: v_mov_b32_e32 v5, 0x8000 ; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000 ; SI-NEXT: v_bfrev_b32_e32 v7, 1 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3 -; SI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 -; SI-NEXT: v_cndmask_b32_e32 v5, v6, 
v7, vcc -; SI-NEXT: v_or_b32_e32 v0, v2, v3 -; SI-NEXT: v_or_b32_e32 v2, v4, v5 -; SI-NEXT: v_alignbit_b32 v1, v2, v3, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3 +; SI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 +; SI-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_8xi16_extract_4xi16_2: @@ -229,7 +229,7 @@ ; GFX9-NEXT: s_cbranch_execz .LBB1_3 ; GFX9-NEXT: s_branch .LBB1_4 ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: .LBB1_3: ; %T ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -282,24 +282,28 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v6 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB2_3 ; SI-NEXT: s_branch .LBB2_4 ; SI-NEXT: .LBB2_2: -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB2_3: ; %T ; SI-NEXT: s_mov_b32 s6, 0 @@ -310,21 +314,25 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; SI-NEXT: .LBB2_4: ; %exit ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 @@ -336,10 +344,10 @@ ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; 
SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; SI-NEXT: v_mov_b32_e32 v3, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -353,7 +361,7 @@ ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: s_branch .LBB2_4 ; GFX9-NEXT: .LBB2_2: -; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: .LBB2_3: ; %T ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -436,28 +444,28 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB3_3 ; SI-NEXT: s_branch .LBB3_4 ; SI-NEXT: .LBB3_2: ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB3_3: ; %T ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 
0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -487,28 +495,28 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v1 -; SI-NEXT: v_or_b32_e32 v3, v3, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: .LBB3_4: ; %exit ; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v1, v2, 0, 16 -; SI-NEXT: v_bfe_i32 v2, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 ; SI-NEXT: v_mov_b32_e32 v3, 0xffff ; SI-NEXT: v_mov_b32_e32 v4, 0x8000 ; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000 ; SI-NEXT: v_bfrev_b32_e32 v6, 1 ; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 ; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc -; SI-NEXT: v_or_b32_e32 v0, v1, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16 +; SI-NEXT: v_or_b32_e32 v2, v3, v4 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_16xi16_extract_4xi16: @@ -516,36 +524,33 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX9-NEXT: ; %bb.1: ; %F -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off offset:16 
glc +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GFX9-NEXT: s_cbranch_execz .LBB3_3 ; GFX9-NEXT: s_branch .LBB3_4 ; GFX9-NEXT: .LBB3_2: -; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; GFX9-NEXT: .LBB3_3: ; %T -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc +; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 glc +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 ; GFX9-NEXT: .LBB3_4: ; %exit -; GFX9-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NEXT: v_bfi_b32 v0, s4, v4, v4 -; GFX9-NEXT: v_bfi_b32 v0, s4, v4, v0 -; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 -; GFX9-NEXT: v_pk_ashrrev_i16 v2, 15, v5 op_sel_hi:[0,0] ; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1] +; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_perm_b32 v1, v2, v3, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F @@ -586,11 +591,11 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: 
buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -608,17 +613,17 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB4_3 ; SI-NEXT: s_branch .LBB4_4 ; SI-NEXT: .LBB4_2: -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB4_3: ; %T @@ -634,39 +639,39 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, 
v[0:1], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:18 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:20 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:22 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:26 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:28 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v2, v1 -; SI-NEXT: v_or_b32_e32 v2, v6, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: .LBB4_4: ; 
%exit ; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v1, v5, 0, 16 +; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 ; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 -; SI-NEXT: v_bfe_i32 v3, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v3, v5, 0, 16 ; SI-NEXT: v_mov_b32_e32 v4, 0xffff ; SI-NEXT: v_mov_b32_e32 v5, 0x8000 ; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000 @@ -690,35 +695,33 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %F -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:16 glc +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: s_branch .LBB4_4 ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; GFX9-NEXT: .LBB4_3: ; %T -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 ; GFX9-NEXT: .LBB4_4: ; %exit -; GFX9-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NEXT: v_bfi_b32 v0, s4, v6, v6 -; GFX9-NEXT: v_bfi_b32 v0, s4, v6, v0 -; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1] -; GFX9-NEXT: s_movk_i32 s4, 0x8000 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_ashrrev_i16 v2, 15, v7 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v7 op_sel_hi:[0,1] +; GFX9-NEXT: s_movk_i32 s4, 0x8000 ; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: 
v_pk_ashrrev_i16 v0, 15, v6 op_sel_hi:[0,1] +; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_perm_b32 v1, v2, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F @@ -757,40 +760,44 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:18 glc +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:20 glc +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc ; SI-NEXT: s_waitcnt 
vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:22 glc +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:24 glc +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:26 glc +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:28 glc +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v4, v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB5_3 ; SI-NEXT: s_branch .LBB5_4 ; SI-NEXT: .LBB5_2: -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB5_3: ; %T ; SI-NEXT: s_mov_b32 s6, 0 @@ -801,37 +808,41 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, 
v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:18 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:20 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:22 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:26 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:28 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc ; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; SI-NEXT: .LBB5_4: ; %exit ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 @@ -843,10 +854,10 @@ ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; SI-NEXT: v_mov_b32_e32 v3, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -855,41 +866,38 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; %F -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off offset:16 glc +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GFX9-NEXT: s_cbranch_execz .LBB5_3 ; GFX9-NEXT: s_branch .LBB5_4 ; GFX9-NEXT: .LBB5_2: -; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; GFX9-NEXT: .LBB5_3: ; %T -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc +; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 glc +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 ; GFX9-NEXT: .LBB5_4: ; %exit -; GFX9-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NEXT: v_bfi_b32 v0, s4, v4, v4 -; GFX9-NEXT: v_bfi_b32 v0, s4, v4, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x3800 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x3900 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x3d00 -; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v0 -; GFX9-NEXT: v_perm_b32 v1, v5, v5, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v4, vcc -; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v4, vcc -; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-NEXT: v_pack_b32_f16 v0, v5, v0 -; GFX9-NEXT: v_pack_b32_f16 v1, v6, v1 +; GFX9-NEXT: v_perm_b32 v0, v5, v5, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3800 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x3900 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x3d00 +; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc +; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v2, vcc +; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v4, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-NEXT: v_pack_b32_f16 v1, v5, v6 ; GFX9-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -2004,15 +2004,14 @@ ; SI-NEXT: .LBB42_2: ; %if ; SI-NEXT: s_load_dword s7, s[2:3], 0x0 ; SI-NEXT: .LBB42_3: ; %endif +; 
SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB42_4: -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB42_2 ; ; VI-LABEL: insert_split_bb: @@ -2029,15 +2028,14 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s7, s[2:3], 0x0 ; VI-NEXT: .LBB42_3: ; %endif +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB42_4: -; VI-NEXT: ; implicit-def: $sgpr7 ; VI-NEXT: s_branch .LBB42_2 entry: %0 = insertelement <2 x i32> undef, i32 %a, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll b/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll --- a/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll @@ -10,36 +10,22 @@ ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_f32_e32 v3, v2, v2 +; CHECK-NEXT: v_add_f32_e32 v0, v0, v0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: s_branch .LBB0_2 -; CHECK-NEXT: .LBB0_1: ; %Flow13 -; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: s_or_b64 exec, exec, s[10:11] -; CHECK-NEXT: s_and_b64 s[6:7], exec, s[8:9] -; CHECK-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execz .LBB0_6 -; CHECK-NEXT: .LBB0_2: ; %bb1 +; CHECK-NEXT: ; kill: killed $vgpr1 +; CHECK-NEXT: .LBB0_1: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_cmp_eq_f32_e64 s[6:7], 0, 
v2 ; CHECK-NEXT: v_cmp_neq_f32_e32 vcc, 0, v2 -; CHECK-NEXT: s_and_saveexec_b64 s[8:9], vcc -; CHECK-NEXT: ; %bb.3: ; %bb2 -; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: v_mul_f32_e32 v2, v3, v2 -; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], exec -; CHECK-NEXT: ; %bb.4: ; %Flow -; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] -; CHECK-NEXT: s_mov_b64 s[8:9], -1 -; CHECK-NEXT: s_and_saveexec_b64 s[10:11], s[6:7] -; CHECK-NEXT: s_cbranch_execz .LBB0_1 -; CHECK-NEXT: ; %bb.5: ; %bb3 -; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: s_xor_b64 s[8:9], exec, -1 -; CHECK-NEXT: s_branch .LBB0_1 -; CHECK-NEXT: .LBB0_6: ; %DummyReturnBlock +; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_cbranch_execnz .LBB0_1 +; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: v_mul_f32_e32 v2, v3, v2 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_cbranch_execnz .LBB0_1 +; CHECK-NEXT: ; %bb.3: ; %DummyReturnBlock ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: br label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -419,7 +419,7 @@ ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} ; GFX90A-DAG: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0 -; GFX90A-COUNT-28: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX90A-COUNT-28: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] ; GCN: [[LOOP:.LBB[0-9_]+]]: ; GCN-NOT: v_accvgpr @@ -576,7 +576,9 @@ ; GCN-LABEL: {{^}}test_mfma_nested_loop_zeroinit: -; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0 +; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] ; Check that we do not copy agprs to vgprs and back in an outer 
loop. diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1811,28 +1811,28 @@ ; GFX1032-NEXT: s_branch .LBB33_2 ; GFX1032-NEXT: .LBB33_1: ; %body ; GFX1032-NEXT: ; in Loop: Header=BB33_2 Depth=1 -; GFX1032-NEXT: image_sample v[0:3], v7, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX1032-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX1032-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX1032-NEXT: s_cbranch_execz .LBB33_4 ; GFX1032-NEXT: .LBB33_2: ; %loop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v4, v3 -; GFX1032-NEXT: v_mov_b32_e32 v5, v2 -; GFX1032-NEXT: v_mov_b32_e32 v6, v1 -; GFX1032-NEXT: v_mov_b32_e32 v7, v0 +; GFX1032-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032-NEXT: v_mov_b32_e32 v4, v0 ; GFX1032-NEXT: s_cbranch_vccz .LBB33_1 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: ; implicit-def: $vgpr3 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1032-NEXT: ; implicit-def: $vgpr8 ; GFX1032-NEXT: .LBB33_4: ; %break ; GFX1032-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v0, v7 -; GFX1032-NEXT: v_mov_b32_e32 v1, v6 -; GFX1032-NEXT: v_mov_b32_e32 v2, v5 -; GFX1032-NEXT: v_mov_b32_e32 v3, v4 +; GFX1032-NEXT: v_mov_b32_e32 v0, v4 +; GFX1032-NEXT: v_mov_b32_e32 v1, v5 +; GFX1032-NEXT: v_mov_b32_e32 v2, v6 +; GFX1032-NEXT: v_mov_b32_e32 v3, v7 ; GFX1032-NEXT: ; return to shader part epilog ; ; GFX1064-LABEL: test_loop_vcc: @@ -1843,28 +1843,28 @@ ; GFX1064-NEXT: s_branch .LBB33_2 ; GFX1064-NEXT: .LBB33_1: ; %body ; GFX1064-NEXT: ; in Loop: Header=BB33_2 Depth=1 -; GFX1064-NEXT: image_sample v[0:3], v7, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX1064-NEXT: 
image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX1064-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX1064-NEXT: s_cbranch_execz .LBB33_4 ; GFX1064-NEXT: .LBB33_2: ; %loop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_cmp_lt_f32_e32 vcc, 0x40e00000, v8 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v4, v3 -; GFX1064-NEXT: v_mov_b32_e32 v5, v2 -; GFX1064-NEXT: v_mov_b32_e32 v6, v1 -; GFX1064-NEXT: v_mov_b32_e32 v7, v0 +; GFX1064-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064-NEXT: v_mov_b32_e32 v4, v0 ; GFX1064-NEXT: s_cbranch_vccz .LBB33_1 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: ; implicit-def: $vgpr3 +; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX1064-NEXT: ; implicit-def: $vgpr8 ; GFX1064-NEXT: .LBB33_4: ; %break ; GFX1064-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v0, v7 -; GFX1064-NEXT: v_mov_b32_e32 v1, v6 -; GFX1064-NEXT: v_mov_b32_e32 v2, v5 -; GFX1064-NEXT: v_mov_b32_e32 v3, v4 +; GFX1064-NEXT: v_mov_b32_e32 v0, v4 +; GFX1064-NEXT: v_mov_b32_e32 v1, v5 +; GFX1064-NEXT: v_mov_b32_e32 v2, v6 +; GFX1064-NEXT: v_mov_b32_e32 v3, v7 ; GFX1064-NEXT: ; return to shader part epilog entry: br label %loop diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -1835,36 +1835,36 @@ ; GFX9-W64: ; %bb.0: ; %entry ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-W64-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-W64-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] -; GFX9-W64-NEXT: image_store v[0:3], v0, s[0:7] dmask:0xf unorm +; GFX9-W64-NEXT: image_store v[4:7], v0, s[0:7] dmask:0xf unorm ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: 
v_mov_b32_e32 v8, 0 ; GFX9-W64-NEXT: s_mov_b32 s4, 0x40e00000 ; GFX9-W64-NEXT: s_branch .LBB31_2 ; GFX9-W64-NEXT: .LBB31_1: ; %body ; GFX9-W64-NEXT: ; in Loop: Header=BB31_2 Depth=1 -; GFX9-W64-NEXT: image_sample v[0:3], v7, s[0:7], s[0:3] dmask:0xf +; GFX9-W64-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf ; GFX9-W64-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX9-W64-NEXT: s_cbranch_execz .LBB31_4 ; GFX9-W64-NEXT: .LBB31_2: ; %loop ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s4, v8 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-W64-NEXT: v_mov_b32_e32 v5, v2 -; GFX9-W64-NEXT: v_mov_b32_e32 v6, v1 -; GFX9-W64-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s4, v8 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-W64-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-W64-NEXT: v_mov_b32_e32 v3, v7 ; GFX9-W64-NEXT: s_cbranch_vccz .LBB31_1 ; GFX9-W64-NEXT: ; %bb.3: -; GFX9-W64-NEXT: ; implicit-def: $vgpr3 +; GFX9-W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-W64-NEXT: ; implicit-def: $vgpr8 ; GFX9-W64-NEXT: .LBB31_4: ; %break ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_mov_b32_e32 v0, v7 -; GFX9-W64-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-W64-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-W64-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_loop_vcc: @@ -1879,28 +1879,28 @@ ; GFX10-W32-NEXT: .p2align 6 ; GFX10-W32-NEXT: .LBB31_1: ; %body ; GFX10-W32-NEXT: ; in Loop: Header=BB31_2 Depth=1 -; GFX10-W32-NEXT: image_sample v[0:3], v7, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX10-W32-NEXT: s_cbranch_execz .LBB31_4 ; GFX10-W32-NEXT: .LBB31_2: ; %loop ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX10-W32-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10-W32-NEXT: v_mov_b32_e32 v5, v2 -; GFX10-W32-NEXT: v_mov_b32_e32 v6, v1 -; GFX10-W32-NEXT: v_mov_b32_e32 v7, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v7, v3 +; GFX10-W32-NEXT: v_mov_b32_e32 v6, v2 +; GFX10-W32-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-W32-NEXT: s_cbranch_vccz .LBB31_1 ; GFX10-W32-NEXT: ; %bb.3: -; GFX10-W32-NEXT: ; implicit-def: $vgpr3 +; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-W32-NEXT: ; implicit-def: $vgpr8 ; GFX10-W32-NEXT: .LBB31_4: ; %break ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: v_mov_b32_e32 v0, v7 -; GFX10-W32-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-W32-NEXT: v_mov_b32_e32 v2, v5 -; GFX10-W32-NEXT: v_mov_b32_e32 v3, v4 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-W32-NEXT: v_mov_b32_e32 v2, v6 +; GFX10-W32-NEXT: v_mov_b32_e32 v3, v7 ; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-W32-NEXT: ; return to shader part epilog entry: @@ -2136,7 +2136,7 @@ ; GFX9-W64-NEXT: s_cbranch_execz .LBB35_3 ; GFX9-W64-NEXT: s_branch .LBB35_4 ; GFX9-W64-NEXT: .LBB35_2: -; GFX9-W64-NEXT: ; implicit-def: $vgpr3 +; GFX9-W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-W64-NEXT: .LBB35_3: ; %if ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 @@ -2162,7 +2162,7 @@ ; GFX10-W32-NEXT: s_cbranch_execz .LBB35_3 ; GFX10-W32-NEXT: s_branch .LBB35_4 ; GFX10-W32-NEXT: .LBB35_2: -; GFX10-W32-NEXT: ; implicit-def: $vgpr3 +; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-W32-NEXT: .LBB35_3: ; %if ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0