diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -47,6 +47,16 @@
   cl::ReallyHidden,
   cl::init(true));
 
+static cl::opt<bool>
+    ScalarizeLargePHIs("amdgpu-codegenprepare-break-large-phis",
+                       cl::desc("Break large PHI nodes for DAGISel"),
+                       cl::ReallyHidden, cl::init(true));
+
+static cl::opt<unsigned> ScalarizeLargePHIsThreshold(
+    "amdgpu-codegenprepare-break-large-phis-threshold",
+    cl::desc("Minimum type size in bits for breaking large PHI nodes"),
+    cl::ReallyHidden, cl::init(32));
+
 static cl::opt<bool> UseMul24Intrin(
   "amdgpu-codegenprepare-mul24",
   cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
@@ -213,6 +223,7 @@
   bool visitLoadInst(LoadInst &I);
   bool visitICmpInst(ICmpInst &I);
   bool visitSelectInst(SelectInst &I);
+  bool visitPHINode(PHINode &I);
 
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
@@ -1383,6 +1394,130 @@
   return Changed;
 }
 
+bool AMDGPUCodeGenPrepare::visitPHINode(PHINode &I) {
+  // Break-up fixed-vector PHIs into smaller pieces.
+  // Default threshold is 32, so it breaks up any vector that's >32 bits into
+  // its elements, or into 32-bit pieces (for 8/16 bit elts).
+  //
+  // This is only helpful for DAGISel because it doesn't handle large PHIs as
+  // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
+  // With large, odd-sized PHIs we may end up needing many `build_vector`
+  // operations with most elements being "undef". This inhibits a lot of
+  // optimization opportunities and can result in unreasonably high register
+  // pressure and the inevitable stack spilling.
+  //
+  // Returns true if the PHI was replaced (and erased), false if it was left
+  // untouched.
+  if (!ScalarizeLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
+    return false;
+
+  FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
+  if (!FVT || DL->getTypeSizeInBits(FVT) <= ScalarizeLargePHIsThreshold)
+    return false;
+
+  // One piece of the original vector: either a single element or a small
+  // sub-vector starting at element Idx.
+  struct VectorSlice {
+    Type *Ty = nullptr;
+    unsigned Idx = 0;
+    unsigned NumElts = 0;
+    // Sliced incoming values, in the same order as I's incoming values.
+    std::vector<Value *> IncomingValues = {};
+    PHINode *NewPHI = nullptr;
+  };
+
+  std::vector<VectorSlice> Slices;
+
+  Type *EltTy = FVT->getElementType();
+  {
+    unsigned Idx = 0;
+    // For 8/16 bits type, don't scalarize fully but break it up into as many
+    // 32-bit slices as we can, and scalarize the tail.
+    const unsigned EltSize = DL->getTypeSizeInBits(EltTy);
+    const unsigned NumElts = FVT->getNumElements();
+    if (EltSize == 8 || EltSize == 16) {
+      const unsigned SubVecSize = (32 / EltSize);
+      Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
+      for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
+           Idx += SubVecSize)
+        Slices.push_back(VectorSlice{SubVecTy, Idx, SubVecSize});
+    }
+
+    // Scalarize all remaining elements.
+    for (; Idx < NumElts; ++Idx)
+      Slices.push_back(VectorSlice{EltTy, Idx, 1});
+  }
+
+  if (Slices.size() == 1)
+    return false;
+
+  // Break up this PHI's incoming values.
+  for (unsigned Idx = 0; Idx < I.getNumIncomingValues(); ++Idx) {
+    Value *Inc = I.getIncomingValue(Idx);
+    BasicBlock *IncBB = I.getIncomingBlock(Idx);
+
+    // A PHI may list the same [block, value] pair several times (e.g. when
+    // multiple switch cases target this block). Reuse the slices already
+    // made for that pair: fresh extracts would give the new PHIs different
+    // incoming values for one predecessor, which the IR verifier rejects
+    // ("multiple entries for the same basic block").
+    unsigned PrevIdx = 0;
+    while (PrevIdx < Idx && (I.getIncomingBlock(PrevIdx) != IncBB ||
+                             I.getIncomingValue(PrevIdx) != Inc))
+      ++PrevIdx;
+    if (PrevIdx < Idx) {
+      for (VectorSlice &S : Slices) {
+        Value *Prev = S.IncomingValues[PrevIdx];
+        S.IncomingValues.push_back(Prev);
+      }
+      continue;
+    }
+
+    IRBuilder<> B(IncBB->getTerminator());
+    if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
+      B.SetCurrentDebugLocation(IncInst->getDebugLoc());
+
+    unsigned NameSuffix = 0;
+    for (VectorSlice &S : Slices) {
+      const auto ValName =
+          "largephi.extractslice" + std::to_string(NameSuffix++);
+      if (S.NumElts > 1) {
+        SmallVector<int, 4> Mask;
+        for (unsigned K = S.Idx; K < (S.Idx + S.NumElts); ++K)
+          Mask.push_back(K);
+        S.IncomingValues.push_back(B.CreateShuffleVector(Inc, Mask, ValName));
+      } else
+        S.IncomingValues.push_back(B.CreateExtractElement(Inc, S.Idx, ValName));
+    }
+  }
+
+  // Now create one PHI per vector piece.
+  IRBuilder<> B(I.getParent()->getFirstNonPHI());
+  B.SetCurrentDebugLocation(I.getDebugLoc());
+
+  for (VectorSlice &S : Slices) {
+    S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
+    for (const auto &[Idx, BB] : enumerate(I.blocks()))
+      S.NewPHI->addIncoming(S.IncomingValues[Idx], BB);
+  }
+
+  // And replace this PHI with a vector of all the previous PHI values.
+  Value *Vec = PoisonValue::get(FVT);
+  unsigned NameSuffix = 0;
+  for (VectorSlice &S : Slices) {
+    const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
+    if (S.NumElts > 1)
+      Vec =
+          B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);
+    else
+      Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
+  }
+
+  I.replaceAllUsesWith(Vec);
+  I.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
   switch (I.getIntrinsicID()) {
   case Intrinsic::bitreverse:
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -514,114 +514,112 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg2, i64 %arg3, <2 x half> %arg4, <2 x half> %arg5) #3 { ; GFX908-LABEL: introduced_copy_to_sgpr: ; GFX908: ; %bb.0: ; %bb -; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc +; GFX908-NEXT: global_load_ushort v8, v[0:1], off glc ; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX908-NEXT: s_load_dword s9, s[4:5], 0x18 -; GFX908-NEXT: s_mov_b32 s8, 0 -; GFX908-NEXT: s_mov_b32 s5, s8 +; GFX908-NEXT: s_mov_b32 s9, 0 +; GFX908-NEXT: s_load_dword s4, s[4:5], 0x18 +; GFX908-NEXT: v_mov_b32_e32 v11, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX908-NEXT: s_sub_i32 s4, 0, s3 -; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s9 -; 
GFX908-NEXT: v_mov_b32_e32 v19, 0 -; GFX908-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX908-NEXT: s_sub_i32 s5, 0, s3 +; GFX908-NEXT: v_cvt_f32_f16_e32 v9, s4 +; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 -; GFX908-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX908-NEXT: v_readfirstlane_b32 s10, v2 -; GFX908-NEXT: s_mul_i32 s4, s4, s10 -; GFX908-NEXT: s_mul_hi_u32 s4, s10, s4 -; GFX908-NEXT: s_add_i32 s10, s10, s4 -; GFX908-NEXT: s_mul_hi_u32 s4, s2, s10 -; GFX908-NEXT: s_mul_i32 s10, s4, s3 -; GFX908-NEXT: s_sub_i32 s2, s2, s10 -; GFX908-NEXT: s_add_i32 s11, s4, 1 -; GFX908-NEXT: s_sub_i32 s10, s2, s3 +; GFX908-NEXT: v_readfirstlane_b32 s8, v2 +; GFX908-NEXT: s_mul_i32 s5, s5, s8 +; GFX908-NEXT: s_mul_hi_u32 s5, s8, s5 +; GFX908-NEXT: s_add_i32 s8, s8, s5 +; GFX908-NEXT: s_mul_hi_u32 s5, s2, s8 +; GFX908-NEXT: s_mul_i32 s8, s5, s3 +; GFX908-NEXT: s_sub_i32 s2, s2, s8 +; GFX908-NEXT: s_add_i32 s10, s5, 1 +; GFX908-NEXT: s_sub_i32 s8, s2, s3 ; GFX908-NEXT: s_cmp_ge_u32 s2, s3 -; GFX908-NEXT: s_cselect_b32 s4, s11, s4 -; GFX908-NEXT: s_cselect_b32 s2, s10, s2 -; GFX908-NEXT: s_add_i32 s10, s4, 1 +; GFX908-NEXT: s_cselect_b32 s5, s10, s5 +; GFX908-NEXT: s_cselect_b32 s2, s8, s2 +; GFX908-NEXT: s_add_i32 s8, s5, 1 ; GFX908-NEXT: s_cmp_ge_u32 s2, s3 -; GFX908-NEXT: s_cselect_b32 s4, s10, s4 -; GFX908-NEXT: s_lshr_b32 s9, s9, 16 -; GFX908-NEXT: s_lshl_b64 s[12:13], s[4:5], 5 -; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s9 +; GFX908-NEXT: s_cselect_b32 s8, s8, s5 +; GFX908-NEXT: s_lshr_b32 s10, s4, 16 +; GFX908-NEXT: v_cvt_f32_f16_e32 v10, s10 +; GFX908-NEXT: s_lshl_b64 s[10:11], s[8:9], 5 ; GFX908-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 -; GFX908-NEXT: s_lshl_b64 s[10:11], s[6:7], 5 -; GFX908-NEXT: s_or_b32 s10, s10, 28 +; GFX908-NEXT: s_lshl_b64 s[4:5], s[6:7], 5 +; GFX908-NEXT: s_or_b32 s4, 
s4, 28 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s5, v16 -; GFX908-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX908-NEXT: s_mul_i32 s1, s1, s5 -; GFX908-NEXT: s_mul_hi_u32 s9, s0, s5 -; GFX908-NEXT: s_mul_i32 s0, s0, s5 -; GFX908-NEXT: s_add_i32 s1, s9, s1 +; GFX908-NEXT: v_readfirstlane_b32 s9, v8 +; GFX908-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX908-NEXT: s_mul_i32 s1, s1, s9 +; GFX908-NEXT: s_mul_hi_u32 s12, s0, s9 +; GFX908-NEXT: s_mul_i32 s0, s0, s9 +; GFX908-NEXT: s_add_i32 s1, s12, s1 ; GFX908-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 ; GFX908-NEXT: s_branch .LBB3_2 -; GFX908-NEXT: .LBB3_1: ; %Flow20 +; GFX908-NEXT: .LBB3_1: ; %Flow56 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[14:15] +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[12:13] ; GFX908-NEXT: s_cbranch_vccz .LBB3_12 ; GFX908-NEXT: .LBB3_2: ; %bb9 ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB3_5 Depth 2 -; GFX908-NEXT: s_mov_b64 s[16:17], -1 +; GFX908-NEXT: s_mov_b64 s[14:15], -1 ; GFX908-NEXT: s_cbranch_scc0 .LBB3_10 ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX908-NEXT: s_mov_b32 s9, s8 -; GFX908-NEXT: v_mov_b32_e32 v4, s8 -; GFX908-NEXT: v_mov_b32_e32 v8, s8 -; GFX908-NEXT: v_mov_b32_e32 v6, s8 -; GFX908-NEXT: v_mov_b32_e32 v5, s9 -; GFX908-NEXT: v_mov_b32_e32 v9, s9 -; GFX908-NEXT: v_mov_b32_e32 v7, s9 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0 -; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1 -; GFX908-NEXT: v_mov_b32_e32 v11, v5 -; GFX908-NEXT: s_mov_b64 s[20:21], s[10:11] -; GFX908-NEXT: v_mov_b32_e32 v10, v4 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[12:13], s[6:7], 0 +; GFX908-NEXT: v_cmp_gt_i64_e64 s[14:15], s[6:7], -1 +; GFX908-NEXT: v_mov_b32_e32 v12, 0 +; GFX908-NEXT: s_mov_b64 s[18:19], s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v18, 0 +; GFX908-NEXT: v_mov_b32_e32 v17, 0 +; GFX908-NEXT: v_mov_b32_e32 v16, 0 +; 
GFX908-NEXT: v_mov_b32_e32 v15, 0 +; GFX908-NEXT: v_mov_b32_e32 v14, 0 +; GFX908-NEXT: v_mov_b32_e32 v13, 0 +; GFX908-NEXT: v_mov_b32_e32 v19, 0 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s5, v2 -; GFX908-NEXT: v_readfirstlane_b32 s9, v3 -; GFX908-NEXT: s_add_u32 s5, s5, 1 -; GFX908-NEXT: s_addc_u32 s9, s9, 0 -; GFX908-NEXT: s_mul_hi_u32 s19, s2, s5 -; GFX908-NEXT: s_mul_i32 s22, s3, s5 -; GFX908-NEXT: s_mul_i32 s18, s2, s5 -; GFX908-NEXT: s_mul_i32 s5, s2, s9 -; GFX908-NEXT: s_add_i32 s5, s19, s5 -; GFX908-NEXT: s_add_i32 s5, s5, s22 +; GFX908-NEXT: v_readfirstlane_b32 s9, v2 +; GFX908-NEXT: v_readfirstlane_b32 s16, v3 +; GFX908-NEXT: s_add_u32 s9, s9, 1 +; GFX908-NEXT: s_addc_u32 s17, s16, 0 +; GFX908-NEXT: s_mul_hi_u32 s20, s2, s9 +; GFX908-NEXT: s_mul_i32 s21, s3, s9 +; GFX908-NEXT: s_mul_i32 s16, s2, s9 +; GFX908-NEXT: s_mul_i32 s9, s2, s17 +; GFX908-NEXT: s_add_i32 s9, s20, s9 +; GFX908-NEXT: s_add_i32 s9, s9, s21 ; GFX908-NEXT: s_branch .LBB3_5 ; GFX908-NEXT: .LBB3_4: ; %bb58 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX908-NEXT: s_add_u32 s20, s20, s0 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3] -; GFX908-NEXT: s_addc_u32 s21, s21, s1 -; GFX908-NEXT: s_mov_b64 s[22:23], 0 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25] +; GFX908-NEXT: s_add_u32 s18, s18, s0 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[2:3] +; GFX908-NEXT: s_addc_u32 s19, s19, s1 +; GFX908-NEXT: s_mov_b64 s[20:21], 0 +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23] ; GFX908-NEXT: s_cbranch_vccz .LBB3_9 ; GFX908-NEXT: .LBB3_5: ; %bb16 ; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: s_add_u32 s22, s20, 
s18 -; GFX908-NEXT: s_addc_u32 s23, s21, s5 -; GFX908-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc +; GFX908-NEXT: s_add_u32 s20, s18, s16 +; GFX908-NEXT: s_addc_u32 s21, s19, s9 +; GFX908-NEXT: global_load_dword v21, v11, s[20:21] offset:-12 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc +; GFX908-NEXT: global_load_dword v20, v11, s[20:21] offset:-8 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v12, v19, s[22:23] offset:-4 glc +; GFX908-NEXT: global_load_dword v4, v11, s[20:21] offset:-4 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v12, v19, s[22:23] glc +; GFX908-NEXT: global_load_dword v4, v11, s[20:21] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: ds_read_b64 v[12:13], v19 -; GFX908-NEXT: ds_read_b64 v[14:15], v0 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GFX908-NEXT: ds_read_b64 v[4:5], v11 +; GFX908-NEXT: ds_read_b64 v[6:7], v0 +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[14:15] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_cbranch_vccnz .LBB3_7 ; GFX908-NEXT: ; %bb.6: ; %bb51 @@ -630,50 +628,54 @@ ; GFX908-NEXT: v_cvt_f32_f16_e32 v21, v21 ; GFX908-NEXT: v_cvt_f32_f16_sdwa v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX908-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GFX908-NEXT: v_add_f32_e32 v24, v17, v12 -; GFX908-NEXT: v_add_f32_e32 v25, v18, v13 -; GFX908-NEXT: v_add_f32_e32 v26, 0, v12 -; GFX908-NEXT: v_add_f32_e32 v27, 0, v13 -; GFX908-NEXT: v_add_f32_e32 v15, v22, v15 -; GFX908-NEXT: v_add_f32_e32 v14, v21, v14 -; GFX908-NEXT: v_add_f32_e32 v13, v23, v13 -; GFX908-NEXT: v_add_f32_e32 v12, v20, v12 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v25 -; GFX908-NEXT: v_add_f32_e32 v4, v4, v24 -; GFX908-NEXT: v_add_f32_e32 v9, v9, v27 -; GFX908-NEXT: v_add_f32_e32 v8, v8, v26 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v14 -; GFX908-NEXT: v_add_f32_e32 v7, v7, v15 -; GFX908-NEXT: v_add_f32_e32 v10, v10, v12 -; 
GFX908-NEXT: v_add_f32_e32 v11, v11, v13 -; GFX908-NEXT: s_mov_b64 s[22:23], -1 +; GFX908-NEXT: v_add_f32_e32 v24, v10, v5 +; GFX908-NEXT: v_add_f32_e32 v25, v9, v4 +; GFX908-NEXT: v_add_f32_e32 v26, 0, v5 +; GFX908-NEXT: v_add_f32_e32 v27, 0, v4 +; GFX908-NEXT: v_add_f32_e32 v7, v22, v7 +; GFX908-NEXT: v_add_f32_e32 v6, v21, v6 +; GFX908-NEXT: v_add_f32_e32 v5, v23, v5 +; GFX908-NEXT: v_add_f32_e32 v4, v20, v4 +; GFX908-NEXT: v_add_f32_e32 v12, v12, v25 +; GFX908-NEXT: v_add_f32_e32 v18, v18, v24 +; GFX908-NEXT: v_add_f32_e32 v17, v17, v27 +; GFX908-NEXT: v_add_f32_e32 v16, v16, v26 +; GFX908-NEXT: v_add_f32_e32 v15, v15, v6 +; GFX908-NEXT: v_add_f32_e32 v14, v14, v7 +; GFX908-NEXT: v_add_f32_e32 v13, v13, v4 +; GFX908-NEXT: v_add_f32_e32 v19, v19, v5 +; GFX908-NEXT: s_mov_b64 s[20:21], -1 ; GFX908-NEXT: s_branch .LBB3_4 ; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: s_mov_b64 s[22:23], s[14:15] -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23] +; GFX908-NEXT: s_mov_b64 s[20:21], s[12:13] +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GFX908-NEXT: s_cbranch_vccz .LBB3_4 ; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 -; GFX908-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX908-NEXT: ; implicit-def: $vgpr19 +; GFX908-NEXT: ; implicit-def: $vgpr13 +; GFX908-NEXT: ; implicit-def: $vgpr14 +; GFX908-NEXT: ; implicit-def: $vgpr15 +; GFX908-NEXT: ; implicit-def: $vgpr16 +; GFX908-NEXT: ; implicit-def: $vgpr17 +; GFX908-NEXT: ; implicit-def: $vgpr18 +; GFX908-NEXT: ; implicit-def: $vgpr12 ; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX908-NEXT: ; implicit-def: $sgpr20_sgpr21 +; GFX908-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_xor_b64 s[16:17], s[22:23], -1 -; GFX908-NEXT: .LBB3_10: ; %Flow19 +; 
GFX908-NEXT: s_xor_b64 s[14:15], s[20:21], -1 +; GFX908-NEXT: .LBB3_10: ; %Flow55 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_mov_b64 s[14:15], -1 -; GFX908-NEXT: s_and_b64 vcc, exec, s[16:17] +; GFX908-NEXT: s_mov_b64 s[12:13], -1 +; GFX908-NEXT: s_and_b64 vcc, exec, s[14:15] ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; GFX908-NEXT: ; %bb.11: ; %bb12 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_add_u32 s6, s6, s4 +; GFX908-NEXT: s_add_u32 s6, s6, s8 ; GFX908-NEXT: s_addc_u32 s7, s7, 0 -; GFX908-NEXT: s_add_u32 s10, s10, s12 -; GFX908-NEXT: s_addc_u32 s11, s11, s13 -; GFX908-NEXT: s_mov_b64 s[14:15], 0 +; GFX908-NEXT: s_add_u32 s4, s4, s10 +; GFX908-NEXT: s_addc_u32 s5, s5, s11 +; GFX908-NEXT: s_mov_b64 s[12:13], 0 ; GFX908-NEXT: s_branch .LBB3_1 ; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock ; GFX908-NEXT: s_endpgm @@ -683,107 +685,109 @@ ; GFX90A-NEXT: global_load_ushort v18, v[0:1], off glc ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX90A-NEXT: s_load_dword s9, s[4:5], 0x18 -; GFX90A-NEXT: s_mov_b32 s8, 0 -; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s9, 0 +; GFX90A-NEXT: s_load_dword s4, s[4:5], 0x18 +; GFX90A-NEXT: v_mov_b32_e32 v19, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX90A-NEXT: s_sub_i32 s4, 0, s3 -; GFX90A-NEXT: v_mov_b32_e32 v19, 0 +; GFX90A-NEXT: s_sub_i32 s5, 0, s3 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v0 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s9 -; GFX90A-NEXT: v_readfirstlane_b32 s10, v1 -; GFX90A-NEXT: s_mul_i32 s4, s4, s10 -; GFX90A-NEXT: s_mul_hi_u32 s4, s10, s4 -; GFX90A-NEXT: s_add_i32 s10, s10, s4 -; GFX90A-NEXT: s_mul_hi_u32 s4, s2, s10 -; GFX90A-NEXT: s_mul_i32 s10, s4, s3 -; GFX90A-NEXT: s_sub_i32 s2, s2, s10 -; GFX90A-NEXT: s_add_i32 s11, s4, 1 -; GFX90A-NEXT: 
s_sub_i32 s10, s2, s3 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 +; GFX90A-NEXT: s_mul_i32 s5, s5, s8 +; GFX90A-NEXT: s_mul_hi_u32 s5, s8, s5 +; GFX90A-NEXT: s_add_i32 s8, s8, s5 +; GFX90A-NEXT: s_mul_hi_u32 s5, s2, s8 +; GFX90A-NEXT: s_mul_i32 s8, s5, s3 +; GFX90A-NEXT: s_sub_i32 s2, s2, s8 +; GFX90A-NEXT: s_add_i32 s10, s5, 1 +; GFX90A-NEXT: s_sub_i32 s8, s2, s3 ; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s4, s11, s4 -; GFX90A-NEXT: s_cselect_b32 s2, s10, s2 -; GFX90A-NEXT: s_add_i32 s10, s4, 1 +; GFX90A-NEXT: s_cselect_b32 s5, s10, s5 +; GFX90A-NEXT: s_cselect_b32 s2, s8, s2 +; GFX90A-NEXT: s_add_i32 s8, s5, 1 ; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s4, s10, s4 -; GFX90A-NEXT: s_lshr_b32 s9, s9, 16 -; GFX90A-NEXT: s_lshl_b64 s[12:13], s[4:5], 5 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s9 +; GFX90A-NEXT: s_cselect_b32 s8, s8, s5 +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s10 +; GFX90A-NEXT: s_lshl_b64 s[10:11], s[8:9], 5 ; GFX90A-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 -; GFX90A-NEXT: s_lshl_b64 s[10:11], s[6:7], 5 -; GFX90A-NEXT: s_or_b32 s10, s10, 28 +; GFX90A-NEXT: s_lshl_b64 s[4:5], s[6:7], 5 +; GFX90A-NEXT: s_or_b32 s4, s4, 28 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s5, v18 -; GFX90A-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX90A-NEXT: s_mul_i32 s1, s1, s5 -; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s5 -; GFX90A-NEXT: s_mul_i32 s0, s0, s5 -; GFX90A-NEXT: s_add_i32 s1, s9, s1 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v18 +; GFX90A-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX90A-NEXT: s_mul_i32 s1, s1, s9 +; GFX90A-NEXT: s_mul_hi_u32 s12, s0, s9 +; GFX90A-NEXT: s_mul_i32 s0, s0, s9 +; GFX90A-NEXT: s_add_i32 s1, s12, s1 ; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 ; GFX90A-NEXT: s_branch .LBB3_2 -; GFX90A-NEXT: .LBB3_1: ; %Flow20 +; GFX90A-NEXT: .LBB3_1: ; %Flow56 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_andn2_b64 vcc, 
exec, s[14:15] +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[12:13] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_12 ; GFX90A-NEXT: .LBB3_2: ; %bb9 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB3_5 Depth 2 -; GFX90A-NEXT: s_mov_b64 s[16:17], -1 +; GFX90A-NEXT: s_mov_b64 s[14:15], -1 ; GFX90A-NEXT: s_cbranch_scc0 .LBB3_10 ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX90A-NEXT: s_mov_b32 s9, s8 -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0 -; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1 -; GFX90A-NEXT: s_mov_b64 s[20:21], s[10:11] -; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[12:13], s[6:7], 0 +; GFX90A-NEXT: v_cmp_gt_i64_e64 s[14:15], s[6:7], -1 +; GFX90A-NEXT: s_mov_b64 s[18:19], s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v6 +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v6 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s5, v4 -; GFX90A-NEXT: v_readfirstlane_b32 s9, v5 -; GFX90A-NEXT: s_add_u32 s5, s5, 1 -; GFX90A-NEXT: s_addc_u32 s9, s9, 0 -; GFX90A-NEXT: s_mul_hi_u32 s19, s2, s5 -; GFX90A-NEXT: s_mul_i32 s22, s3, s5 -; GFX90A-NEXT: s_mul_i32 s18, s2, s5 -; GFX90A-NEXT: s_mul_i32 s5, s2, s9 -; GFX90A-NEXT: s_add_i32 s5, s19, s5 -; GFX90A-NEXT: s_add_i32 s5, s5, s22 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v4 +; GFX90A-NEXT: v_readfirstlane_b32 s16, v5 +; GFX90A-NEXT: s_add_u32 s9, s9, 1 +; GFX90A-NEXT: s_addc_u32 s17, s16, 0 +; GFX90A-NEXT: s_mul_hi_u32 s20, s2, s9 +; 
GFX90A-NEXT: s_mul_i32 s21, s3, s9 +; GFX90A-NEXT: s_mul_i32 s16, s2, s9 +; GFX90A-NEXT: s_mul_i32 s9, s2, s17 +; GFX90A-NEXT: s_add_i32 s9, s20, s9 +; GFX90A-NEXT: s_add_i32 s9, s9, s21 ; GFX90A-NEXT: s_branch .LBB3_5 ; GFX90A-NEXT: .LBB3_4: ; %bb58 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: s_add_u32 s20, s20, s0 -; GFX90A-NEXT: s_addc_u32 s21, s21, s1 -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5] -; GFX90A-NEXT: s_mov_b64 s[22:23], 0 -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25] +; GFX90A-NEXT: s_add_u32 s18, s18, s0 +; GFX90A-NEXT: s_addc_u32 s19, s19, s1 +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[4:5] +; GFX90A-NEXT: s_mov_b64 s[20:21], 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_9 ; GFX90A-NEXT: .LBB3_5: ; %bb16 ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: s_add_u32 s22, s20, s18 -; GFX90A-NEXT: s_addc_u32 s23, s21, s5 -; GFX90A-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc +; GFX90A-NEXT: s_add_u32 s20, s18, s16 +; GFX90A-NEXT: s_addc_u32 s21, s19, s9 +; GFX90A-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc +; GFX90A-NEXT: global_load_dword v20, v19, s[20:21] offset:-8 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] offset:-4 glc +; GFX90A-NEXT: global_load_dword v14, v19, s[20:21] offset:-4 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] glc +; GFX90A-NEXT: global_load_dword v14, v19, s[20:21] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ds_read_b64 v[14:15], v19 ; GFX90A-NEXT: ds_read_b64 v[16:17], v0 -; GFX90A-NEXT: s_andn2_b64 vcc, 
exec, s[16:17] -; GFX90A-NEXT: ; kill: killed $sgpr22 killed $sgpr23 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[14:15] +; GFX90A-NEXT: ; kill: killed $sgpr20 killed $sgpr21 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7 ; GFX90A-NEXT: ; %bb.6: ; %bb51 @@ -797,37 +801,37 @@ ; GFX90A-NEXT: v_pk_add_f32 v[16:17], v[22:23], v[16:17] ; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[20:21], v[14:15] ; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[24:25] -; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[26:27] -; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[16:17] -; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15] -; GFX90A-NEXT: s_mov_b64 s[22:23], -1 +; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[26:27] +; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[16:17] +; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[14:15] +; GFX90A-NEXT: s_mov_b64 s[20:21], -1 ; GFX90A-NEXT: s_branch .LBB3_4 ; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: s_mov_b64 s[22:23], s[14:15] -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23] +; GFX90A-NEXT: s_mov_b64 s[20:21], s[12:13] +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_4 ; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: ; implicit-def: $vgpr12_vgpr13 -; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX90A-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: ; implicit-def: $vgpr9 +; GFX90A-NEXT: ; implicit-def: $vgpr11 +; GFX90A-NEXT: ; implicit-def: $vgpr13 +; GFX90A-NEXT: ; implicit-def: $vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $sgpr20_sgpr21 +; GFX90A-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_xor_b64 s[16:17], s[22:23], -1 -; GFX90A-NEXT: .LBB3_10: ; %Flow19 +; GFX90A-NEXT: s_xor_b64 s[14:15], s[20:21], -1 +; GFX90A-NEXT: .LBB3_10: ; %Flow55 ; 
GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_mov_b64 s[14:15], -1 -; GFX90A-NEXT: s_and_b64 vcc, exec, s[16:17] +; GFX90A-NEXT: s_mov_b64 s[12:13], -1 +; GFX90A-NEXT: s_and_b64 vcc, exec, s[14:15] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: ; %bb.11: ; %bb12 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_add_u32 s6, s6, s4 +; GFX90A-NEXT: s_add_u32 s6, s6, s8 ; GFX90A-NEXT: s_addc_u32 s7, s7, 0 -; GFX90A-NEXT: s_add_u32 s10, s10, s12 -; GFX90A-NEXT: s_addc_u32 s11, s11, s13 -; GFX90A-NEXT: s_mov_b64 s[14:15], 0 +; GFX90A-NEXT: s_add_u32 s4, s4, s10 +; GFX90A-NEXT: s_addc_u32 s5, s5, s11 +; GFX90A-NEXT: s_mov_b64 s[12:13], 0 ; GFX90A-NEXT: s_branch .LBB3_1 ; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock ; GFX90A-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll @@ -0,0 +1,1060 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck %s --check-prefixes=OPT +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare -global-isel %s | FileCheck %s --check-prefixes=NOOPT +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare -amdgpu-codegenprepare-break-large-phis=0 %s | FileCheck %s --check-prefixes=NOOPT + +define amdgpu_kernel void @phi_v5f64(<5 x double> %in, ptr %out, i1 %cond) { +; OPT-LABEL: @phi_v5f64( +; OPT-NEXT: entry: +; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; OPT: then: +; OPT-NEXT: [[X:%.*]] = insertelement <5 x double> [[IN:%.*]], double 3.140000e+00, i32 3 +; OPT-NEXT: [[TMP0:%.*]] = extractelement <5 x double> [[X]], i64 0 +; OPT-NEXT: [[TMP1:%.*]] = extractelement <5 x double> [[X]], i64 1 +; OPT-NEXT: [[TMP2:%.*]] = extractelement <5 x double> [[X]], i64 2 +; OPT-NEXT: 
[[TMP3:%.*]] = extractelement <5 x double> [[X]], i64 3 +; OPT-NEXT: [[TMP4:%.*]] = extractelement <5 x double> [[X]], i64 4 +; OPT-NEXT: br label [[FINALLY:%.*]] +; OPT: else: +; OPT-NEXT: [[Y:%.*]] = insertelement <5 x double> [[IN]], double 9.140000e+00, i32 2 +; OPT-NEXT: [[TMP5:%.*]] = extractelement <5 x double> [[Y]], i64 0 +; OPT-NEXT: [[TMP6:%.*]] = extractelement <5 x double> [[Y]], i64 1 +; OPT-NEXT: [[TMP7:%.*]] = extractelement <5 x double> [[Y]], i64 2 +; OPT-NEXT: [[TMP8:%.*]] = extractelement <5 x double> [[Y]], i64 3 +; OPT-NEXT: [[TMP9:%.*]] = extractelement <5 x double> [[Y]], i64 4 +; OPT-NEXT: br label [[FINALLY]] +; OPT: finally: +; OPT-NEXT: [[TMP10:%.*]] = phi double [ [[TMP0]], [[THEN]] ], [ [[TMP5]], [[ELSE]] ] +; OPT-NEXT: [[TMP11:%.*]] = phi double [ [[TMP1]], [[THEN]] ], [ [[TMP6]], [[ELSE]] ] +; OPT-NEXT: [[TMP12:%.*]] = phi double [ [[TMP2]], [[THEN]] ], [ [[TMP7]], [[ELSE]] ] +; OPT-NEXT: [[TMP13:%.*]] = phi double [ [[TMP3]], [[THEN]] ], [ [[TMP8]], [[ELSE]] ] +; OPT-NEXT: [[TMP14:%.*]] = phi double [ [[TMP4]], [[THEN]] ], [ [[TMP9]], [[ELSE]] ] +; OPT-NEXT: [[TMP15:%.*]] = insertelement <5 x double> poison, double [[TMP10]], i64 0 +; OPT-NEXT: [[TMP16:%.*]] = insertelement <5 x double> [[TMP15]], double [[TMP11]], i64 1 +; OPT-NEXT: [[TMP17:%.*]] = insertelement <5 x double> [[TMP16]], double [[TMP12]], i64 2 +; OPT-NEXT: [[TMP18:%.*]] = insertelement <5 x double> [[TMP17]], double [[TMP13]], i64 3 +; OPT-NEXT: [[TMP19:%.*]] = insertelement <5 x double> [[TMP18]], double [[TMP14]], i64 4 +; OPT-NEXT: store <5 x double> [[TMP19]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: ret void +; +; NOOPT-LABEL: @phi_v5f64( +; NOOPT-NEXT: entry: +; NOOPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; NOOPT: then: +; NOOPT-NEXT: [[X:%.*]] = insertelement <5 x double> [[IN:%.*]], double 3.140000e+00, i32 3 +; NOOPT-NEXT: br label [[FINALLY:%.*]] +; NOOPT: else: +; NOOPT-NEXT: [[Y:%.*]] = insertelement <5 x double> [[IN]], double 
9.140000e+00, i32 2 +; NOOPT-NEXT: br label [[FINALLY]] +; NOOPT: finally: +; NOOPT-NEXT: [[VAL:%.*]] = phi <5 x double> [ [[X]], [[THEN]] ], [ [[Y]], [[ELSE]] ] +; NOOPT-NEXT: store <5 x double> [[VAL]], ptr [[OUT:%.*]], align 1 +; NOOPT-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + %x = insertelement <5 x double> %in, double 3.14, i32 3 + br label %finally +else: + %y = insertelement <5 x double> %in, double 9.14, i32 2 + br label %finally +finally: + %val = phi <5 x double> [%x, %then], [%y, %else] + store <5 x double> %val, ptr %out, align 1 + ret void +} + +define amdgpu_kernel void @phi_v7f64(<7 x double> %in, ptr %out, i1 %cond) { +; OPT-LABEL: @phi_v7f64( +; OPT-NEXT: entry: +; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; OPT: then: +; OPT-NEXT: [[X:%.*]] = insertelement <7 x double> [[IN:%.*]], double 3.140000e+00, i32 3 +; OPT-NEXT: [[TMP0:%.*]] = extractelement <7 x double> [[X]], i64 0 +; OPT-NEXT: [[TMP1:%.*]] = extractelement <7 x double> [[X]], i64 1 +; OPT-NEXT: [[TMP2:%.*]] = extractelement <7 x double> [[X]], i64 2 +; OPT-NEXT: [[TMP3:%.*]] = extractelement <7 x double> [[X]], i64 3 +; OPT-NEXT: [[TMP4:%.*]] = extractelement <7 x double> [[X]], i64 4 +; OPT-NEXT: [[TMP5:%.*]] = extractelement <7 x double> [[X]], i64 5 +; OPT-NEXT: [[TMP6:%.*]] = extractelement <7 x double> [[X]], i64 6 +; OPT-NEXT: br label [[FINALLY:%.*]] +; OPT: else: +; OPT-NEXT: [[Y:%.*]] = insertelement <7 x double> [[IN]], double 9.140000e+00, i32 6 +; OPT-NEXT: [[TMP7:%.*]] = extractelement <7 x double> [[Y]], i64 0 +; OPT-NEXT: [[TMP8:%.*]] = extractelement <7 x double> [[Y]], i64 1 +; OPT-NEXT: [[TMP9:%.*]] = extractelement <7 x double> [[Y]], i64 2 +; OPT-NEXT: [[TMP10:%.*]] = extractelement <7 x double> [[Y]], i64 3 +; OPT-NEXT: [[TMP11:%.*]] = extractelement <7 x double> [[Y]], i64 4 +; OPT-NEXT: [[TMP12:%.*]] = extractelement <7 x double> [[Y]], i64 5 +; OPT-NEXT: [[TMP13:%.*]] = extractelement <7 x double> 
[[Y]], i64 6 +; OPT-NEXT: br label [[FINALLY]] +; OPT: finally: +; OPT-NEXT: [[TMP14:%.*]] = phi double [ [[TMP0]], [[THEN]] ], [ [[TMP7]], [[ELSE]] ] +; OPT-NEXT: [[TMP15:%.*]] = phi double [ [[TMP1]], [[THEN]] ], [ [[TMP8]], [[ELSE]] ] +; OPT-NEXT: [[TMP16:%.*]] = phi double [ [[TMP2]], [[THEN]] ], [ [[TMP9]], [[ELSE]] ] +; OPT-NEXT: [[TMP17:%.*]] = phi double [ [[TMP3]], [[THEN]] ], [ [[TMP10]], [[ELSE]] ] +; OPT-NEXT: [[TMP18:%.*]] = phi double [ [[TMP4]], [[THEN]] ], [ [[TMP11]], [[ELSE]] ] +; OPT-NEXT: [[TMP19:%.*]] = phi double [ [[TMP5]], [[THEN]] ], [ [[TMP12]], [[ELSE]] ] +; OPT-NEXT: [[TMP20:%.*]] = phi double [ [[TMP6]], [[THEN]] ], [ [[TMP13]], [[ELSE]] ] +; OPT-NEXT: [[TMP21:%.*]] = insertelement <7 x double> poison, double [[TMP14]], i64 0 +; OPT-NEXT: [[TMP22:%.*]] = insertelement <7 x double> [[TMP21]], double [[TMP15]], i64 1 +; OPT-NEXT: [[TMP23:%.*]] = insertelement <7 x double> [[TMP22]], double [[TMP16]], i64 2 +; OPT-NEXT: [[TMP24:%.*]] = insertelement <7 x double> [[TMP23]], double [[TMP17]], i64 3 +; OPT-NEXT: [[TMP25:%.*]] = insertelement <7 x double> [[TMP24]], double [[TMP18]], i64 4 +; OPT-NEXT: [[TMP26:%.*]] = insertelement <7 x double> [[TMP25]], double [[TMP19]], i64 5 +; OPT-NEXT: [[TMP27:%.*]] = insertelement <7 x double> [[TMP26]], double [[TMP20]], i64 6 +; OPT-NEXT: store <7 x double> [[TMP27]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: ret void +; +; NOOPT-LABEL: @phi_v7f64( +; NOOPT-NEXT: entry: +; NOOPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; NOOPT: then: +; NOOPT-NEXT: [[X:%.*]] = insertelement <7 x double> [[IN:%.*]], double 3.140000e+00, i32 3 +; NOOPT-NEXT: br label [[FINALLY:%.*]] +; NOOPT: else: +; NOOPT-NEXT: [[Y:%.*]] = insertelement <7 x double> [[IN]], double 9.140000e+00, i32 6 +; NOOPT-NEXT: br label [[FINALLY]] +; NOOPT: finally: +; NOOPT-NEXT: [[VAL:%.*]] = phi <7 x double> [ [[X]], [[THEN]] ], [ [[Y]], [[ELSE]] ] +; NOOPT-NEXT: store <7 x double> [[VAL]], ptr [[OUT:%.*]], align 1 +; 
NOOPT-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + %x = insertelement <7 x double> %in, double 3.14, i32 3 + br label %finally +else: + %y = insertelement <7 x double> %in, double 9.14, i32 6 + br label %finally +finally: + %val = phi <7 x double> [%x, %then], [%y, %else] + store <7 x double> %val, ptr %out, align 1 + ret void +} + +define amdgpu_kernel void @phi_v11f64(<11 x double> %in, ptr %out, i1 %cond) { +; OPT-LABEL: @phi_v11f64( +; OPT-NEXT: entry: +; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; OPT: then: +; OPT-NEXT: [[X:%.*]] = insertelement <11 x double> [[IN:%.*]], double 3.140000e+00, i32 3 +; OPT-NEXT: [[TMP0:%.*]] = extractelement <11 x double> [[X]], i64 0 +; OPT-NEXT: [[TMP1:%.*]] = extractelement <11 x double> [[X]], i64 1 +; OPT-NEXT: [[TMP2:%.*]] = extractelement <11 x double> [[X]], i64 2 +; OPT-NEXT: [[TMP3:%.*]] = extractelement <11 x double> [[X]], i64 3 +; OPT-NEXT: [[TMP4:%.*]] = extractelement <11 x double> [[X]], i64 4 +; OPT-NEXT: [[TMP5:%.*]] = extractelement <11 x double> [[X]], i64 5 +; OPT-NEXT: [[TMP6:%.*]] = extractelement <11 x double> [[X]], i64 6 +; OPT-NEXT: [[TMP7:%.*]] = extractelement <11 x double> [[X]], i64 7 +; OPT-NEXT: [[TMP8:%.*]] = extractelement <11 x double> [[X]], i64 8 +; OPT-NEXT: [[TMP9:%.*]] = extractelement <11 x double> [[X]], i64 9 +; OPT-NEXT: [[TMP10:%.*]] = extractelement <11 x double> [[X]], i64 10 +; OPT-NEXT: br label [[FINALLY:%.*]] +; OPT: else: +; OPT-NEXT: [[Y:%.*]] = insertelement <11 x double> [[IN]], double 9.140000e+00, i32 6 +; OPT-NEXT: [[TMP11:%.*]] = extractelement <11 x double> [[Y]], i64 0 +; OPT-NEXT: [[TMP12:%.*]] = extractelement <11 x double> [[Y]], i64 1 +; OPT-NEXT: [[TMP13:%.*]] = extractelement <11 x double> [[Y]], i64 2 +; OPT-NEXT: [[TMP14:%.*]] = extractelement <11 x double> [[Y]], i64 3 +; OPT-NEXT: [[TMP15:%.*]] = extractelement <11 x double> [[Y]], i64 4 +; OPT-NEXT: [[TMP16:%.*]] = extractelement <11 x double> 
[[Y]], i64 5 +; OPT-NEXT: [[TMP17:%.*]] = extractelement <11 x double> [[Y]], i64 6 +; OPT-NEXT: [[TMP18:%.*]] = extractelement <11 x double> [[Y]], i64 7 +; OPT-NEXT: [[TMP19:%.*]] = extractelement <11 x double> [[Y]], i64 8 +; OPT-NEXT: [[TMP20:%.*]] = extractelement <11 x double> [[Y]], i64 9 +; OPT-NEXT: [[TMP21:%.*]] = extractelement <11 x double> [[Y]], i64 10 +; OPT-NEXT: br label [[FINALLY]] +; OPT: finally: +; OPT-NEXT: [[TMP22:%.*]] = phi double [ [[TMP0]], [[THEN]] ], [ [[TMP11]], [[ELSE]] ] +; OPT-NEXT: [[TMP23:%.*]] = phi double [ [[TMP1]], [[THEN]] ], [ [[TMP12]], [[ELSE]] ] +; OPT-NEXT: [[TMP24:%.*]] = phi double [ [[TMP2]], [[THEN]] ], [ [[TMP13]], [[ELSE]] ] +; OPT-NEXT: [[TMP25:%.*]] = phi double [ [[TMP3]], [[THEN]] ], [ [[TMP14]], [[ELSE]] ] +; OPT-NEXT: [[TMP26:%.*]] = phi double [ [[TMP4]], [[THEN]] ], [ [[TMP15]], [[ELSE]] ] +; OPT-NEXT: [[TMP27:%.*]] = phi double [ [[TMP5]], [[THEN]] ], [ [[TMP16]], [[ELSE]] ] +; OPT-NEXT: [[TMP28:%.*]] = phi double [ [[TMP6]], [[THEN]] ], [ [[TMP17]], [[ELSE]] ] +; OPT-NEXT: [[TMP29:%.*]] = phi double [ [[TMP7]], [[THEN]] ], [ [[TMP18]], [[ELSE]] ] +; OPT-NEXT: [[TMP30:%.*]] = phi double [ [[TMP8]], [[THEN]] ], [ [[TMP19]], [[ELSE]] ] +; OPT-NEXT: [[TMP31:%.*]] = phi double [ [[TMP9]], [[THEN]] ], [ [[TMP20]], [[ELSE]] ] +; OPT-NEXT: [[TMP32:%.*]] = phi double [ [[TMP10]], [[THEN]] ], [ [[TMP21]], [[ELSE]] ] +; OPT-NEXT: [[TMP33:%.*]] = insertelement <11 x double> poison, double [[TMP22]], i64 0 +; OPT-NEXT: [[TMP34:%.*]] = insertelement <11 x double> [[TMP33]], double [[TMP23]], i64 1 +; OPT-NEXT: [[TMP35:%.*]] = insertelement <11 x double> [[TMP34]], double [[TMP24]], i64 2 +; OPT-NEXT: [[TMP36:%.*]] = insertelement <11 x double> [[TMP35]], double [[TMP25]], i64 3 +; OPT-NEXT: [[TMP37:%.*]] = insertelement <11 x double> [[TMP36]], double [[TMP26]], i64 4 +; OPT-NEXT: [[TMP38:%.*]] = insertelement <11 x double> [[TMP37]], double [[TMP27]], i64 5 +; OPT-NEXT: [[TMP39:%.*]] = insertelement <11 x double> 
[[TMP38]], double [[TMP28]], i64 6 +; OPT-NEXT: [[TMP40:%.*]] = insertelement <11 x double> [[TMP39]], double [[TMP29]], i64 7 +; OPT-NEXT: [[TMP41:%.*]] = insertelement <11 x double> [[TMP40]], double [[TMP30]], i64 8 +; OPT-NEXT: [[TMP42:%.*]] = insertelement <11 x double> [[TMP41]], double [[TMP31]], i64 9 +; OPT-NEXT: [[TMP43:%.*]] = insertelement <11 x double> [[TMP42]], double [[TMP32]], i64 10 +; OPT-NEXT: store <11 x double> [[TMP43]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: ret void +; +; NOOPT-LABEL: @phi_v11f64( +; NOOPT-NEXT: entry: +; NOOPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; NOOPT: then: +; NOOPT-NEXT: [[X:%.*]] = insertelement <11 x double> [[IN:%.*]], double 3.140000e+00, i32 3 +; NOOPT-NEXT: br label [[FINALLY:%.*]] +; NOOPT: else: +; NOOPT-NEXT: [[Y:%.*]] = insertelement <11 x double> [[IN]], double 9.140000e+00, i32 6 +; NOOPT-NEXT: br label [[FINALLY]] +; NOOPT: finally: +; NOOPT-NEXT: [[VAL:%.*]] = phi <11 x double> [ [[X]], [[THEN]] ], [ [[Y]], [[ELSE]] ] +; NOOPT-NEXT: store <11 x double> [[VAL]], ptr [[OUT:%.*]], align 1 +; NOOPT-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + %x = insertelement <11 x double> %in, double 3.14, i32 3 + br label %finally +else: + %y = insertelement <11 x double> %in, double 9.14, i32 6 + br label %finally +finally: + %val = phi <11 x double> [%x, %then], [%y, %else] + store <11 x double> %val, ptr %out, align 1 + ret void +} + +define amdgpu_kernel void @phi_v11f64_cst(<11 x double> %in, ptr %out, i1 %cond) { +; OPT-LABEL: @phi_v11f64_cst( +; OPT-NEXT: entry: +; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[FINALLY:%.*]] +; OPT: then: +; OPT-NEXT: [[X:%.*]] = insertelement <11 x double> [[IN:%.*]], double 3.140000e+00, i32 3 +; OPT-NEXT: [[TMP0:%.*]] = extractelement <11 x double> [[X]], i64 0 +; OPT-NEXT: [[TMP1:%.*]] = extractelement <11 x double> [[X]], i64 1 +; OPT-NEXT: [[TMP2:%.*]] = extractelement <11 x double> [[X]], i64 2 +; 
OPT-NEXT: [[TMP3:%.*]] = extractelement <11 x double> [[X]], i64 3 +; OPT-NEXT: [[TMP4:%.*]] = extractelement <11 x double> [[X]], i64 4 +; OPT-NEXT: [[TMP5:%.*]] = extractelement <11 x double> [[X]], i64 5 +; OPT-NEXT: [[TMP6:%.*]] = extractelement <11 x double> [[X]], i64 6 +; OPT-NEXT: [[TMP7:%.*]] = extractelement <11 x double> [[X]], i64 7 +; OPT-NEXT: [[TMP8:%.*]] = extractelement <11 x double> [[X]], i64 8 +; OPT-NEXT: [[TMP9:%.*]] = extractelement <11 x double> [[X]], i64 9 +; OPT-NEXT: [[TMP10:%.*]] = extractelement <11 x double> [[X]], i64 10 +; OPT-NEXT: br label [[FINALLY]] +; OPT: finally: +; OPT-NEXT: [[TMP11:%.*]] = phi double [ [[TMP0]], [[THEN]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; OPT-NEXT: [[TMP12:%.*]] = phi double [ [[TMP1]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] +; OPT-NEXT: [[TMP13:%.*]] = phi double [ [[TMP2]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] +; OPT-NEXT: [[TMP14:%.*]] = phi double [ [[TMP3]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] +; OPT-NEXT: [[TMP15:%.*]] = phi double [ [[TMP4]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] +; OPT-NEXT: [[TMP16:%.*]] = phi double [ [[TMP5]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] +; OPT-NEXT: [[TMP17:%.*]] = phi double [ [[TMP6]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] +; OPT-NEXT: [[TMP18:%.*]] = phi double [ [[TMP7]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] +; OPT-NEXT: [[TMP19:%.*]] = phi double [ [[TMP8]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] +; OPT-NEXT: [[TMP20:%.*]] = phi double [ [[TMP9]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] +; OPT-NEXT: [[TMP21:%.*]] = phi double [ [[TMP10]], [[THEN]] ], [ 0.000000e+00, [[ENTRY]] ] +; OPT-NEXT: [[TMP22:%.*]] = insertelement <11 x double> poison, double [[TMP11]], i64 0 +; OPT-NEXT: [[TMP23:%.*]] = insertelement <11 x double> [[TMP22]], double [[TMP12]], i64 1 +; OPT-NEXT: [[TMP24:%.*]] = insertelement <11 x double> [[TMP23]], double [[TMP13]], i64 2 +; OPT-NEXT: [[TMP25:%.*]] = insertelement <11 x double> [[TMP24]], double [[TMP14]], i64 3 +; 
OPT-NEXT: [[TMP26:%.*]] = insertelement <11 x double> [[TMP25]], double [[TMP15]], i64 4 +; OPT-NEXT: [[TMP27:%.*]] = insertelement <11 x double> [[TMP26]], double [[TMP16]], i64 5 +; OPT-NEXT: [[TMP28:%.*]] = insertelement <11 x double> [[TMP27]], double [[TMP17]], i64 6 +; OPT-NEXT: [[TMP29:%.*]] = insertelement <11 x double> [[TMP28]], double [[TMP18]], i64 7 +; OPT-NEXT: [[TMP30:%.*]] = insertelement <11 x double> [[TMP29]], double [[TMP19]], i64 8 +; OPT-NEXT: [[TMP31:%.*]] = insertelement <11 x double> [[TMP30]], double [[TMP20]], i64 9 +; OPT-NEXT: [[TMP32:%.*]] = insertelement <11 x double> [[TMP31]], double [[TMP21]], i64 10 +; OPT-NEXT: store <11 x double> [[TMP32]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: ret void +; +; NOOPT-LABEL: @phi_v11f64_cst( +; NOOPT-NEXT: entry: +; NOOPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[FINALLY:%.*]] +; NOOPT: then: +; NOOPT-NEXT: [[X:%.*]] = insertelement <11 x double> [[IN:%.*]], double 3.140000e+00, i32 3 +; NOOPT-NEXT: br label [[FINALLY]] +; NOOPT: finally: +; NOOPT-NEXT: [[VAL:%.*]] = phi <11 x double> [ [[X]], [[THEN]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; NOOPT-NEXT: store <11 x double> [[VAL]], ptr [[OUT:%.*]], align 1 +; NOOPT-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %finally +then: + %x = insertelement <11 x double> %in, double 3.14, i32 3 + br label %finally +finally: + %val = phi <11 x double> [%x, %then], [zeroinitializer, %entry] + store <11 x double> %val, ptr %out, align 1 + ret void +} + +define amdgpu_kernel void @phi_v15i64(<15 x i64> %in, ptr %out, i1 %cond) { +; OPT-LABEL: @phi_v15i64( +; OPT-NEXT: entry: +; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; OPT: then: +; OPT-NEXT: [[X:%.*]] = insertelement <15 x i64> [[IN:%.*]], i64 42, i32 3 +; OPT-NEXT: [[TMP0:%.*]] = extractelement <15 x i64> [[X]], i64 0 +; OPT-NEXT: [[TMP1:%.*]] = extractelement <15 x i64> [[X]], i64 1 +; OPT-NEXT: [[TMP2:%.*]] = extractelement <15 x i64> [[X]], i64 2 +; 
OPT-NEXT: [[TMP3:%.*]] = extractelement <15 x i64> [[X]], i64 3 +; OPT-NEXT: [[TMP4:%.*]] = extractelement <15 x i64> [[X]], i64 4 +; OPT-NEXT: [[TMP5:%.*]] = extractelement <15 x i64> [[X]], i64 5 +; OPT-NEXT: [[TMP6:%.*]] = extractelement <15 x i64> [[X]], i64 6 +; OPT-NEXT: [[TMP7:%.*]] = extractelement <15 x i64> [[X]], i64 7 +; OPT-NEXT: [[TMP8:%.*]] = extractelement <15 x i64> [[X]], i64 8 +; OPT-NEXT: [[TMP9:%.*]] = extractelement <15 x i64> [[X]], i64 9 +; OPT-NEXT: [[TMP10:%.*]] = extractelement <15 x i64> [[X]], i64 10 +; OPT-NEXT: [[TMP11:%.*]] = extractelement <15 x i64> [[X]], i64 11 +; OPT-NEXT: [[TMP12:%.*]] = extractelement <15 x i64> [[X]], i64 12 +; OPT-NEXT: [[TMP13:%.*]] = extractelement <15 x i64> [[X]], i64 13 +; OPT-NEXT: [[TMP14:%.*]] = extractelement <15 x i64> [[X]], i64 14 +; OPT-NEXT: br label [[FINALLY:%.*]] +; OPT: else: +; OPT-NEXT: [[Y:%.*]] = insertelement <15 x i64> [[IN]], i64 64, i32 6 +; OPT-NEXT: [[TMP15:%.*]] = extractelement <15 x i64> [[Y]], i64 0 +; OPT-NEXT: [[TMP16:%.*]] = extractelement <15 x i64> [[Y]], i64 1 +; OPT-NEXT: [[TMP17:%.*]] = extractelement <15 x i64> [[Y]], i64 2 +; OPT-NEXT: [[TMP18:%.*]] = extractelement <15 x i64> [[Y]], i64 3 +; OPT-NEXT: [[TMP19:%.*]] = extractelement <15 x i64> [[Y]], i64 4 +; OPT-NEXT: [[TMP20:%.*]] = extractelement <15 x i64> [[Y]], i64 5 +; OPT-NEXT: [[TMP21:%.*]] = extractelement <15 x i64> [[Y]], i64 6 +; OPT-NEXT: [[TMP22:%.*]] = extractelement <15 x i64> [[Y]], i64 7 +; OPT-NEXT: [[TMP23:%.*]] = extractelement <15 x i64> [[Y]], i64 8 +; OPT-NEXT: [[TMP24:%.*]] = extractelement <15 x i64> [[Y]], i64 9 +; OPT-NEXT: [[TMP25:%.*]] = extractelement <15 x i64> [[Y]], i64 10 +; OPT-NEXT: [[TMP26:%.*]] = extractelement <15 x i64> [[Y]], i64 11 +; OPT-NEXT: [[TMP27:%.*]] = extractelement <15 x i64> [[Y]], i64 12 +; OPT-NEXT: [[TMP28:%.*]] = extractelement <15 x i64> [[Y]], i64 13 +; OPT-NEXT: [[TMP29:%.*]] = extractelement <15 x i64> [[Y]], i64 14 +; OPT-NEXT: br label [[FINALLY]] +; 
OPT: finally: +; OPT-NEXT: [[TMP30:%.*]] = phi i64 [ [[TMP0]], [[THEN]] ], [ [[TMP15]], [[ELSE]] ] +; OPT-NEXT: [[TMP31:%.*]] = phi i64 [ [[TMP1]], [[THEN]] ], [ [[TMP16]], [[ELSE]] ] +; OPT-NEXT: [[TMP32:%.*]] = phi i64 [ [[TMP2]], [[THEN]] ], [ [[TMP17]], [[ELSE]] ] +; OPT-NEXT: [[TMP33:%.*]] = phi i64 [ [[TMP3]], [[THEN]] ], [ [[TMP18]], [[ELSE]] ] +; OPT-NEXT: [[TMP34:%.*]] = phi i64 [ [[TMP4]], [[THEN]] ], [ [[TMP19]], [[ELSE]] ] +; OPT-NEXT: [[TMP35:%.*]] = phi i64 [ [[TMP5]], [[THEN]] ], [ [[TMP20]], [[ELSE]] ] +; OPT-NEXT: [[TMP36:%.*]] = phi i64 [ [[TMP6]], [[THEN]] ], [ [[TMP21]], [[ELSE]] ] +; OPT-NEXT: [[TMP37:%.*]] = phi i64 [ [[TMP7]], [[THEN]] ], [ [[TMP22]], [[ELSE]] ] +; OPT-NEXT: [[TMP38:%.*]] = phi i64 [ [[TMP8]], [[THEN]] ], [ [[TMP23]], [[ELSE]] ] +; OPT-NEXT: [[TMP39:%.*]] = phi i64 [ [[TMP9]], [[THEN]] ], [ [[TMP24]], [[ELSE]] ] +; OPT-NEXT: [[TMP40:%.*]] = phi i64 [ [[TMP10]], [[THEN]] ], [ [[TMP25]], [[ELSE]] ] +; OPT-NEXT: [[TMP41:%.*]] = phi i64 [ [[TMP11]], [[THEN]] ], [ [[TMP26]], [[ELSE]] ] +; OPT-NEXT: [[TMP42:%.*]] = phi i64 [ [[TMP12]], [[THEN]] ], [ [[TMP27]], [[ELSE]] ] +; OPT-NEXT: [[TMP43:%.*]] = phi i64 [ [[TMP13]], [[THEN]] ], [ [[TMP28]], [[ELSE]] ] +; OPT-NEXT: [[TMP44:%.*]] = phi i64 [ [[TMP14]], [[THEN]] ], [ [[TMP29]], [[ELSE]] ] +; OPT-NEXT: [[TMP45:%.*]] = insertelement <15 x i64> poison, i64 [[TMP30]], i64 0 +; OPT-NEXT: [[TMP46:%.*]] = insertelement <15 x i64> [[TMP45]], i64 [[TMP31]], i64 1 +; OPT-NEXT: [[TMP47:%.*]] = insertelement <15 x i64> [[TMP46]], i64 [[TMP32]], i64 2 +; OPT-NEXT: [[TMP48:%.*]] = insertelement <15 x i64> [[TMP47]], i64 [[TMP33]], i64 3 +; OPT-NEXT: [[TMP49:%.*]] = insertelement <15 x i64> [[TMP48]], i64 [[TMP34]], i64 4 +; OPT-NEXT: [[TMP50:%.*]] = insertelement <15 x i64> [[TMP49]], i64 [[TMP35]], i64 5 +; OPT-NEXT: [[TMP51:%.*]] = insertelement <15 x i64> [[TMP50]], i64 [[TMP36]], i64 6 +; OPT-NEXT: [[TMP52:%.*]] = insertelement <15 x i64> [[TMP51]], i64 [[TMP37]], i64 7 +; OPT-NEXT: 
[[TMP53:%.*]] = insertelement <15 x i64> [[TMP52]], i64 [[TMP38]], i64 8 +; OPT-NEXT: [[TMP54:%.*]] = insertelement <15 x i64> [[TMP53]], i64 [[TMP39]], i64 9 +; OPT-NEXT: [[TMP55:%.*]] = insertelement <15 x i64> [[TMP54]], i64 [[TMP40]], i64 10 +; OPT-NEXT: [[TMP56:%.*]] = insertelement <15 x i64> [[TMP55]], i64 [[TMP41]], i64 11 +; OPT-NEXT: [[TMP57:%.*]] = insertelement <15 x i64> [[TMP56]], i64 [[TMP42]], i64 12 +; OPT-NEXT: [[TMP58:%.*]] = insertelement <15 x i64> [[TMP57]], i64 [[TMP43]], i64 13 +; OPT-NEXT: [[TMP59:%.*]] = insertelement <15 x i64> [[TMP58]], i64 [[TMP44]], i64 14 +; OPT-NEXT: store <15 x i64> [[TMP59]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: ret void +; +; NOOPT-LABEL: @phi_v15i64( +; NOOPT-NEXT: entry: +; NOOPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; NOOPT: then: +; NOOPT-NEXT: [[X:%.*]] = insertelement <15 x i64> [[IN:%.*]], i64 42, i32 3 +; NOOPT-NEXT: br label [[FINALLY:%.*]] +; NOOPT: else: +; NOOPT-NEXT: [[Y:%.*]] = insertelement <15 x i64> [[IN]], i64 64, i32 6 +; NOOPT-NEXT: br label [[FINALLY]] +; NOOPT: finally: +; NOOPT-NEXT: [[VAL:%.*]] = phi <15 x i64> [ [[X]], [[THEN]] ], [ [[Y]], [[ELSE]] ] +; NOOPT-NEXT: store <15 x i64> [[VAL]], ptr [[OUT:%.*]], align 1 +; NOOPT-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + %x = insertelement <15 x i64> %in, i64 42, i32 3 + br label %finally +else: + %y = insertelement <15 x i64> %in, i64 64, i32 6 + br label %finally +finally: + %val = phi <15 x i64> [%x, %then], [%y, %else] + store <15 x i64> %val, ptr %out, align 1 + ret void +} + +define amdgpu_kernel void @phi_v27i16(<27 x i16> %in, ptr %out, i1 %cond) { +; OPT-LABEL: @phi_v27i16( +; OPT-NEXT: entry: +; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; OPT: then: +; OPT-NEXT: [[X:%.*]] = insertelement <27 x i16> [[IN:%.*]], i16 42, i32 3 +; OPT-NEXT: [[TMP0:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP1:%.*]] = 
shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP2:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP3:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP4:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP5:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP6:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP7:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP8:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP9:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP10:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP11:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP12:%.*]] = shufflevector <27 x i16> [[X]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP13:%.*]] = extractelement <27 x i16> [[X]], i64 26 +; OPT-NEXT: br label [[FINALLY:%.*]] +; OPT: else: +; OPT-NEXT: [[Y:%.*]] = insertelement <27 x i16> [[IN]], i16 64, i32 6 +; OPT-NEXT: [[TMP14:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP15:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP16:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP17:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP18:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP19:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP20:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP21:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP22:%.*]] = shufflevector 
<27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP23:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP24:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP25:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP26:%.*]] = shufflevector <27 x i16> [[Y]], <27 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP27:%.*]] = extractelement <27 x i16> [[Y]], i64 26 +; OPT-NEXT: br label [[FINALLY]] +; OPT: finally: +; OPT-NEXT: [[TMP28:%.*]] = phi <2 x i16> [ [[TMP0]], [[THEN]] ], [ [[TMP14]], [[ELSE]] ] +; OPT-NEXT: [[TMP29:%.*]] = phi <2 x i16> [ [[TMP1]], [[THEN]] ], [ [[TMP15]], [[ELSE]] ] +; OPT-NEXT: [[TMP30:%.*]] = phi <2 x i16> [ [[TMP2]], [[THEN]] ], [ [[TMP16]], [[ELSE]] ] +; OPT-NEXT: [[TMP31:%.*]] = phi <2 x i16> [ [[TMP3]], [[THEN]] ], [ [[TMP17]], [[ELSE]] ] +; OPT-NEXT: [[TMP32:%.*]] = phi <2 x i16> [ [[TMP4]], [[THEN]] ], [ [[TMP18]], [[ELSE]] ] +; OPT-NEXT: [[TMP33:%.*]] = phi <2 x i16> [ [[TMP5]], [[THEN]] ], [ [[TMP19]], [[ELSE]] ] +; OPT-NEXT: [[TMP34:%.*]] = phi <2 x i16> [ [[TMP6]], [[THEN]] ], [ [[TMP20]], [[ELSE]] ] +; OPT-NEXT: [[TMP35:%.*]] = phi <2 x i16> [ [[TMP7]], [[THEN]] ], [ [[TMP21]], [[ELSE]] ] +; OPT-NEXT: [[TMP36:%.*]] = phi <2 x i16> [ [[TMP8]], [[THEN]] ], [ [[TMP22]], [[ELSE]] ] +; OPT-NEXT: [[TMP37:%.*]] = phi <2 x i16> [ [[TMP9]], [[THEN]] ], [ [[TMP23]], [[ELSE]] ] +; OPT-NEXT: [[TMP38:%.*]] = phi <2 x i16> [ [[TMP10]], [[THEN]] ], [ [[TMP24]], [[ELSE]] ] +; OPT-NEXT: [[TMP39:%.*]] = phi <2 x i16> [ [[TMP11]], [[THEN]] ], [ [[TMP25]], [[ELSE]] ] +; OPT-NEXT: [[TMP40:%.*]] = phi <2 x i16> [ [[TMP12]], [[THEN]] ], [ [[TMP26]], [[ELSE]] ] +; OPT-NEXT: [[TMP41:%.*]] = phi i16 [ [[TMP13]], [[THEN]] ], [ [[TMP27]], [[ELSE]] ] +; OPT-NEXT: [[TMP42:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> poison, <2 x i16> [[TMP28]], i64 0) +; OPT-NEXT: [[TMP43:%.*]] = call <27 x i16> 
@llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP42]], <2 x i16> [[TMP29]], i64 2) +; OPT-NEXT: [[TMP44:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP43]], <2 x i16> [[TMP30]], i64 4) +; OPT-NEXT: [[TMP45:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP44]], <2 x i16> [[TMP31]], i64 6) +; OPT-NEXT: [[TMP46:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP45]], <2 x i16> [[TMP32]], i64 8) +; OPT-NEXT: [[TMP47:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP46]], <2 x i16> [[TMP33]], i64 10) +; OPT-NEXT: [[TMP48:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP47]], <2 x i16> [[TMP34]], i64 12) +; OPT-NEXT: [[TMP49:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP48]], <2 x i16> [[TMP35]], i64 14) +; OPT-NEXT: [[TMP50:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP49]], <2 x i16> [[TMP36]], i64 16) +; OPT-NEXT: [[TMP51:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP50]], <2 x i16> [[TMP37]], i64 18) +; OPT-NEXT: [[TMP52:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP51]], <2 x i16> [[TMP38]], i64 20) +; OPT-NEXT: [[TMP53:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP52]], <2 x i16> [[TMP39]], i64 22) +; OPT-NEXT: [[TMP54:%.*]] = call <27 x i16> @llvm.vector.insert.v27i16.v2i16(<27 x i16> [[TMP53]], <2 x i16> [[TMP40]], i64 24) +; OPT-NEXT: [[TMP55:%.*]] = insertelement <27 x i16> [[TMP54]], i16 [[TMP41]], i64 26 +; OPT-NEXT: store <27 x i16> [[TMP55]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: ret void +; +; NOOPT-LABEL: @phi_v27i16( +; NOOPT-NEXT: entry: +; NOOPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; NOOPT: then: +; NOOPT-NEXT: [[X:%.*]] = insertelement <27 x i16> [[IN:%.*]], i16 42, i32 3 +; NOOPT-NEXT: br label [[FINALLY:%.*]] +; NOOPT: else: +; NOOPT-NEXT: [[Y:%.*]] = insertelement <27 x 
i16> [[IN]], i16 64, i32 6 +; NOOPT-NEXT: br label [[FINALLY]] +; NOOPT: finally: +; NOOPT-NEXT: [[VAL:%.*]] = phi <27 x i16> [ [[X]], [[THEN]] ], [ [[Y]], [[ELSE]] ] +; NOOPT-NEXT: store <27 x i16> [[VAL]], ptr [[OUT:%.*]], align 1 +; NOOPT-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + %x = insertelement <27 x i16> %in, i16 42, i32 3 + br label %finally +else: + %y = insertelement <27 x i16> %in, i16 64, i32 6 + br label %finally +finally: + %val = phi <27 x i16> [%x, %then], [%y, %else] + store <27 x i16> %val, ptr %out, align 1 + ret void +} + + +define amdgpu_kernel void @phi_v23i8(<23 x i8> %in, ptr %out, i1 %cond) { +; OPT-LABEL: @phi_v23i8( +; OPT-NEXT: entry: +; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; OPT: then: +; OPT-NEXT: [[X:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 42, i32 3 +; OPT-NEXT: [[TMP0:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[TMP1:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[TMP2:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[TMP3:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[TMP4:%.*]] = shufflevector <23 x i8> [[X]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[TMP5:%.*]] = extractelement <23 x i8> [[X]], i64 20 +; OPT-NEXT: [[TMP6:%.*]] = extractelement <23 x i8> [[X]], i64 21 +; OPT-NEXT: [[TMP7:%.*]] = extractelement <23 x i8> [[X]], i64 22 +; OPT-NEXT: br label [[FINALLY:%.*]] +; OPT: else: +; OPT-NEXT: [[Y:%.*]] = insertelement <23 x i8> [[IN]], i8 64, i32 6 +; OPT-NEXT: [[TMP8:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[TMP9:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[TMP10:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[TMP11:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[TMP12:%.*]] 
= shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[TMP13:%.*]] = extractelement <23 x i8> [[Y]], i64 20 +; OPT-NEXT: [[TMP14:%.*]] = extractelement <23 x i8> [[Y]], i64 21 +; OPT-NEXT: [[TMP15:%.*]] = extractelement <23 x i8> [[Y]], i64 22 +; OPT-NEXT: br label [[FINALLY]] +; OPT: finally: +; OPT-NEXT: [[TMP16:%.*]] = phi <4 x i8> [ [[TMP0]], [[THEN]] ], [ [[TMP8]], [[ELSE]] ] +; OPT-NEXT: [[TMP17:%.*]] = phi <4 x i8> [ [[TMP1]], [[THEN]] ], [ [[TMP9]], [[ELSE]] ] +; OPT-NEXT: [[TMP18:%.*]] = phi <4 x i8> [ [[TMP2]], [[THEN]] ], [ [[TMP10]], [[ELSE]] ] +; OPT-NEXT: [[TMP19:%.*]] = phi <4 x i8> [ [[TMP3]], [[THEN]] ], [ [[TMP11]], [[ELSE]] ] +; OPT-NEXT: [[TMP20:%.*]] = phi <4 x i8> [ [[TMP4]], [[THEN]] ], [ [[TMP12]], [[ELSE]] ] +; OPT-NEXT: [[TMP21:%.*]] = phi i8 [ [[TMP5]], [[THEN]] ], [ [[TMP13]], [[ELSE]] ] +; OPT-NEXT: [[TMP22:%.*]] = phi i8 [ [[TMP6]], [[THEN]] ], [ [[TMP14]], [[ELSE]] ] +; OPT-NEXT: [[TMP23:%.*]] = phi i8 [ [[TMP7]], [[THEN]] ], [ [[TMP15]], [[ELSE]] ] +; OPT-NEXT: [[TMP24:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP16]], i64 0) +; OPT-NEXT: [[TMP25:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[TMP24]], <4 x i8> [[TMP17]], i64 4) +; OPT-NEXT: [[TMP26:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[TMP25]], <4 x i8> [[TMP18]], i64 8) +; OPT-NEXT: [[TMP27:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[TMP26]], <4 x i8> [[TMP19]], i64 12) +; OPT-NEXT: [[TMP28:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[TMP27]], <4 x i8> [[TMP20]], i64 16) +; OPT-NEXT: [[TMP29:%.*]] = insertelement <23 x i8> [[TMP28]], i8 [[TMP21]], i64 20 +; OPT-NEXT: [[TMP30:%.*]] = insertelement <23 x i8> [[TMP29]], i8 [[TMP22]], i64 21 +; OPT-NEXT: [[TMP31:%.*]] = insertelement <23 x i8> [[TMP30]], i8 [[TMP23]], i64 22 +; OPT-NEXT: store <23 x i8> [[TMP31]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: ret void +; +; NOOPT-LABEL: 
@phi_v23i8( +; NOOPT-NEXT: entry: +; NOOPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; NOOPT: then: +; NOOPT-NEXT: [[X:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 42, i32 3 +; NOOPT-NEXT: br label [[FINALLY:%.*]] +; NOOPT: else: +; NOOPT-NEXT: [[Y:%.*]] = insertelement <23 x i8> [[IN]], i8 64, i32 6 +; NOOPT-NEXT: br label [[FINALLY]] +; NOOPT: finally: +; NOOPT-NEXT: [[VAL:%.*]] = phi <23 x i8> [ [[X]], [[THEN]] ], [ [[Y]], [[ELSE]] ] +; NOOPT-NEXT: store <23 x i8> [[VAL]], ptr [[OUT:%.*]], align 1 +; NOOPT-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + %x = insertelement <23 x i8> %in, i8 42, i32 3 + br label %finally +else: + %y = insertelement <23 x i8> %in, i8 64, i32 6 + br label %finally +finally: + %val = phi <23 x i8> [%x, %then], [%y, %else] + store <23 x i8> %val, ptr %out, align 1 + ret void +} + + +define amdgpu_kernel void @phi_v23i8_zeroinit(<23 x i8> %in, ptr %out, i1 %cond) { +; OPT-LABEL: @phi_v23i8_zeroinit( +; OPT-NEXT: entry: +; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; OPT: then: +; OPT-NEXT: br label [[FINALLY:%.*]] +; OPT: else: +; OPT-NEXT: [[Y:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 64, i32 6 +; OPT-NEXT: [[TMP0:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[TMP1:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[TMP2:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[TMP3:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[TMP4:%.*]] = shufflevector <23 x i8> [[Y]], <23 x i8> poison, <4 x i32> +; OPT-NEXT: [[TMP5:%.*]] = extractelement <23 x i8> [[Y]], i64 20 +; OPT-NEXT: [[TMP6:%.*]] = extractelement <23 x i8> [[Y]], i64 21 +; OPT-NEXT: [[TMP7:%.*]] = extractelement <23 x i8> [[Y]], i64 22 +; OPT-NEXT: br label [[FINALLY]] +; OPT: finally: +; OPT-NEXT: [[TMP8:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[TMP0]], 
[[ELSE]] ] +; OPT-NEXT: [[TMP9:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[TMP1]], [[ELSE]] ] +; OPT-NEXT: [[TMP10:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[TMP2]], [[ELSE]] ] +; OPT-NEXT: [[TMP11:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[TMP3]], [[ELSE]] ] +; OPT-NEXT: [[TMP12:%.*]] = phi <4 x i8> [ zeroinitializer, [[THEN]] ], [ [[TMP4]], [[ELSE]] ] +; OPT-NEXT: [[TMP13:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[TMP5]], [[ELSE]] ] +; OPT-NEXT: [[TMP14:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[TMP6]], [[ELSE]] ] +; OPT-NEXT: [[TMP15:%.*]] = phi i8 [ 0, [[THEN]] ], [ [[TMP7]], [[ELSE]] ] +; OPT-NEXT: [[TMP16:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> poison, <4 x i8> [[TMP8]], i64 0) +; OPT-NEXT: [[TMP17:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[TMP16]], <4 x i8> [[TMP9]], i64 4) +; OPT-NEXT: [[TMP18:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[TMP17]], <4 x i8> [[TMP10]], i64 8) +; OPT-NEXT: [[TMP19:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[TMP18]], <4 x i8> [[TMP11]], i64 12) +; OPT-NEXT: [[TMP20:%.*]] = call <23 x i8> @llvm.vector.insert.v23i8.v4i8(<23 x i8> [[TMP19]], <4 x i8> [[TMP12]], i64 16) +; OPT-NEXT: [[TMP21:%.*]] = insertelement <23 x i8> [[TMP20]], i8 [[TMP13]], i64 20 +; OPT-NEXT: [[TMP22:%.*]] = insertelement <23 x i8> [[TMP21]], i8 [[TMP14]], i64 21 +; OPT-NEXT: [[TMP23:%.*]] = insertelement <23 x i8> [[TMP22]], i8 [[TMP15]], i64 22 +; OPT-NEXT: store <23 x i8> [[TMP23]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: ret void +; +; NOOPT-LABEL: @phi_v23i8_zeroinit( +; NOOPT-NEXT: entry: +; NOOPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; NOOPT: then: +; NOOPT-NEXT: br label [[FINALLY:%.*]] +; NOOPT: else: +; NOOPT-NEXT: [[Y:%.*]] = insertelement <23 x i8> [[IN:%.*]], i8 64, i32 6 +; NOOPT-NEXT: br label [[FINALLY]] +; NOOPT: finally: +; NOOPT-NEXT: [[VAL:%.*]] = phi <23 x i8> [ zeroinitializer, [[THEN]] ], [ [[Y]], 
[[ELSE]] ] +; NOOPT-NEXT: store <23 x i8> [[VAL]], ptr [[OUT:%.*]], align 1 +; NOOPT-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + br label %finally +else: + %y = insertelement <23 x i8> %in, i8 64, i32 6 + br label %finally +finally: + %val = phi <23 x i8> [zeroinitializer, %then], [%y, %else] + store <23 x i8> %val, ptr %out, align 1 + ret void +} + +define amdgpu_kernel void @phi_v15i8_random_constant_init(<15 x i8> %in, ptr %out, i1 %cond) { +; OPT-LABEL: @phi_v15i8_random_constant_init( +; OPT-NEXT: entry: +; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; OPT: then: +; OPT-NEXT: br label [[FINALLY:%.*]] +; OPT: else: +; OPT-NEXT: [[Y:%.*]] = insertelement <15 x i8> [[IN:%.*]], i8 64, i32 6 +; OPT-NEXT: [[TMP0:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> +; OPT-NEXT: [[TMP1:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> +; OPT-NEXT: [[TMP2:%.*]] = shufflevector <15 x i8> [[Y]], <15 x i8> poison, <4 x i32> +; OPT-NEXT: [[TMP3:%.*]] = extractelement <15 x i8> [[Y]], i64 12 +; OPT-NEXT: [[TMP4:%.*]] = extractelement <15 x i8> [[Y]], i64 13 +; OPT-NEXT: [[TMP5:%.*]] = extractelement <15 x i8> [[Y]], i64 14 +; OPT-NEXT: br label [[FINALLY]] +; OPT: finally: +; OPT-NEXT: [[TMP6:%.*]] = phi <4 x i8> [ , [[THEN]] ], [ [[TMP0]], [[ELSE]] ] +; OPT-NEXT: [[TMP7:%.*]] = phi <4 x i8> [ , [[THEN]] ], [ [[TMP1]], [[ELSE]] ] +; OPT-NEXT: [[TMP8:%.*]] = phi <4 x i8> [ , [[THEN]] ], [ [[TMP2]], [[ELSE]] ] +; OPT-NEXT: [[TMP9:%.*]] = phi i8 [ 13, [[THEN]] ], [ [[TMP3]], [[ELSE]] ] +; OPT-NEXT: [[TMP10:%.*]] = phi i8 [ 14, [[THEN]] ], [ [[TMP4]], [[ELSE]] ] +; OPT-NEXT: [[TMP11:%.*]] = phi i8 [ undef, [[THEN]] ], [ [[TMP5]], [[ELSE]] ] +; OPT-NEXT: [[TMP12:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> poison, <4 x i8> [[TMP6]], i64 0) +; OPT-NEXT: [[TMP13:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[TMP12]], <4 x i8> [[TMP7]], i64 4) +; OPT-NEXT: 
[[TMP14:%.*]] = call <15 x i8> @llvm.vector.insert.v15i8.v4i8(<15 x i8> [[TMP13]], <4 x i8> [[TMP8]], i64 8) +; OPT-NEXT: [[TMP15:%.*]] = insertelement <15 x i8> [[TMP14]], i8 [[TMP9]], i64 12 +; OPT-NEXT: [[TMP16:%.*]] = insertelement <15 x i8> [[TMP15]], i8 [[TMP10]], i64 13 +; OPT-NEXT: [[TMP17:%.*]] = insertelement <15 x i8> [[TMP16]], i8 [[TMP11]], i64 14 +; OPT-NEXT: store <15 x i8> [[TMP17]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: ret void +; +; NOOPT-LABEL: @phi_v15i8_random_constant_init( +; NOOPT-NEXT: entry: +; NOOPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; NOOPT: then: +; NOOPT-NEXT: br label [[FINALLY:%.*]] +; NOOPT: else: +; NOOPT-NEXT: [[Y:%.*]] = insertelement <15 x i8> [[IN:%.*]], i8 64, i32 6 +; NOOPT-NEXT: br label [[FINALLY]] +; NOOPT: finally: +; NOOPT-NEXT: [[VAL:%.*]] = phi <15 x i8> [ , [[THEN]] ], [ [[Y]], [[ELSE]] ] +; NOOPT-NEXT: store <15 x i8> [[VAL]], ptr [[OUT:%.*]], align 1 +; NOOPT-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + br label %finally +else: + %y = insertelement <15 x i8> %in, i8 64, i32 6 + br label %finally +finally: + %val = phi <15 x i8> [, %then], [%y, %else] + store <15 x i8> %val, ptr %out, align 1 + ret void +} + +define amdgpu_kernel void @phi_v23i32(<23 x i32> %in, ptr %out, i1 %cond) { +; OPT-LABEL: @phi_v23i32( +; OPT-NEXT: entry: +; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; OPT: then: +; OPT-NEXT: [[X:%.*]] = insertelement <23 x i32> [[IN:%.*]], i32 42, i32 3 +; OPT-NEXT: [[TMP0:%.*]] = extractelement <23 x i32> [[X]], i64 0 +; OPT-NEXT: [[TMP1:%.*]] = extractelement <23 x i32> [[X]], i64 1 +; OPT-NEXT: [[TMP2:%.*]] = extractelement <23 x i32> [[X]], i64 2 +; OPT-NEXT: [[TMP3:%.*]] = extractelement <23 x i32> [[X]], i64 3 +; OPT-NEXT: [[TMP4:%.*]] = extractelement <23 x i32> [[X]], i64 4 +; OPT-NEXT: [[TMP5:%.*]] = extractelement <23 x i32> [[X]], i64 5 +; OPT-NEXT: [[TMP6:%.*]] = extractelement <23 x i32> [[X]], i64 6 +; 
OPT-NEXT: [[TMP7:%.*]] = extractelement <23 x i32> [[X]], i64 7 +; OPT-NEXT: [[TMP8:%.*]] = extractelement <23 x i32> [[X]], i64 8 +; OPT-NEXT: [[TMP9:%.*]] = extractelement <23 x i32> [[X]], i64 9 +; OPT-NEXT: [[TMP10:%.*]] = extractelement <23 x i32> [[X]], i64 10 +; OPT-NEXT: [[TMP11:%.*]] = extractelement <23 x i32> [[X]], i64 11 +; OPT-NEXT: [[TMP12:%.*]] = extractelement <23 x i32> [[X]], i64 12 +; OPT-NEXT: [[TMP13:%.*]] = extractelement <23 x i32> [[X]], i64 13 +; OPT-NEXT: [[TMP14:%.*]] = extractelement <23 x i32> [[X]], i64 14 +; OPT-NEXT: [[TMP15:%.*]] = extractelement <23 x i32> [[X]], i64 15 +; OPT-NEXT: [[TMP16:%.*]] = extractelement <23 x i32> [[X]], i64 16 +; OPT-NEXT: [[TMP17:%.*]] = extractelement <23 x i32> [[X]], i64 17 +; OPT-NEXT: [[TMP18:%.*]] = extractelement <23 x i32> [[X]], i64 18 +; OPT-NEXT: [[TMP19:%.*]] = extractelement <23 x i32> [[X]], i64 19 +; OPT-NEXT: [[TMP20:%.*]] = extractelement <23 x i32> [[X]], i64 20 +; OPT-NEXT: [[TMP21:%.*]] = extractelement <23 x i32> [[X]], i64 21 +; OPT-NEXT: [[TMP22:%.*]] = extractelement <23 x i32> [[X]], i64 22 +; OPT-NEXT: br label [[FINALLY:%.*]] +; OPT: else: +; OPT-NEXT: [[Y:%.*]] = insertelement <23 x i32> [[IN]], i32 64, i32 6 +; OPT-NEXT: [[TMP23:%.*]] = extractelement <23 x i32> [[Y]], i64 0 +; OPT-NEXT: [[TMP24:%.*]] = extractelement <23 x i32> [[Y]], i64 1 +; OPT-NEXT: [[TMP25:%.*]] = extractelement <23 x i32> [[Y]], i64 2 +; OPT-NEXT: [[TMP26:%.*]] = extractelement <23 x i32> [[Y]], i64 3 +; OPT-NEXT: [[TMP27:%.*]] = extractelement <23 x i32> [[Y]], i64 4 +; OPT-NEXT: [[TMP28:%.*]] = extractelement <23 x i32> [[Y]], i64 5 +; OPT-NEXT: [[TMP29:%.*]] = extractelement <23 x i32> [[Y]], i64 6 +; OPT-NEXT: [[TMP30:%.*]] = extractelement <23 x i32> [[Y]], i64 7 +; OPT-NEXT: [[TMP31:%.*]] = extractelement <23 x i32> [[Y]], i64 8 +; OPT-NEXT: [[TMP32:%.*]] = extractelement <23 x i32> [[Y]], i64 9 +; OPT-NEXT: [[TMP33:%.*]] = extractelement <23 x i32> [[Y]], i64 10 +; OPT-NEXT: [[TMP34:%.*]] = 
extractelement <23 x i32> [[Y]], i64 11 +; OPT-NEXT: [[TMP35:%.*]] = extractelement <23 x i32> [[Y]], i64 12 +; OPT-NEXT: [[TMP36:%.*]] = extractelement <23 x i32> [[Y]], i64 13 +; OPT-NEXT: [[TMP37:%.*]] = extractelement <23 x i32> [[Y]], i64 14 +; OPT-NEXT: [[TMP38:%.*]] = extractelement <23 x i32> [[Y]], i64 15 +; OPT-NEXT: [[TMP39:%.*]] = extractelement <23 x i32> [[Y]], i64 16 +; OPT-NEXT: [[TMP40:%.*]] = extractelement <23 x i32> [[Y]], i64 17 +; OPT-NEXT: [[TMP41:%.*]] = extractelement <23 x i32> [[Y]], i64 18 +; OPT-NEXT: [[TMP42:%.*]] = extractelement <23 x i32> [[Y]], i64 19 +; OPT-NEXT: [[TMP43:%.*]] = extractelement <23 x i32> [[Y]], i64 20 +; OPT-NEXT: [[TMP44:%.*]] = extractelement <23 x i32> [[Y]], i64 21 +; OPT-NEXT: [[TMP45:%.*]] = extractelement <23 x i32> [[Y]], i64 22 +; OPT-NEXT: br label [[FINALLY]] +; OPT: finally: +; OPT-NEXT: [[TMP46:%.*]] = phi i32 [ [[TMP0]], [[THEN]] ], [ [[TMP23]], [[ELSE]] ] +; OPT-NEXT: [[TMP47:%.*]] = phi i32 [ [[TMP1]], [[THEN]] ], [ [[TMP24]], [[ELSE]] ] +; OPT-NEXT: [[TMP48:%.*]] = phi i32 [ [[TMP2]], [[THEN]] ], [ [[TMP25]], [[ELSE]] ] +; OPT-NEXT: [[TMP49:%.*]] = phi i32 [ [[TMP3]], [[THEN]] ], [ [[TMP26]], [[ELSE]] ] +; OPT-NEXT: [[TMP50:%.*]] = phi i32 [ [[TMP4]], [[THEN]] ], [ [[TMP27]], [[ELSE]] ] +; OPT-NEXT: [[TMP51:%.*]] = phi i32 [ [[TMP5]], [[THEN]] ], [ [[TMP28]], [[ELSE]] ] +; OPT-NEXT: [[TMP52:%.*]] = phi i32 [ [[TMP6]], [[THEN]] ], [ [[TMP29]], [[ELSE]] ] +; OPT-NEXT: [[TMP53:%.*]] = phi i32 [ [[TMP7]], [[THEN]] ], [ [[TMP30]], [[ELSE]] ] +; OPT-NEXT: [[TMP54:%.*]] = phi i32 [ [[TMP8]], [[THEN]] ], [ [[TMP31]], [[ELSE]] ] +; OPT-NEXT: [[TMP55:%.*]] = phi i32 [ [[TMP9]], [[THEN]] ], [ [[TMP32]], [[ELSE]] ] +; OPT-NEXT: [[TMP56:%.*]] = phi i32 [ [[TMP10]], [[THEN]] ], [ [[TMP33]], [[ELSE]] ] +; OPT-NEXT: [[TMP57:%.*]] = phi i32 [ [[TMP11]], [[THEN]] ], [ [[TMP34]], [[ELSE]] ] +; OPT-NEXT: [[TMP58:%.*]] = phi i32 [ [[TMP12]], [[THEN]] ], [ [[TMP35]], [[ELSE]] ] +; OPT-NEXT: [[TMP59:%.*]] = phi i32 [ 
[[TMP13]], [[THEN]] ], [ [[TMP36]], [[ELSE]] ] +; OPT-NEXT: [[TMP60:%.*]] = phi i32 [ [[TMP14]], [[THEN]] ], [ [[TMP37]], [[ELSE]] ] +; OPT-NEXT: [[TMP61:%.*]] = phi i32 [ [[TMP15]], [[THEN]] ], [ [[TMP38]], [[ELSE]] ] +; OPT-NEXT: [[TMP62:%.*]] = phi i32 [ [[TMP16]], [[THEN]] ], [ [[TMP39]], [[ELSE]] ] +; OPT-NEXT: [[TMP63:%.*]] = phi i32 [ [[TMP17]], [[THEN]] ], [ [[TMP40]], [[ELSE]] ] +; OPT-NEXT: [[TMP64:%.*]] = phi i32 [ [[TMP18]], [[THEN]] ], [ [[TMP41]], [[ELSE]] ] +; OPT-NEXT: [[TMP65:%.*]] = phi i32 [ [[TMP19]], [[THEN]] ], [ [[TMP42]], [[ELSE]] ] +; OPT-NEXT: [[TMP66:%.*]] = phi i32 [ [[TMP20]], [[THEN]] ], [ [[TMP43]], [[ELSE]] ] +; OPT-NEXT: [[TMP67:%.*]] = phi i32 [ [[TMP21]], [[THEN]] ], [ [[TMP44]], [[ELSE]] ] +; OPT-NEXT: [[TMP68:%.*]] = phi i32 [ [[TMP22]], [[THEN]] ], [ [[TMP45]], [[ELSE]] ] +; OPT-NEXT: [[TMP69:%.*]] = insertelement <23 x i32> poison, i32 [[TMP46]], i64 0 +; OPT-NEXT: [[TMP70:%.*]] = insertelement <23 x i32> [[TMP69]], i32 [[TMP47]], i64 1 +; OPT-NEXT: [[TMP71:%.*]] = insertelement <23 x i32> [[TMP70]], i32 [[TMP48]], i64 2 +; OPT-NEXT: [[TMP72:%.*]] = insertelement <23 x i32> [[TMP71]], i32 [[TMP49]], i64 3 +; OPT-NEXT: [[TMP73:%.*]] = insertelement <23 x i32> [[TMP72]], i32 [[TMP50]], i64 4 +; OPT-NEXT: [[TMP74:%.*]] = insertelement <23 x i32> [[TMP73]], i32 [[TMP51]], i64 5 +; OPT-NEXT: [[TMP75:%.*]] = insertelement <23 x i32> [[TMP74]], i32 [[TMP52]], i64 6 +; OPT-NEXT: [[TMP76:%.*]] = insertelement <23 x i32> [[TMP75]], i32 [[TMP53]], i64 7 +; OPT-NEXT: [[TMP77:%.*]] = insertelement <23 x i32> [[TMP76]], i32 [[TMP54]], i64 8 +; OPT-NEXT: [[TMP78:%.*]] = insertelement <23 x i32> [[TMP77]], i32 [[TMP55]], i64 9 +; OPT-NEXT: [[TMP79:%.*]] = insertelement <23 x i32> [[TMP78]], i32 [[TMP56]], i64 10 +; OPT-NEXT: [[TMP80:%.*]] = insertelement <23 x i32> [[TMP79]], i32 [[TMP57]], i64 11 +; OPT-NEXT: [[TMP81:%.*]] = insertelement <23 x i32> [[TMP80]], i32 [[TMP58]], i64 12 +; OPT-NEXT: [[TMP82:%.*]] = insertelement <23 x i32> 
[[TMP81]], i32 [[TMP59]], i64 13 +; OPT-NEXT: [[TMP83:%.*]] = insertelement <23 x i32> [[TMP82]], i32 [[TMP60]], i64 14 +; OPT-NEXT: [[TMP84:%.*]] = insertelement <23 x i32> [[TMP83]], i32 [[TMP61]], i64 15 +; OPT-NEXT: [[TMP85:%.*]] = insertelement <23 x i32> [[TMP84]], i32 [[TMP62]], i64 16 +; OPT-NEXT: [[TMP86:%.*]] = insertelement <23 x i32> [[TMP85]], i32 [[TMP63]], i64 17 +; OPT-NEXT: [[TMP87:%.*]] = insertelement <23 x i32> [[TMP86]], i32 [[TMP64]], i64 18 +; OPT-NEXT: [[TMP88:%.*]] = insertelement <23 x i32> [[TMP87]], i32 [[TMP65]], i64 19 +; OPT-NEXT: [[TMP89:%.*]] = insertelement <23 x i32> [[TMP88]], i32 [[TMP66]], i64 20 +; OPT-NEXT: [[TMP90:%.*]] = insertelement <23 x i32> [[TMP89]], i32 [[TMP67]], i64 21 +; OPT-NEXT: [[TMP91:%.*]] = insertelement <23 x i32> [[TMP90]], i32 [[TMP68]], i64 22 +; OPT-NEXT: store <23 x i32> [[TMP91]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: ret void +; +; NOOPT-LABEL: @phi_v23i32( +; NOOPT-NEXT: entry: +; NOOPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; NOOPT: then: +; NOOPT-NEXT: [[X:%.*]] = insertelement <23 x i32> [[IN:%.*]], i32 42, i32 3 +; NOOPT-NEXT: br label [[FINALLY:%.*]] +; NOOPT: else: +; NOOPT-NEXT: [[Y:%.*]] = insertelement <23 x i32> [[IN]], i32 64, i32 6 +; NOOPT-NEXT: br label [[FINALLY]] +; NOOPT: finally: +; NOOPT-NEXT: [[VAL:%.*]] = phi <23 x i32> [ [[X]], [[THEN]] ], [ [[Y]], [[ELSE]] ] +; NOOPT-NEXT: store <23 x i32> [[VAL]], ptr [[OUT:%.*]], align 1 +; NOOPT-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + %x = insertelement <23 x i32> %in, i32 42, i32 3 + br label %finally +else: + %y = insertelement <23 x i32> %in, i32 64, i32 6 + br label %finally +finally: + %val = phi <23 x i32> [%x, %then], [%y, %else] + store <23 x i32> %val, ptr %out, align 1 + ret void +} + +define amdgpu_kernel void @phi_v16i64(<16 x i64> %in, ptr %out, i1 %cond) { +; OPT-LABEL: @phi_v16i64( +; OPT-NEXT: entry: +; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label 
[[ELSE:%.*]] +; OPT: then: +; OPT-NEXT: [[X:%.*]] = insertelement <16 x i64> [[IN:%.*]], i64 42, i32 3 +; OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i64> [[X]], i64 0 +; OPT-NEXT: [[TMP1:%.*]] = extractelement <16 x i64> [[X]], i64 1 +; OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i64> [[X]], i64 2 +; OPT-NEXT: [[TMP3:%.*]] = extractelement <16 x i64> [[X]], i64 3 +; OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i64> [[X]], i64 4 +; OPT-NEXT: [[TMP5:%.*]] = extractelement <16 x i64> [[X]], i64 5 +; OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i64> [[X]], i64 6 +; OPT-NEXT: [[TMP7:%.*]] = extractelement <16 x i64> [[X]], i64 7 +; OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i64> [[X]], i64 8 +; OPT-NEXT: [[TMP9:%.*]] = extractelement <16 x i64> [[X]], i64 9 +; OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i64> [[X]], i64 10 +; OPT-NEXT: [[TMP11:%.*]] = extractelement <16 x i64> [[X]], i64 11 +; OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i64> [[X]], i64 12 +; OPT-NEXT: [[TMP13:%.*]] = extractelement <16 x i64> [[X]], i64 13 +; OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i64> [[X]], i64 14 +; OPT-NEXT: [[TMP15:%.*]] = extractelement <16 x i64> [[X]], i64 15 +; OPT-NEXT: br label [[FINALLY:%.*]] +; OPT: else: +; OPT-NEXT: [[Y:%.*]] = insertelement <16 x i64> [[IN]], i64 64, i32 6 +; OPT-NEXT: [[TMP16:%.*]] = extractelement <16 x i64> [[Y]], i64 0 +; OPT-NEXT: [[TMP17:%.*]] = extractelement <16 x i64> [[Y]], i64 1 +; OPT-NEXT: [[TMP18:%.*]] = extractelement <16 x i64> [[Y]], i64 2 +; OPT-NEXT: [[TMP19:%.*]] = extractelement <16 x i64> [[Y]], i64 3 +; OPT-NEXT: [[TMP20:%.*]] = extractelement <16 x i64> [[Y]], i64 4 +; OPT-NEXT: [[TMP21:%.*]] = extractelement <16 x i64> [[Y]], i64 5 +; OPT-NEXT: [[TMP22:%.*]] = extractelement <16 x i64> [[Y]], i64 6 +; OPT-NEXT: [[TMP23:%.*]] = extractelement <16 x i64> [[Y]], i64 7 +; OPT-NEXT: [[TMP24:%.*]] = extractelement <16 x i64> [[Y]], i64 8 +; OPT-NEXT: [[TMP25:%.*]] = extractelement <16 x i64> [[Y]], i64 9 +; 
OPT-NEXT: [[TMP26:%.*]] = extractelement <16 x i64> [[Y]], i64 10 +; OPT-NEXT: [[TMP27:%.*]] = extractelement <16 x i64> [[Y]], i64 11 +; OPT-NEXT: [[TMP28:%.*]] = extractelement <16 x i64> [[Y]], i64 12 +; OPT-NEXT: [[TMP29:%.*]] = extractelement <16 x i64> [[Y]], i64 13 +; OPT-NEXT: [[TMP30:%.*]] = extractelement <16 x i64> [[Y]], i64 14 +; OPT-NEXT: [[TMP31:%.*]] = extractelement <16 x i64> [[Y]], i64 15 +; OPT-NEXT: br label [[FINALLY]] +; OPT: finally: +; OPT-NEXT: [[TMP32:%.*]] = phi i64 [ [[TMP0]], [[THEN]] ], [ [[TMP16]], [[ELSE]] ] +; OPT-NEXT: [[TMP33:%.*]] = phi i64 [ [[TMP1]], [[THEN]] ], [ [[TMP17]], [[ELSE]] ] +; OPT-NEXT: [[TMP34:%.*]] = phi i64 [ [[TMP2]], [[THEN]] ], [ [[TMP18]], [[ELSE]] ] +; OPT-NEXT: [[TMP35:%.*]] = phi i64 [ [[TMP3]], [[THEN]] ], [ [[TMP19]], [[ELSE]] ] +; OPT-NEXT: [[TMP36:%.*]] = phi i64 [ [[TMP4]], [[THEN]] ], [ [[TMP20]], [[ELSE]] ] +; OPT-NEXT: [[TMP37:%.*]] = phi i64 [ [[TMP5]], [[THEN]] ], [ [[TMP21]], [[ELSE]] ] +; OPT-NEXT: [[TMP38:%.*]] = phi i64 [ [[TMP6]], [[THEN]] ], [ [[TMP22]], [[ELSE]] ] +; OPT-NEXT: [[TMP39:%.*]] = phi i64 [ [[TMP7]], [[THEN]] ], [ [[TMP23]], [[ELSE]] ] +; OPT-NEXT: [[TMP40:%.*]] = phi i64 [ [[TMP8]], [[THEN]] ], [ [[TMP24]], [[ELSE]] ] +; OPT-NEXT: [[TMP41:%.*]] = phi i64 [ [[TMP9]], [[THEN]] ], [ [[TMP25]], [[ELSE]] ] +; OPT-NEXT: [[TMP42:%.*]] = phi i64 [ [[TMP10]], [[THEN]] ], [ [[TMP26]], [[ELSE]] ] +; OPT-NEXT: [[TMP43:%.*]] = phi i64 [ [[TMP11]], [[THEN]] ], [ [[TMP27]], [[ELSE]] ] +; OPT-NEXT: [[TMP44:%.*]] = phi i64 [ [[TMP12]], [[THEN]] ], [ [[TMP28]], [[ELSE]] ] +; OPT-NEXT: [[TMP45:%.*]] = phi i64 [ [[TMP13]], [[THEN]] ], [ [[TMP29]], [[ELSE]] ] +; OPT-NEXT: [[TMP46:%.*]] = phi i64 [ [[TMP14]], [[THEN]] ], [ [[TMP30]], [[ELSE]] ] +; OPT-NEXT: [[TMP47:%.*]] = phi i64 [ [[TMP15]], [[THEN]] ], [ [[TMP31]], [[ELSE]] ] +; OPT-NEXT: [[TMP48:%.*]] = insertelement <16 x i64> poison, i64 [[TMP32]], i64 0 +; OPT-NEXT: [[TMP49:%.*]] = insertelement <16 x i64> [[TMP48]], i64 [[TMP33]], i64 1 +; 
OPT-NEXT: [[TMP50:%.*]] = insertelement <16 x i64> [[TMP49]], i64 [[TMP34]], i64 2 +; OPT-NEXT: [[TMP51:%.*]] = insertelement <16 x i64> [[TMP50]], i64 [[TMP35]], i64 3 +; OPT-NEXT: [[TMP52:%.*]] = insertelement <16 x i64> [[TMP51]], i64 [[TMP36]], i64 4 +; OPT-NEXT: [[TMP53:%.*]] = insertelement <16 x i64> [[TMP52]], i64 [[TMP37]], i64 5 +; OPT-NEXT: [[TMP54:%.*]] = insertelement <16 x i64> [[TMP53]], i64 [[TMP38]], i64 6 +; OPT-NEXT: [[TMP55:%.*]] = insertelement <16 x i64> [[TMP54]], i64 [[TMP39]], i64 7 +; OPT-NEXT: [[TMP56:%.*]] = insertelement <16 x i64> [[TMP55]], i64 [[TMP40]], i64 8 +; OPT-NEXT: [[TMP57:%.*]] = insertelement <16 x i64> [[TMP56]], i64 [[TMP41]], i64 9 +; OPT-NEXT: [[TMP58:%.*]] = insertelement <16 x i64> [[TMP57]], i64 [[TMP42]], i64 10 +; OPT-NEXT: [[TMP59:%.*]] = insertelement <16 x i64> [[TMP58]], i64 [[TMP43]], i64 11 +; OPT-NEXT: [[TMP60:%.*]] = insertelement <16 x i64> [[TMP59]], i64 [[TMP44]], i64 12 +; OPT-NEXT: [[TMP61:%.*]] = insertelement <16 x i64> [[TMP60]], i64 [[TMP45]], i64 13 +; OPT-NEXT: [[TMP62:%.*]] = insertelement <16 x i64> [[TMP61]], i64 [[TMP46]], i64 14 +; OPT-NEXT: [[TMP63:%.*]] = insertelement <16 x i64> [[TMP62]], i64 [[TMP47]], i64 15 +; OPT-NEXT: store <16 x i64> [[TMP63]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: ret void +; +; NOOPT-LABEL: @phi_v16i64( +; NOOPT-NEXT: entry: +; NOOPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; NOOPT: then: +; NOOPT-NEXT: [[X:%.*]] = insertelement <16 x i64> [[IN:%.*]], i64 42, i32 3 +; NOOPT-NEXT: br label [[FINALLY:%.*]] +; NOOPT: else: +; NOOPT-NEXT: [[Y:%.*]] = insertelement <16 x i64> [[IN]], i64 64, i32 6 +; NOOPT-NEXT: br label [[FINALLY]] +; NOOPT: finally: +; NOOPT-NEXT: [[VAL:%.*]] = phi <16 x i64> [ [[X]], [[THEN]] ], [ [[Y]], [[ELSE]] ] +; NOOPT-NEXT: store <16 x i64> [[VAL]], ptr [[OUT:%.*]], align 1 +; NOOPT-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + %x = insertelement <16 x i64> %in, i64 42, i32 3 + br label 
%finally +else: + %y = insertelement <16 x i64> %in, i64 64, i32 6 + br label %finally +finally: + %val = phi <16 x i64> [%x, %then], [%y, %else] + store <16 x i64> %val, ptr %out, align 1 + ret void +} + +define amdgpu_kernel void @phi_v7i16(<7 x i16> %in, ptr %out, i1 %cond) { +; OPT-LABEL: @phi_v7i16( +; OPT-NEXT: entry: +; OPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; OPT: then: +; OPT-NEXT: [[X:%.*]] = insertelement <7 x i16> [[IN:%.*]], i16 3, i32 3 +; OPT-NEXT: [[TMP0:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP1:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP2:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP3:%.*]] = extractelement <7 x i16> [[X]], i64 6 +; OPT-NEXT: br label [[FINALLY:%.*]] +; OPT: else: +; OPT-NEXT: [[Y:%.*]] = insertelement <7 x i16> [[IN]], i16 9, i32 6 +; OPT-NEXT: [[TMP4:%.*]] = shufflevector <7 x i16> [[Y]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP5:%.*]] = shufflevector <7 x i16> [[Y]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP6:%.*]] = shufflevector <7 x i16> [[Y]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP7:%.*]] = extractelement <7 x i16> [[Y]], i64 6 +; OPT-NEXT: br label [[FINALLY]] +; OPT: finally: +; OPT-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ [[TMP0]], [[THEN]] ], [ [[TMP4]], [[ELSE]] ] +; OPT-NEXT: [[TMP9:%.*]] = phi <2 x i16> [ [[TMP1]], [[THEN]] ], [ [[TMP5]], [[ELSE]] ] +; OPT-NEXT: [[TMP10:%.*]] = phi <2 x i16> [ [[TMP2]], [[THEN]] ], [ [[TMP6]], [[ELSE]] ] +; OPT-NEXT: [[TMP11:%.*]] = phi i16 [ [[TMP3]], [[THEN]] ], [ [[TMP7]], [[ELSE]] ] +; OPT-NEXT: [[TMP12:%.*]] = call <7 x i16> @llvm.vector.insert.v7i16.v2i16(<7 x i16> poison, <2 x i16> [[TMP8]], i64 0) +; OPT-NEXT: [[TMP13:%.*]] = call <7 x i16> @llvm.vector.insert.v7i16.v2i16(<7 x i16> [[TMP12]], <2 x i16> [[TMP9]], i64 2) +; OPT-NEXT: [[TMP14:%.*]] = call <7 x i16> @llvm.vector.insert.v7i16.v2i16(<7 x i16> 
[[TMP13]], <2 x i16> [[TMP10]], i64 4) +; OPT-NEXT: [[TMP15:%.*]] = insertelement <7 x i16> [[TMP14]], i16 [[TMP11]], i64 6 +; OPT-NEXT: store <7 x i16> [[TMP15]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: ret void +; +; NOOPT-LABEL: @phi_v7i16( +; NOOPT-NEXT: entry: +; NOOPT-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; NOOPT: then: +; NOOPT-NEXT: [[X:%.*]] = insertelement <7 x i16> [[IN:%.*]], i16 3, i32 3 +; NOOPT-NEXT: br label [[FINALLY:%.*]] +; NOOPT: else: +; NOOPT-NEXT: [[Y:%.*]] = insertelement <7 x i16> [[IN]], i16 9, i32 6 +; NOOPT-NEXT: br label [[FINALLY]] +; NOOPT: finally: +; NOOPT-NEXT: [[VAL:%.*]] = phi <7 x i16> [ [[X]], [[THEN]] ], [ [[Y]], [[ELSE]] ] +; NOOPT-NEXT: store <7 x i16> [[VAL]], ptr [[OUT:%.*]], align 1 +; NOOPT-NEXT: ret void +; +entry: + br i1 %cond, label %then, label %else +then: + %x = insertelement <7 x i16> %in, i16 3, i32 3 + br label %finally +else: + %y = insertelement <7 x i16> %in, i16 9, i32 6 + br label %finally +finally: + %val = phi <7 x i16> [%x, %then], [%y, %else] + store <7 x i16> %val, ptr %out, align 1 + ret void +} + + +define amdgpu_kernel void @phi_v7i16_switch(<7 x i16> %in, ptr %out, i8 %cond) { +; OPT-LABEL: @phi_v7i16_switch( +; OPT-NEXT: entry: +; OPT-NEXT: [[X:%.*]] = insertelement <7 x i16> [[IN:%.*]], i16 3, i32 3 +; OPT-NEXT: switch i8 [[COND:%.*]], label [[ELSE:%.*]] [ +; OPT-NEXT: i8 0, label [[THEN_1:%.*]] +; OPT-NEXT: i8 3, label [[THEN_2:%.*]] +; OPT-NEXT: ] +; OPT: then.1: +; OPT-NEXT: [[TMP0:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP1:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP2:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP3:%.*]] = extractelement <7 x i16> [[X]], i64 6 +; OPT-NEXT: br label [[FINALLY:%.*]] +; OPT: then.2: +; OPT-NEXT: [[TMP4:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP5:%.*]] = shufflevector <7 x 
i16> [[X]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP6:%.*]] = shufflevector <7 x i16> [[X]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP7:%.*]] = extractelement <7 x i16> [[X]], i64 6 +; OPT-NEXT: br label [[FINALLY]] +; OPT: else: +; OPT-NEXT: [[Y:%.*]] = insertelement <7 x i16> [[IN]], i16 9, i32 6 +; OPT-NEXT: [[TMP8:%.*]] = shufflevector <7 x i16> [[Y]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP9:%.*]] = shufflevector <7 x i16> [[Y]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP10:%.*]] = shufflevector <7 x i16> [[Y]], <7 x i16> poison, <2 x i32> +; OPT-NEXT: [[TMP11:%.*]] = extractelement <7 x i16> [[Y]], i64 6 +; OPT-NEXT: br label [[FINALLY]] +; OPT: finally: +; OPT-NEXT: [[TMP12:%.*]] = phi <2 x i16> [ [[TMP0]], [[THEN_1]] ], [ [[TMP4]], [[THEN_2]] ], [ [[TMP8]], [[ELSE]] ] +; OPT-NEXT: [[TMP13:%.*]] = phi <2 x i16> [ [[TMP1]], [[THEN_1]] ], [ [[TMP5]], [[THEN_2]] ], [ [[TMP9]], [[ELSE]] ] +; OPT-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP2]], [[THEN_1]] ], [ [[TMP6]], [[THEN_2]] ], [ [[TMP10]], [[ELSE]] ] +; OPT-NEXT: [[TMP15:%.*]] = phi i16 [ [[TMP3]], [[THEN_1]] ], [ [[TMP7]], [[THEN_2]] ], [ [[TMP11]], [[ELSE]] ] +; OPT-NEXT: [[TMP16:%.*]] = call <7 x i16> @llvm.vector.insert.v7i16.v2i16(<7 x i16> poison, <2 x i16> [[TMP12]], i64 0) +; OPT-NEXT: [[TMP17:%.*]] = call <7 x i16> @llvm.vector.insert.v7i16.v2i16(<7 x i16> [[TMP16]], <2 x i16> [[TMP13]], i64 2) +; OPT-NEXT: [[TMP18:%.*]] = call <7 x i16> @llvm.vector.insert.v7i16.v2i16(<7 x i16> [[TMP17]], <2 x i16> [[TMP14]], i64 4) +; OPT-NEXT: [[TMP19:%.*]] = insertelement <7 x i16> [[TMP18]], i16 [[TMP15]], i64 6 +; OPT-NEXT: store <7 x i16> [[TMP19]], ptr [[OUT:%.*]], align 1 +; OPT-NEXT: ret void +; +; NOOPT-LABEL: @phi_v7i16_switch( +; NOOPT-NEXT: entry: +; NOOPT-NEXT: [[X:%.*]] = insertelement <7 x i16> [[IN:%.*]], i16 3, i32 3 +; NOOPT-NEXT: switch i8 [[COND:%.*]], label [[ELSE:%.*]] [ +; NOOPT-NEXT: i8 0, label [[THEN_1:%.*]] +; NOOPT-NEXT: i8 3, label [[THEN_2:%.*]] +; NOOPT-NEXT: ] 
+; NOOPT: then.1: +; NOOPT-NEXT: br label [[FINALLY:%.*]] +; NOOPT: then.2: +; NOOPT-NEXT: br label [[FINALLY]] +; NOOPT: else: +; NOOPT-NEXT: [[Y:%.*]] = insertelement <7 x i16> [[IN]], i16 9, i32 6 +; NOOPT-NEXT: br label [[FINALLY]] +; NOOPT: finally: +; NOOPT-NEXT: [[VAL:%.*]] = phi <7 x i16> [ [[X]], [[THEN_1]] ], [ [[X]], [[THEN_2]] ], [ [[Y]], [[ELSE]] ] +; NOOPT-NEXT: store <7 x i16> [[VAL]], ptr [[OUT:%.*]], align 1 +; NOOPT-NEXT: ret void +; +entry: + %x = insertelement <7 x i16> %in, i16 3, i32 3 + switch i8 %cond, label %else [ + i8 0, label %then.1 + i8 3, label %then.2 + ] +then.1: + br label %finally +then.2: + br label %finally +else: + %y = insertelement <7 x i16> %in, i16 9, i32 6 + br label %finally +finally: + %val = phi <7 x i16> [%x, %then.1], [%x, %then.2], [%y, %else] + store <7 x i16> %val, ptr %out, align 1 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-remove-redundant-endcf < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-remove-redundant-endcf -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; Disabled endcf collapse at -O0. 
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -O0 -amdgpu-remove-redundant-endcf < %s | FileCheck -enable-var-scope -check-prefix=GCN-O0 %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -O0 -amdgpu-remove-redundant-endcf -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN-O0 %s + +; Note: Breaking large PHIs is disabled to prevent branches from being eliminated (in scc_liveness) define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) { ; GCN-LABEL: simple_nested_if: diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -210,11 +210,11 @@ ; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GCN-NEXT: s_branch .LBB4_3 ; GCN-NEXT: .LBB4_2: -; GCN-NEXT: s_mov_b32 s4, 0 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: .LBB4_3: ; %if.end +; GCN-NEXT: s_mov_b32 s4, 0xffff +; GCN-NEXT: v_bfi_b32 v0, s4, v0, v0 ; GCN-NEXT: global_store_short v[0:1], v1, off ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_endpgm @@ -263,10 +263,8 @@ ; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GCN-NEXT: s_branch .LBB5_3 ; GCN-NEXT: .LBB5_2: -; GCN-NEXT: s_mov_b32 s4, 0 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: .LBB5_3: ; %if.end ; GCN-NEXT: global_store_short v[0:1], v1, off ; GCN-NEXT: global_store_dword v[0:1], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/debug-value.ll b/llvm/test/CodeGen/AMDGPU/debug-value.ll --- a/llvm/test/CodeGen/AMDGPU/debug-value.ll +++ b/llvm/test/CodeGen/AMDGPU/debug-value.ll @@ -1,4 +1,4 @@ -; RUN: llc
-mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -amdgpu-codegenprepare-break-large-phis=0 < %s | FileCheck %s %struct.wombat = type { [4 x i32], [4 x i32], [4 x i32] } diff --git a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll --- a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll +++ b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll @@ -1,6 +1,9 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -amdgpu-codegenprepare-break-large-phis=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; Note: breaking up large PHIs is disabled to prevent some testcases from becoming +; branchless. 
+ ; FIXME: This leaves behind a now unnecessary and with exec ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle: diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -28,28 +28,28 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB0_3 ; SI-NEXT: s_branch .LBB0_4 ; SI-NEXT: .LBB0_2: ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB0_3: ; %T ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -63,28 +63,28 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v1 
+; SI-NEXT: v_or_b32_e32 v3, v3, v0 ; SI-NEXT: .LBB0_4: ; %exit ; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 -; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v1, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v4, 0, 16 ; SI-NEXT: v_mov_b32_e32 v3, 0xffff ; SI-NEXT: v_mov_b32_e32 v4, 0x8000 ; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000 ; SI-NEXT: v_bfrev_b32_e32 v6, 1 ; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 +; SI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 ; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v4 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_or_b32_e32 v2, v3, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_8xi16_extract_4xi16: @@ -97,7 +97,7 @@ ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: s_branch .LBB0_4 ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: .LBB0_3: ; %T ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -158,18 +158,18 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: 
v_or_b32_e32 v5, v6, v7 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB1_3 ; SI-NEXT: s_branch .LBB1_4 ; SI-NEXT: .LBB1_2: ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB1_3: ; %T ; SI-NEXT: s_mov_b32 s6, 0 @@ -184,39 +184,39 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v0 -; SI-NEXT: v_or_b32_e32 v5, v5, v1 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: v_or_b32_e32 v5, v5, v0 ; SI-NEXT: .LBB1_4: ; %exit ; SI-NEXT: v_bfe_i32 v0, v5, 0, 16 -; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 -; SI-NEXT: v_bfe_i32 v3, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v1, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v3, 0, 16 +; SI-NEXT: v_bfe_i32 v3, v4, 0, 16 ; SI-NEXT: v_mov_b32_e32 v4, 0xffff ; SI-NEXT: v_mov_b32_e32 v5, 0x8000 ; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000 ; SI-NEXT: v_bfrev_b32_e32 v7, 1 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3 +; SI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, 
v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3 -; SI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v4 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; SI-NEXT: v_or_b32_e32 v0, v2, v3 +; SI-NEXT: v_or_b32_e32 v2, v4, v5 +; SI-NEXT: v_alignbit_b32 v1, v2, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_8xi16_extract_4xi16_2: @@ -229,7 +229,7 @@ ; GFX9-NEXT: s_cbranch_execz .LBB1_3 ; GFX9-NEXT: s_branch .LBB1_4 ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: .LBB1_3: ; %T ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -282,28 +282,24 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 ; SI-NEXT: 
v_cvt_f32_f16_e32 v3, v5 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB2_3 ; SI-NEXT: s_branch .LBB2_4 ; SI-NEXT: .LBB2_2: +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB2_3: ; %T ; SI-NEXT: s_mov_b32 s6, 0 @@ -314,25 +310,21 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v5 ; SI-NEXT: .LBB2_4: ; %exit ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 @@ -344,10 +336,10 @@ ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; SI-NEXT: v_mov_b32_e32 v3, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -361,7 +353,7 @@ ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: s_branch .LBB2_4 ; GFX9-NEXT: .LBB2_2: -; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: .LBB2_3: ; %T ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -444,28 +436,28 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_or_b32_e32 v3, v6, v3 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB3_3 ; SI-NEXT: s_branch .LBB3_4 ; SI-NEXT: .LBB3_2: ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB3_3: ; %T ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, 
v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -495,28 +487,28 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v1 +; SI-NEXT: v_or_b32_e32 v3, v3, v0 ; SI-NEXT: .LBB3_4: ; %exit ; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 -; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v1, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v2, v4, 0, 16 ; SI-NEXT: v_mov_b32_e32 v3, 0xffff ; SI-NEXT: v_mov_b32_e32 v4, 0x8000 ; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000 ; SI-NEXT: v_bfrev_b32_e32 v6, 1 ; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000 -; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 +; SI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 ; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v1, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v4 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_or_b32_e32 v2, v3, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: vec_16xi16_extract_4xi16: @@ -524,33 +516,36 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cbranch_scc0 .LBB3_2 ; GFX9-NEXT: ; %bb.1: ; %F -; GFX9-NEXT: 
global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GFX9-NEXT: s_cbranch_execz .LBB3_3 ; GFX9-NEXT: s_branch .LBB3_4 ; GFX9-NEXT: .LBB3_2: -; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: .LBB3_3: ; %T -; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 ; GFX9-NEXT: .LBB3_4: ; %exit -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0] +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: v_bfi_b32 v0, s4, v4, v4 +; GFX9-NEXT: v_bfi_b32 v0, s4, v4, v0 +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-NEXT: v_pk_ashrrev_i16 v2, 15, v5 op_sel_hi:[0,0] ; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0 -; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1] -; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: v_perm_b32 v1, v2, v3, s4 +; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F @@ -591,11 +586,11 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -613,17 +608,17 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; SI-NEXT: v_or_b32_e32 v3, v6, v3 +; SI-NEXT: v_or_b32_e32 v2, v7, v2 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB4_3 ; SI-NEXT: s_branch .LBB4_4 ; SI-NEXT: .LBB4_2: -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB4_3: ; %T @@ -639,39 +634,39 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 
addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:18 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:20 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:22 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:24 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:26 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:28 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: 
v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v2, v1 +; SI-NEXT: v_or_b32_e32 v2, v6, v0 ; SI-NEXT: .LBB4_4: ; %exit ; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 -; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 +; SI-NEXT: v_bfe_i32 v1, v5, 0, 16 ; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 -; SI-NEXT: v_bfe_i32 v3, v5, 0, 16 +; SI-NEXT: v_bfe_i32 v3, v4, 0, 16 ; SI-NEXT: v_mov_b32_e32 v4, 0xffff ; SI-NEXT: v_mov_b32_e32 v5, 0x8000 ; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000 @@ -695,33 +690,35 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cbranch_scc0 .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; %F -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: s_branch .LBB4_4 ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr7 ; GFX9-NEXT: .LBB4_3: ; %T -; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 ; GFX9-NEXT: .LBB4_4: ; %exit -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v7 op_sel_hi:[0,1] +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: v_bfi_b32 v0, s4, v6, v6 +; GFX9-NEXT: v_bfi_b32 v0, s4, v6, v0 +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_ashrrev_i16 v2, 15, v7 op_sel_hi:[0,1] ; GFX9-NEXT: v_or_b32_e32 v1, 
0xffff8000, v0 -; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v6 op_sel_hi:[0,1] -; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: v_perm_b32 v1, v2, v3, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F @@ -760,44 +757,40 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:18 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, 
v[2:3], s[4:7], 0 addr64 offset:20 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:20 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:22 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:24 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:26 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:28 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB5_3 ; SI-NEXT: s_branch .LBB5_4 ; SI-NEXT: .LBB5_2: +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_mov_b64 vcc, 0 ; SI-NEXT: .LBB5_3: ; %T ; SI-NEXT: s_mov_b32 s6, 0 @@ -808,41 +801,37 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc +; 
SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:14 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:18 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:20 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:22 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:24 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:26 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc +; SI-NEXT: buffer_load_ushort v4, v[0:1], 
s[4:7], 0 addr64 offset:28 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; SI-NEXT: .LBB5_4: ; %exit ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 @@ -854,10 +843,10 @@ ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; SI-NEXT: v_mov_b32_e32 v3, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -866,38 +855,41 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; %F -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GFX9-NEXT: s_cbranch_execz .LBB5_3 ; GFX9-NEXT: s_branch .LBB5_4 ; GFX9-NEXT: .LBB5_2: -; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr5 ; GFX9-NEXT: .LBB5_3: ; %T -; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: 
global_load_dwordx4 v[6:9], v[0:1], off offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 ; GFX9-NEXT: .LBB5_4: ; %exit +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: v_bfi_b32 v0, s4, v4, v4 +; GFX9-NEXT: v_bfi_b32 v0, s4, v4, v0 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v5, v5, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x3800 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x3900 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x3d00 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x3800 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x3900 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x3d00 ; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc -; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v2, vcc -; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v4, v1 src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: v_pack_b32_f16 v1, v5, v6 +; GFX9-NEXT: v_perm_b32 v1, v5, v5, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v4, vcc +; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v4, vcc +; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_pack_b32_f16 v0, v5, v0 +; GFX9-NEXT: v_pack_b32_f16 v1, v6, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -2004,14 +2004,15 @@ ; SI-NEXT: .LBB42_2: ; 
%if ; SI-NEXT: s_load_dword s7, s[2:3], 0x0 ; SI-NEXT: .LBB42_3: ; %endif -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB42_4: +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB42_2 ; ; VI-LABEL: insert_split_bb: @@ -2028,14 +2029,15 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s7, s[2:3], 0x0 ; VI-NEXT: .LBB42_3: ; %endif -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; VI-NEXT: .LBB42_4: +; VI-NEXT: ; implicit-def: $sgpr7 ; VI-NEXT: s_branch .LBB42_2 entry: %0 = insertelement <2 x i32> undef, i32 %a, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll b/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll --- a/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll @@ -10,22 +10,36 @@ ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_f32_e32 v3, v2, v2 -; CHECK-NEXT: v_add_f32_e32 v0, v0, v0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: ; kill: killed $vgpr1 -; CHECK-NEXT: .LBB0_1: ; %bb1 +; CHECK-NEXT: s_branch .LBB0_2 +; CHECK-NEXT: .LBB0_1: ; %Flow13 +; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: s_or_b64 exec, exec, s[10:11] +; CHECK-NEXT: s_and_b64 s[6:7], exec, s[8:9] +; CHECK-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_cbranch_execz .LBB0_6 +; CHECK-NEXT: .LBB0_2: ; %bb1 ; CHECK-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_cmp_eq_f32_e64 s[6:7], 0, v2 ; CHECK-NEXT: v_cmp_neq_f32_e32 vcc, 0, v2 -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB0_1 -; CHECK-NEXT: ; %bb.2: ; %bb2 -; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_and_saveexec_b64 s[8:9], vcc +; CHECK-NEXT: ; %bb.3: ; %bb2 +; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: v_mul_f32_e32 v2, v3, v2 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: s_cbranch_execnz .LBB0_1 -; CHECK-NEXT: ; %bb.3: ; %DummyReturnBlock +; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], exec +; CHECK-NEXT: ; %bb.4: ; %Flow +; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] +; CHECK-NEXT: s_mov_b64 s[8:9], -1 +; CHECK-NEXT: s_and_saveexec_b64 s[10:11], s[6:7] +; CHECK-NEXT: s_cbranch_execz .LBB0_1 +; CHECK-NEXT: ; %bb.5: ; %bb3 +; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: s_xor_b64 s[8:9], exec, -1 +; CHECK-NEXT: s_branch .LBB0_1 +; CHECK-NEXT: .LBB0_6: ; %DummyReturnBlock +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: br label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -4,9 +4,7 @@ ; GCN-LABEL: {{^}}test_mfma_loop_zeroinit: -; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0 -; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] +; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} ; Check that we do not copy agprs to vgprs and back inside the loop. @@ -47,11 +45,8 @@ ; 3 vgprs are needed to avoid wait states between writes. ; Check that we do not use 32 temp sgprs as well. 
-; GFX908: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000 -; GFX940_A: s_mov_b32 [[TMP:s[0-9]+]], 0x42f60000 -; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] -; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]] -; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] +; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000 +; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] ; GCN: [[LOOP:.LBB[0-9_]+]]: ; GCN-NOT: v_accvgpr @@ -84,10 +79,8 @@ ; GCN-LABEL: {{^}}test_mfma_loop_non_splat: -; GCN: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0{{$}} +; GCN-COUNT-31: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} ; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}} -; GFX908-COUNT-30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GFX90A-COUNT-30: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]{{$}} ; GCN: [[LOOP:.LBB[0-9_]+]]: ; GCN-NOT: v_accvgpr @@ -123,73 +116,105 @@ ; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only. ; 3 vgprs are needed to avoid wait states between writes. -; GFX908: v_mov_b32_e32 [[TMP1:v[0-9]+]], 0x42f60000 -; GFX908: v_mov_b32_e32 [[TMP2:v[0-9]+]], 0x42f80000 -; GFX908: v_mov_b32_e32 [[TMP3:v[0-9]+]], 0x42fe0000 -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GFX908: 
v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: 
v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] - -; GFX940_A-COUNT-32: s_mov_b32 s{{[0-9]+}}, 0x4{{[0-9a-f]+}} -; GFX940_A-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x42f80000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x42fa0000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x42fc0000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x42fe0000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x43000000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x43010000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x43020000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x43030000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x43040000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x43050000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x43060000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x43070000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x43080000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x43090000 +; GFX-908: s_nop 1 
+; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x430a0000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x430b0000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x430c0000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x430d0000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x430e0000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x430f0000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x43100000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x43110000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x43120000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x43130000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x43140000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x43150000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x43160000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x43170000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x43180000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x43190000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 +; GFX-908: v_mov_b32_e32 v0, 0x431a0000 +; GFX-908: s_nop 1 +; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0 + +; FIXME: Constant is now in VGPR instead of SGPR. 
+ +; GFX940_A: v_mov_b32_e32 v{{[0-9]+}}, 0x4{{[0-9a-f]+}} +; GFX940_A-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} ; GCN: [[LOOP:.LBB[0-9_]+]]: ; GCN-NOT: v_accvgpr @@ -292,8 +317,7 @@ ; GFX908: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}} ; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] -; GFX940_A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], s{{[0-9]+}} -; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] +; GFX940_A-COUNT-32: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], s{{[0-9]+}} ; GCN: [[LOOP:.LBB[0-9_]+]]: ; GCN-NOT: v_accvgpr @@ -395,7 +419,7 @@ ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} ; GFX90A-DAG: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0 -; GFX90A-COUNT-28: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] +; GFX90A-COUNT-28: v_accvgpr_write_b32 a{{[0-9]+}}, 0 ; GCN: [[LOOP:.LBB[0-9_]+]]: ; GCN-NOT: v_accvgpr @@ -483,10 +507,8 @@ ; Check that we are using only one tmp VGPR. -; GCN: v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}} -; GFX908-COUNT-31: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]{{$}} -; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]{{$}} -; GFX90A-COUNT-29: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] +; GFX908: v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}} +; GFX940_A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} ; GCN: [[LOOP:.LBB[0-9_]+]]: ; GCN-NOT: v_accvgpr @@ -554,9 +576,7 @@ ; GCN-LABEL: {{^}}test_mfma_nested_loop_zeroinit: -; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0 -; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] +; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} ; Check that we do not copy agprs to vgprs and back in an outer loop. 
diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll --- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -1,9 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=tahiti -amdgpu-dce-in-ra=0 -o - %s | FileCheck %s +; RUN: llc -march=amdgcn -amdgpu-codegenprepare-break-large-phis=0 -mcpu=tahiti -amdgpu-dce-in-ra=0 -o - %s | FileCheck %s ; Don't crash when the use of an undefined value is only detected by the ; register coalescer because it is hidden with subregister insert/extract. target triple="amdgcn--" +; NOTE: breaking large PHIs is disabled here else this example is completely optimized out +; before reaching codegen. + define amdgpu_kernel void @foobar(float %a0, float %a1, ptr addrspace(1) %out) nounwind { ; CHECK-LABEL: foobar: ; CHECK: ; %bb.0: ; %entry diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -32,138 +32,116 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i32 %tmp5.i.i, i32 %tmp427.i, i1 %tmp438.i, double %tmp27.i, i1 %tmp48.i) { ; GLOBALNESS1-LABEL: kernel: ; GLOBALNESS1: ; %bb.0: ; %bb -; GLOBALNESS1-NEXT: s_mov_b64 s[54:55], s[6:7] -; GLOBALNESS1-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 +; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[6:7] +; GLOBALNESS1-NEXT: s_load_dwordx4 s[76:79], s[8:9], 0x0 ; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v0 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v40, 0 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS1-NEXT: global_store_dword v[0:1], v40, off ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; 
GLOBALNESS1-NEXT: global_load_dword v0, v40, s[36:37] -; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GLOBALNESS1-NEXT: s_mov_b64 s[64:65], s[4:5] +; GLOBALNESS1-NEXT: global_load_dword v0, v40, s[76:77] +; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5] ; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS1-NEXT: s_load_dword s7, s[8:9], 0x20 +; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17 ; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, 0x40994400 -; GLOBALNESS1-NEXT: s_bitcmp1_b32 s38, 0 +; GLOBALNESS1-NEXT: s_bitcmp1_b32 s78, 0 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[36:37], s[4:5], v[40:41] -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[40:41], s[4:5], 0 +; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[40:41] +; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: s_xor_b64 s[94:95], s[4:5], -1 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] +; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s6, 0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: s_xor_b64 s[88:89], s[4:5], -1 +; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s7, 0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v1 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: s_getpc_b64 s[6:7] ; GLOBALNESS1-NEXT: s_add_u32 s6, s6, wobble@gotpcrel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12 -; GLOBALNESS1-NEXT: s_xor_b64 s[86:87], 
s[4:5], -1 -; GLOBALNESS1-NEXT: s_load_dwordx2 s[66:67], s[6:7], 0x0 -; GLOBALNESS1-NEXT: s_mov_b32 s98, s16 -; GLOBALNESS1-NEXT: s_mov_b64 s[62:63], s[8:9] -; GLOBALNESS1-NEXT: s_mov_b32 s99, s15 -; GLOBALNESS1-NEXT: s_mov_b32 s100, s14 +; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v1 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GLOBALNESS1-NEXT: s_load_dwordx2 s[76:77], s[6:7], 0x0 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v1 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v2 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3 +; GLOBALNESS1-NEXT: s_mov_b32 s70, s16 +; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9] +; GLOBALNESS1-NEXT: s_mov_b32 s71, s15 +; GLOBALNESS1-NEXT: s_mov_b32 s72, s14 ; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11] -; GLOBALNESS1-NEXT: s_mov_b64 s[92:93], 0x80 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 -; GLOBALNESS1-NEXT: s_mov_b32 s69, 0x3ff00000 +; GLOBALNESS1-NEXT: s_mov_b64 s[74:75], 0x80 ; GLOBALNESS1-NEXT: s_mov_b32 s32, 0 -; GLOBALNESS1-NEXT: ; implicit-def: $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47_agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63 +; GLOBALNESS1-NEXT: ; implicit-def: $vgpr44_vgpr45 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v42, s4, 0 -; GLOBALNESS1-NEXT: v_writelane_b32 v42, s5, 1 -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v42, s4, 2 -; GLOBALNESS1-NEXT: v_writelane_b32 v42, s5, 3 -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v42, s4, 4 -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[90:91], 1, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v42, s5, 5 +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v0 +; GLOBALNESS1-NEXT: 
v_cndmask_b32_e64 v1, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 1, v0 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[54:55], 1, v1 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v2 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v3 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v0 ; GLOBALNESS1-NEXT: s_branch .LBB1_4 ; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_readlane_b32 s6, v42, 4 -; GLOBALNESS1-NEXT: v_readlane_b32 s7, v42, 5 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[60:61] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_29 -; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow6 +; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow19 +; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow28 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a63, v31 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a62, v30 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a61, v29 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a60, v28 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a59, v27 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a58, v26 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a57, v25 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a56, v24 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a55, v23 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a54, v22 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a53, v21 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a52, v20 -; GLOBALNESS1-NEXT: 
v_accvgpr_write_b32 a51, v19 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a50, v18 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a49, v17 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a48, v16 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a47, v15 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a46, v14 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a45, v13 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a44, v12 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a43, v11 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a42, v10 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a41, v9 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a40, v8 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a39, v7 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a38, v6 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a37, v5 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a36, v4 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a35, v3 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a34, v2 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a33, v1 -; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a32, v0 +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_30 ; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS1-NEXT: ; Child Loop BB1_15 Depth 2 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[92:93], s[92:93] op_sel:[0,1] -; GLOBALNESS1-NEXT: flat_load_dword v44, v[0:1] -; GLOBALNESS1-NEXT: s_add_u32 s8, s62, 40 +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[74:75], s[74:75] op_sel:[0,1] +; GLOBALNESS1-NEXT: flat_load_dword v43, v[0:1] +; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS1-NEXT: buffer_store_dword v40, off, s[0:3], 0 -; GLOBALNESS1-NEXT: flat_load_dword v45, v[0:1] -; GLOBALNESS1-NEXT: s_addc_u32 s9, s63, 0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] +; GLOBALNESS1-NEXT: flat_load_dword v46, v[0:1] +; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; 
GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67] -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[42:43] +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[46:47] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_8 ; GLOBALNESS1-NEXT: ; %bb.5: ; %NodeBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lt_i32 s39, 1 +; GLOBALNESS1-NEXT: s_cmp_lt_i32 s79, 1 ; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_7 -; GLOBALNESS1-NEXT: ; %bb.6: ; %LeafBlock3 +; GLOBALNESS1-NEXT: ; %bb.6: ; %LeafBlock12 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lg_u32 s39, 1 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s79, 1 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS1-NEXT: s_cbranch_execnz .LBB1_8 @@ -172,57 +150,24 @@ ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS1-NEXT: s_branch .LBB1_23 -; GLOBALNESS1-NEXT: .LBB1_8: ; %Flow16 +; GLOBALNESS1-NEXT: .LBB1_8: ; %Flow25 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_24 ; GLOBALNESS1-NEXT: .LBB1_9: ; %baz.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS1-NEXT: flat_load_dword v0, v[32:33] -; GLOBALNESS1-NEXT: s_mov_b32 s68, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s70, s93 -; 
GLOBALNESS1-NEXT: s_mov_b32 s71, s69 -; GLOBALNESS1-NEXT: s_mov_b32 s72, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s73, s69 -; GLOBALNESS1-NEXT: s_mov_b32 s74, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s75, s69 -; GLOBALNESS1-NEXT: s_mov_b32 s76, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s77, s69 -; GLOBALNESS1-NEXT: s_mov_b32 s78, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s79, s69 -; GLOBALNESS1-NEXT: s_mov_b32 s80, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s81, s69 -; GLOBALNESS1-NEXT: s_mov_b32 s82, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s83, s69 -; GLOBALNESS1-NEXT: s_mov_b32 s84, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s85, s69 +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 +; GLOBALNESS1-NEXT: flat_load_dword v0, v[2:3] ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[96:97], 0, v0 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[68:69], s[68:69] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], s[70:71], s[70:71] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[4:5], s[72:73], s[72:73] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[6:7], s[74:75], s[74:75] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[8:9], s[76:77], s[76:77] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[10:11], s[78:79], s[78:79] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[12:13], s[80:81], s[80:81] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[14:15], s[82:83], s[82:83] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[16:17], s[84:85], s[84:85] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[18:19], s[86:87], s[86:87] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[20:21], s[88:89], s[88:89] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[22:23], s[90:91], s[90:91] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[24:25], s[92:93], s[92:93] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[26:27], s[94:95], s[94:95] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[28:29], s[96:97], s[96:97] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[30:31], s[98:99], s[98:99] 
op_sel:[0,1] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[70:71], s[96:97] +; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[80:81], s[62:63] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS1-NEXT: ; %bb.10: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[32:33], off -; GLOBALNESS1-NEXT: v_readlane_b32 s4, v42, 0 -; GLOBALNESS1-NEXT: v_readlane_b32 s5, v42, 1 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[2:3], off +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[54:55] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_12 ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -231,166 +176,123 @@ ; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[40:41], off ; GLOBALNESS1-NEXT: .LBB1_12: ; %bb44.lr.ph.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v45 -; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v44, vcc -; GLOBALNESS1-NEXT: s_mov_b64 s[72:73], s[42:43] -; GLOBALNESS1-NEXT: s_mov_b32 s75, s39 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 +; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v43, vcc ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[56:57], 0, v[0:1] -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[58:59], 0, v2 +; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[64:65], 0, v[0:1] +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[66:67], 0, v2 ; GLOBALNESS1-NEXT: s_branch .LBB1_15 -; GLOBALNESS1-NEXT: .LBB1_13: ; %Flow7 +; GLOBALNESS1-NEXT: .LBB1_13: ; %Flow16 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS1-NEXT: .LBB1_14: ; %bb63.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[86:87] +; 
GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[52:53] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS1-NEXT: .LBB1_15: ; %bb44.i ; GLOBALNESS1-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS1-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[94:95] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[48:49] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_14 ; GLOBALNESS1-NEXT: ; %bb.16: ; %bb46.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[88:89] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[50:51] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_14 ; GLOBALNESS1-NEXT: ; %bb.17: ; %bb50.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[42:43] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_20 ; GLOBALNESS1-NEXT: ; %bb.18: ; %bb3.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[40:41] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[44:45] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_20 ; GLOBALNESS1-NEXT: ; %bb.19: ; %bb6.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[56:57] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[64:65] ; GLOBALNESS1-NEXT: .LBB1_20: ; %spam.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[90:91] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[56:57] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_14 ; GLOBALNESS1-NEXT: ; %bb.21: ; %bb55.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_add_u32 s60, s62, 40 -; GLOBALNESS1-NEXT: s_addc_u32 s61, s63, 0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[60:61] +; GLOBALNESS1-NEXT: s_add_u32 s68, s38, 40 +; GLOBALNESS1-NEXT: s_addc_u32 s69, s39, 0 +; GLOBALNESS1-NEXT: 
s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], 0, 0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[60:61] +; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[46:47], 0, 0 +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], a[32:33], off -; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[58:59] +; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off +; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[66:67] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.22: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[40:41], off +; GLOBALNESS1-NEXT: 
global_store_dwordx2 v[46:47], v[40:41], off ; GLOBALNESS1-NEXT: s_branch .LBB1_13 ; GLOBALNESS1-NEXT: .LBB1_23: ; %LeafBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_cmp_lg_u32 s39, 0 +; GLOBALNESS1-NEXT: s_cmp_lg_u32 s79, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS1-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 -; GLOBALNESS1-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GLOBALNESS1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GLOBALNESS1-NEXT: s_branch .LBB1_3 -; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow14 +; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow23 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[36:37] -; GLOBALNESS1-NEXT: s_mov_b32 s36, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s37, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s38, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s39, s93 -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[40:41] -; GLOBALNESS1-NEXT: s_mov_b32 s40, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s41, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s42, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s43, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s44, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s45, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s46, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s47, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s48, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s49, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s50, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s51, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s52, s93 -; GLOBALNESS1-NEXT: s_mov_b32 s53, s93 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], s[38:39], s[38:39] op_sel:[0,1] -; 
GLOBALNESS1-NEXT: v_pk_mov_b32 v[4:5], s[40:41], s[40:41] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[6:7], s[42:43], s[42:43] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[8:9], s[44:45], s[44:45] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[10:11], s[46:47], s[46:47] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[12:13], s[48:49], s[48:49] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[14:15], s[50:51], s[50:51] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[16:17], s[52:53], s[52:53] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[18:19], s[54:55], s[54:55] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[20:21], s[56:57], s[56:57] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[22:23], s[58:59], s[58:59] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[24:25], s[60:61], s[60:61] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[26:27], s[62:63], s[62:63] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[28:29], s[64:65], s[64:65] op_sel:[0,1] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[30:31], s[66:67], s[66:67] op_sel:[0,1] -; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[6:7] -; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[4:5] -; GLOBALNESS1-NEXT: s_mov_b32 s39, s75 -; GLOBALNESS1-NEXT: s_mov_b64 s[42:43], s[72:73] -; GLOBALNESS1-NEXT: .LBB1_26: ; %Flow15 +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 +; GLOBALNESS1-NEXT: .LBB1_26: ; %Flow24 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[70:71] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[96:97] +; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[80:81] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_readlane_b32 s6, v42, 2 -; GLOBALNESS1-NEXT: v_readlane_b32 s7, v42, 3 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[58:59] ; GLOBALNESS1-NEXT: 
s_cbranch_vccnz .LBB1_1 ; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[40:41], off +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 +; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[40:41], off ; GLOBALNESS1-NEXT: s_branch .LBB1_1 ; GLOBALNESS1-NEXT: .LBB1_29: ; %bb73.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[40:41], off +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 +; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[40:41], off ; GLOBALNESS1-NEXT: s_branch .LBB1_2 ; GLOBALNESS1-NEXT: .LBB1_30: ; %loop.exit.guard ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_32 ; GLOBALNESS1-NEXT: ; %bb.31: ; %bb7.i.i -; GLOBALNESS1-NEXT: s_add_u32 s8, s62, 40 -; GLOBALNESS1-NEXT: s_addc_u32 s9, s63, 0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] +; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 +; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 @@ -400,15 +302,15 @@ ; GLOBALNESS1-NEXT: s_andn2_b64 
vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_34 ; GLOBALNESS1-NEXT: ; %bb.33: ; %bb11.i.i -; GLOBALNESS1-NEXT: s_add_u32 s8, s62, 40 -; GLOBALNESS1-NEXT: s_addc_u32 s9, s63, 0 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] +; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 +; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 -; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 -; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 +; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 +; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 @@ -417,138 +319,116 @@ ; ; GLOBALNESS0-LABEL: kernel: ; GLOBALNESS0: ; %bb.0: ; %bb -; GLOBALNESS0-NEXT: s_mov_b64 s[54:55], s[6:7] -; GLOBALNESS0-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 +; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7] +; GLOBALNESS0-NEXT: s_load_dwordx4 s[72:75], s[8:9], 0x0 ; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v0 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v40, 0 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: global_store_dword v[0:1], v40, off ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: global_load_dword v0, v40, s[36:37] -; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GLOBALNESS0-NEXT: s_mov_b64 s[62:63], s[4:5] +; GLOBALNESS0-NEXT: global_load_dword v0, v40, s[72:73] +; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5] ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS0-NEXT: s_load_dword s7, s[8:9], 0x20 +; 
GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17 ; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, 0x40994400 -; GLOBALNESS0-NEXT: s_bitcmp1_b32 s38, 0 +; GLOBALNESS0-NEXT: s_bitcmp1_b32 s74, 0 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[36:37], s[4:5], v[40:41] -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[40:41], s[4:5], 0 +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[40:41] +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: s_xor_b64 s[94:95], s[4:5], -1 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] +; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: s_xor_b64 s[88:89], s[4:5], -1 +; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s7, 0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[48:49], 1, v1 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: s_getpc_b64 s[6:7] ; GLOBALNESS0-NEXT: s_add_u32 s6, s6, wobble@gotpcrel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12 -; GLOBALNESS0-NEXT: s_xor_b64 s[86:87], s[4:5], -1 -; GLOBALNESS0-NEXT: s_load_dwordx2 s[66:67], s[6:7], 0x0 -; GLOBALNESS0-NEXT: s_mov_b32 s98, s16 -; GLOBALNESS0-NEXT: s_mov_b64 s[60:61], s[8:9] -; GLOBALNESS0-NEXT: s_mov_b32 s99, s15 -; GLOBALNESS0-NEXT: s_mov_b32 s100, s14 +; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v1 +; GLOBALNESS0-NEXT: 
v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GLOBALNESS0-NEXT: s_load_dwordx2 s[78:79], s[6:7], 0x0 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v1 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v2 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3 +; GLOBALNESS0-NEXT: s_mov_b32 s68, s16 +; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9] +; GLOBALNESS0-NEXT: s_mov_b32 s69, s15 +; GLOBALNESS0-NEXT: s_mov_b32 s70, s14 ; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11] -; GLOBALNESS0-NEXT: s_mov_b64 s[92:93], 0x80 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 -; GLOBALNESS0-NEXT: s_mov_b32 s69, 0x3ff00000 +; GLOBALNESS0-NEXT: s_mov_b64 s[76:77], 0x80 ; GLOBALNESS0-NEXT: s_mov_b32 s32, 0 -; GLOBALNESS0-NEXT: ; implicit-def: $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47_agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63 +; GLOBALNESS0-NEXT: ; implicit-def: $vgpr44_vgpr45 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s4, 0 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s5, 1 -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s4, 2 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s5, 3 -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s4, 4 -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[90:91], 1, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s5, 5 +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v0 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 1, v0 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 
s[54:55], 1, v1 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[56:57], 1, v2 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[58:59], 1, v3 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[60:61], 1, v0 ; GLOBALNESS0-NEXT: s_branch .LBB1_4 ; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v42, 4 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v42, 5 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[60:61] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_29 -; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow6 +; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow19 +; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow28 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a63, v31 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a62, v30 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a61, v29 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a60, v28 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a59, v27 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a58, v26 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a57, v25 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a56, v24 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a55, v23 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a54, v22 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a53, v21 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a52, v20 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a51, v19 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a50, v18 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a49, v17 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a48, v16 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a47, v15 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a46, v14 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a45, v13 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a44, v12 -; 
GLOBALNESS0-NEXT: v_accvgpr_write_b32 a43, v11 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a42, v10 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a41, v9 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a40, v8 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a39, v7 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a38, v6 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a37, v5 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a36, v4 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a35, v3 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a34, v2 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a33, v1 -; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a32, v0 +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_30 ; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS0-NEXT: ; Child Loop BB1_15 Depth 2 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[92:93], s[92:93] op_sel:[0,1] -; GLOBALNESS0-NEXT: flat_load_dword v44, v[0:1] -; GLOBALNESS0-NEXT: s_add_u32 s8, s60, 40 +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[76:77], s[76:77] op_sel:[0,1] +; GLOBALNESS0-NEXT: flat_load_dword v43, v[0:1] +; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS0-NEXT: buffer_store_dword v40, off, s[0:3], 0 -; GLOBALNESS0-NEXT: flat_load_dword v45, v[0:1] -; GLOBALNESS0-NEXT: s_addc_u32 s9, s61, 0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] +; GLOBALNESS0-NEXT: flat_load_dword v46, v[0:1] +; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 ; 
GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67] -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[42:43] +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[46:47] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_8 ; GLOBALNESS0-NEXT: ; %bb.5: ; %NodeBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lt_i32 s39, 1 +; GLOBALNESS0-NEXT: s_cmp_lt_i32 s75, 1 ; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_7 -; GLOBALNESS0-NEXT: ; %bb.6: ; %LeafBlock3 +; GLOBALNESS0-NEXT: ; %bb.6: ; %LeafBlock12 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lg_u32 s39, 1 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s75, 1 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS0-NEXT: s_cbranch_execnz .LBB1_8 @@ -557,57 +437,24 @@ ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS0-NEXT: s_branch .LBB1_23 -; GLOBALNESS0-NEXT: .LBB1_8: ; %Flow16 +; GLOBALNESS0-NEXT: .LBB1_8: ; %Flow25 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_24 ; GLOBALNESS0-NEXT: .LBB1_9: ; %baz.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS0-NEXT: flat_load_dword v0, v[32:33] -; GLOBALNESS0-NEXT: s_mov_b32 s68, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s70, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s71, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s72, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s73, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s74, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s75, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s76, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s77, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s78, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s79, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s80, 
s93 -; GLOBALNESS0-NEXT: s_mov_b32 s81, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s82, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s83, s69 -; GLOBALNESS0-NEXT: s_mov_b32 s84, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s85, s69 +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 +; GLOBALNESS0-NEXT: flat_load_dword v0, v[2:3] ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[96:97], 0, v0 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[68:69], s[68:69] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], s[70:71], s[70:71] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[4:5], s[72:73], s[72:73] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[6:7], s[74:75], s[74:75] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[8:9], s[76:77], s[76:77] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[10:11], s[78:79], s[78:79] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[12:13], s[80:81], s[80:81] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[14:15], s[82:83], s[82:83] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[16:17], s[84:85], s[84:85] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[18:19], s[86:87], s[86:87] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[20:21], s[88:89], s[88:89] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[22:23], s[90:91], s[90:91] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[24:25], s[92:93], s[92:93] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[26:27], s[94:95], s[94:95] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[28:29], s[96:97], s[96:97] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[30:31], s[98:99], s[98:99] op_sel:[0,1] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[70:71], s[96:97] +; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[80:81], s[62:63] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS0-NEXT: ; %bb.10: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: 
Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[32:33], off -; GLOBALNESS0-NEXT: v_readlane_b32 s4, v42, 0 -; GLOBALNESS0-NEXT: v_readlane_b32 s5, v42, 1 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[54:55] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_12 ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -616,166 +463,123 @@ ; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[40:41], off ; GLOBALNESS0-NEXT: .LBB1_12: ; %bb44.lr.ph.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v45 -; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v44, vcc -; GLOBALNESS0-NEXT: s_mov_b64 s[72:73], s[42:43] -; GLOBALNESS0-NEXT: s_mov_b32 s75, s39 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 +; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v43, vcc ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[56:57], 0, v[0:1] -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[58:59], 0, v2 +; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[64:65], 0, v[0:1] +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[66:67], 0, v2 ; GLOBALNESS0-NEXT: s_branch .LBB1_15 -; GLOBALNESS0-NEXT: .LBB1_13: ; %Flow7 +; GLOBALNESS0-NEXT: .LBB1_13: ; %Flow16 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] ; GLOBALNESS0-NEXT: .LBB1_14: ; %bb63.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[86:87] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[52:53] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_25 ; GLOBALNESS0-NEXT: .LBB1_15: ; %bb44.i ; GLOBALNESS0-NEXT: ; Parent Loop BB1_4 Depth=1 ; GLOBALNESS0-NEXT: ; => This Inner Loop Header: Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[94:95] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[48:49] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_14 ; 
GLOBALNESS0-NEXT: ; %bb.16: ; %bb46.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[88:89] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[50:51] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_14 ; GLOBALNESS0-NEXT: ; %bb.17: ; %bb50.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[42:43] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_20 ; GLOBALNESS0-NEXT: ; %bb.18: ; %bb3.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[40:41] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[44:45] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_20 ; GLOBALNESS0-NEXT: ; %bb.19: ; %bb6.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[56:57] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[64:65] ; GLOBALNESS0-NEXT: .LBB1_20: ; %spam.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[90:91] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[56:57] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_14 ; GLOBALNESS0-NEXT: ; %bb.21: ; %bb55.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_add_u32 s64, s60, 40 -; GLOBALNESS0-NEXT: s_addc_u32 s65, s61, 0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[64:65] +; GLOBALNESS0-NEXT: s_add_u32 s72, s38, 40 +; GLOBALNESS0-NEXT: s_addc_u32 s73, s39, 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67] -; 
GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], 0, 0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[64:65] +; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[46:47], 0, 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], a[32:33], off -; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[58:59] +; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off +; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[66:67] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_13 ; GLOBALNESS0-NEXT: ; %bb.22: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[40:41], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[40:41], off ; GLOBALNESS0-NEXT: s_branch .LBB1_13 ; GLOBALNESS0-NEXT: .LBB1_23: ; %LeafBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_cmp_lg_u32 s39, 0 +; GLOBALNESS0-NEXT: s_cmp_lg_u32 s75, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] ; 
GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS0-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 -; GLOBALNESS0-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GLOBALNESS0-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GLOBALNESS0-NEXT: s_branch .LBB1_3 -; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow14 +; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow23 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[36:37] -; GLOBALNESS0-NEXT: s_mov_b32 s36, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s37, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s38, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s39, s93 -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[40:41] -; GLOBALNESS0-NEXT: s_mov_b32 s40, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s41, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s42, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s43, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s44, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s45, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s46, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s47, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s48, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s49, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s50, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s51, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s52, s93 -; GLOBALNESS0-NEXT: s_mov_b32 s53, s93 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], s[38:39], s[38:39] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[4:5], s[40:41], s[40:41] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[6:7], s[42:43], s[42:43] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[8:9], s[44:45], s[44:45] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[10:11], s[46:47], s[46:47] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[12:13], s[48:49], s[48:49] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[14:15], 
s[50:51], s[50:51] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[16:17], s[52:53], s[52:53] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[18:19], s[54:55], s[54:55] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[20:21], s[56:57], s[56:57] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[22:23], s[58:59], s[58:59] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[24:25], s[60:61], s[60:61] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[26:27], s[62:63], s[62:63] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[28:29], s[64:65], s[64:65] op_sel:[0,1] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[30:31], s[66:67], s[66:67] op_sel:[0,1] -; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[6:7] -; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[4:5] -; GLOBALNESS0-NEXT: s_mov_b32 s39, s75 -; GLOBALNESS0-NEXT: s_mov_b64 s[42:43], s[72:73] -; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow15 +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 +; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow24 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[70:71] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[96:97] +; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[80:81] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[62:63] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v42, 2 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v42, 3 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[58:59] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[40:41], off +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 +; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[40:41], off ; GLOBALNESS0-NEXT: s_branch .LBB1_1 ; 
GLOBALNESS0-NEXT: .LBB1_29: ; %bb73.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[40:41], off +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 +; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[40:41], off ; GLOBALNESS0-NEXT: s_branch .LBB1_2 ; GLOBALNESS0-NEXT: .LBB1_30: ; %loop.exit.guard ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_32 ; GLOBALNESS0-NEXT: ; %bb.31: ; %bb7.i.i -; GLOBALNESS0-NEXT: s_add_u32 s8, s60, 40 -; GLOBALNESS0-NEXT: s_addc_u32 s9, s61, 0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] +; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 +; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 @@ -785,15 +589,15 @@ ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_34 ; GLOBALNESS0-NEXT: ; %bb.33: ; %bb11.i.i -; GLOBALNESS0-NEXT: s_add_u32 s8, s60, 40 -; GLOBALNESS0-NEXT: s_addc_u32 s9, s61, 0 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] +; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 +; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] +; 
GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 -; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 -; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 +; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 +; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/v1024.ll b/llvm/test/CodeGen/AMDGPU/v1024.ll --- a/llvm/test/CodeGen/AMDGPU/v1024.ll +++ b/llvm/test/CodeGen/AMDGPU/v1024.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}test_v1024: ; GCN-NOT: v_accvgpr -; GCN-COUNT-32: v_mov_b32_e32 +; GCN-COUNT-8: global_store_dwordx4 ; GCN-NOT: v_accvgpr define amdgpu_kernel void @test_v1024() { entry: diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1811,28 +1811,28 @@ ; GFX1032-NEXT: s_branch .LBB33_2 ; GFX1032-NEXT: .LBB33_1: ; %body ; GFX1032-NEXT: ; in Loop: Header=BB33_2 Depth=1 -; GFX1032-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX1032-NEXT: image_sample v[0:3], v7, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX1032-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX1032-NEXT: s_cbranch_execz .LBB33_4 ; GFX1032-NEXT: .LBB33_2: ; %loop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v7, v3 -; GFX1032-NEXT: v_mov_b32_e32 v6, v2 -; GFX1032-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032-NEXT: v_mov_b32_e32 v4, v0 +; GFX1032-NEXT: v_mov_b32_e32 v4, v3 +; GFX1032-NEXT: v_mov_b32_e32 v5, v2 +; GFX1032-NEXT: v_mov_b32_e32 v6, v1 +; GFX1032-NEXT: v_mov_b32_e32 v7, v0 ; 
GFX1032-NEXT: s_cbranch_vccz .LBB33_1 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX1032-NEXT: ; implicit-def: $vgpr3 ; GFX1032-NEXT: ; implicit-def: $vgpr8 ; GFX1032-NEXT: .LBB33_4: ; %break ; GFX1032-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v0, v4 -; GFX1032-NEXT: v_mov_b32_e32 v1, v5 -; GFX1032-NEXT: v_mov_b32_e32 v2, v6 -; GFX1032-NEXT: v_mov_b32_e32 v3, v7 +; GFX1032-NEXT: v_mov_b32_e32 v0, v7 +; GFX1032-NEXT: v_mov_b32_e32 v1, v6 +; GFX1032-NEXT: v_mov_b32_e32 v2, v5 +; GFX1032-NEXT: v_mov_b32_e32 v3, v4 ; GFX1032-NEXT: ; return to shader part epilog ; ; GFX1064-LABEL: test_loop_vcc: @@ -1843,28 +1843,28 @@ ; GFX1064-NEXT: s_branch .LBB33_2 ; GFX1064-NEXT: .LBB33_1: ; %body ; GFX1064-NEXT: ; in Loop: Header=BB33_2 Depth=1 -; GFX1064-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX1064-NEXT: image_sample v[0:3], v7, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX1064-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX1064-NEXT: s_cbranch_execz .LBB33_4 ; GFX1064-NEXT: .LBB33_2: ; %loop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_cmp_lt_f32_e32 vcc, 0x40e00000, v8 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v7, v3 -; GFX1064-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064-NEXT: v_mov_b32_e32 v4, v0 +; GFX1064-NEXT: v_mov_b32_e32 v4, v3 +; GFX1064-NEXT: v_mov_b32_e32 v5, v2 +; GFX1064-NEXT: v_mov_b32_e32 v6, v1 +; GFX1064-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064-NEXT: s_cbranch_vccz .LBB33_1 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX1064-NEXT: ; implicit-def: $vgpr3 ; GFX1064-NEXT: ; implicit-def: $vgpr8 ; GFX1064-NEXT: .LBB33_4: ; %break ; GFX1064-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v0, v4 -; GFX1064-NEXT: v_mov_b32_e32 v1, v5 -; GFX1064-NEXT: v_mov_b32_e32 
v2, v6 -; GFX1064-NEXT: v_mov_b32_e32 v3, v7 +; GFX1064-NEXT: v_mov_b32_e32 v0, v7 +; GFX1064-NEXT: v_mov_b32_e32 v1, v6 +; GFX1064-NEXT: v_mov_b32_e32 v2, v5 +; GFX1064-NEXT: v_mov_b32_e32 v3, v4 ; GFX1064-NEXT: ; return to shader part epilog entry: br label %loop diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -1835,36 +1835,36 @@ ; GFX9-W64: ; %bb.0: ; %entry ; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec -; GFX9-W64-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-W64-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-W64-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0 ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] -; GFX9-W64-NEXT: image_store v[4:7], v0, s[0:7] dmask:0xf unorm +; GFX9-W64-NEXT: image_store v[0:3], v0, s[0:7] dmask:0xf unorm ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-W64-NEXT: s_mov_b32 s4, 0x40e00000 ; GFX9-W64-NEXT: s_branch .LBB31_2 ; GFX9-W64-NEXT: .LBB31_1: ; %body ; GFX9-W64-NEXT: ; in Loop: Header=BB31_2 Depth=1 -; GFX9-W64-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf +; GFX9-W64-NEXT: image_sample v[0:3], v7, s[0:7], s[0:3] dmask:0xf ; GFX9-W64-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX9-W64-NEXT: s_cbranch_execz .LBB31_4 ; GFX9-W64-NEXT: .LBB31_2: ; %loop ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s4, v8 -; GFX9-W64-NEXT: v_mov_b32_e32 v1, v5 -; GFX9-W64-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-W64-NEXT: v_mov_b32_e32 v3, v7 +; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-W64-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-W64-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-W64-NEXT: v_mov_b32_e32 v7, v0 ; GFX9-W64-NEXT: s_cbranch_vccz .LBB31_1 ; GFX9-W64-NEXT: ; %bb.3: -; GFX9-W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; 
GFX9-W64-NEXT: ; implicit-def: $vgpr3 ; GFX9-W64-NEXT: ; implicit-def: $vgpr8 ; GFX9-W64-NEXT: .LBB31_4: ; %break ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) +; GFX9-W64-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-W64-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-W64-NEXT: v_mov_b32_e32 v2, v5 +; GFX9-W64-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_loop_vcc: @@ -1879,28 +1879,28 @@ ; GFX10-W32-NEXT: .p2align 6 ; GFX10-W32-NEXT: .LBB31_1: ; %body ; GFX10-W32-NEXT: ; in Loop: Header=BB31_2 Depth=1 -; GFX10-W32-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: image_sample v[0:3], v7, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX10-W32-NEXT: s_cbranch_execz .LBB31_4 ; GFX10-W32-NEXT: .LBB31_2: ; %loop ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-W32-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: v_mov_b32_e32 v7, v3 -; GFX10-W32-NEXT: v_mov_b32_e32 v6, v2 -; GFX10-W32-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-W32-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-W32-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-W32-NEXT: v_mov_b32_e32 v7, v0 ; GFX10-W32-NEXT: s_cbranch_vccz .LBB31_1 ; GFX10-W32-NEXT: ; %bb.3: -; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX10-W32-NEXT: ; implicit-def: $vgpr3 ; GFX10-W32-NEXT: ; implicit-def: $vgpr8 ; GFX10-W32-NEXT: .LBB31_4: ; %break ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: v_mov_b32_e32 v0, v4 -; GFX10-W32-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-W32-NEXT: v_mov_b32_e32 v2, v6 -; GFX10-W32-NEXT: v_mov_b32_e32 v3, v7 +; GFX10-W32-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, v6 +; GFX10-W32-NEXT: v_mov_b32_e32 v2, v5 +; GFX10-W32-NEXT: v_mov_b32_e32 v3, v4 
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-W32-NEXT: ; return to shader part epilog entry: @@ -2136,7 +2136,7 @@ ; GFX9-W64-NEXT: s_cbranch_execz .LBB35_3 ; GFX9-W64-NEXT: s_branch .LBB35_4 ; GFX9-W64-NEXT: .LBB35_2: -; GFX9-W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX9-W64-NEXT: ; implicit-def: $vgpr3 ; GFX9-W64-NEXT: .LBB35_3: ; %if ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 @@ -2162,7 +2162,7 @@ ; GFX10-W32-NEXT: s_cbranch_execz .LBB35_3 ; GFX10-W32-NEXT: s_branch .LBB35_4 ; GFX10-W32-NEXT: .LBB35_2: -; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX10-W32-NEXT: ; implicit-def: $vgpr3 ; GFX10-W32-NEXT: .LBB35_3: ; %if ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0