diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2093,6 +2093,29 @@ return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); } + // Using a single 32-bit SALU to calculate the high half is smaller than + // S_BFE with a literal constant operand. + if (DstSize > 32 && SrcSize == 32) { + Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister; + if (Signed) { + BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg) + .addReg(SrcReg, 0, SubReg) + .addImm(31); + } else { + BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg) + .addImm(0); + } + BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) + .addReg(SrcReg, 0, SubReg) + .addImm(AMDGPU::sub0) + .addReg(HiReg) + .addImm(AMDGPU::sub1); + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, + *MRI); + } + const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; @@ -2101,7 +2124,7 @@ // We need a 64-bit register source, but the high bits don't matter. Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - unsigned SubReg = InReg ? AMDGPU::sub0 : 0; + unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister; BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext-inreg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext-inreg.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext-inreg.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext-inreg.mir @@ -203,10 +203,9 @@ ; GCN-LABEL: name: sext_inreg_sgpr_s64_32 ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GCN: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[DEF]], %subreg.sub1 - ; GCN: [[S_BFE_I64_:%[0-9]+]]:sreg_64 = S_BFE_I64 [[REG_SEQUENCE]], 2097152, implicit-def $scc - ; GCN: $sgpr0_sgpr1 = COPY [[S_BFE_I64_]] + ; GCN: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[COPY]].sub0, 31, implicit-def $scc + ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[S_ASHR_I32_]], %subreg.sub1 + ; GCN: $sgpr0_sgpr1 = COPY [[REG_SEQUENCE]] %0:sgpr(s64) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_SEXT_INREG %0, 32 $sgpr0_sgpr1 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -252,11 +252,11 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_sgpr_offset(i32 addrspace(1)* inreg %ptr, i32 inreg %soffset) { ; GFX6-LABEL: mubuf_store_sgpr_ptr_sgpr_offset: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_ashr_i32 s5, s4, 31 +; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000 -; GFX6-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -266,11 +266,11 @@ ; ; GFX7-LABEL: mubuf_store_sgpr_ptr_sgpr_offset: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_ashr_i32 s5, s4, 31 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000 -; GFX7-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -285,8 +285,8 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset(i32 addrspace(1)* %ptr, i32 inreg %soffset) { ; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX6-NEXT: s_ashr_i32 s3, s2, 31 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -295,8 +295,8 @@ ; ; GFX7-LABEL: mubuf_store_vgpr_ptr_sgpr_offset: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX7-NEXT: s_ashr_i32 s3, s2, 31 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -310,8 +310,8 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset_offset256(i32 addrspace(1)* %ptr, i32 inreg %soffset) { ; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset_offset256: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX6-NEXT: s_ashr_i32 s3, s2, 31 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -320,8 +320,8 @@ ; ; GFX7-LABEL: mubuf_store_vgpr_ptr_sgpr_offset_offset256: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX7-NEXT: s_ashr_i32 s3, s2, 31 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -338,11 +338,11 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b64 s[4:5], 0x400 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: s_ashr_i32 s3, s2, 31 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, s5 @@ -353,11 +353,11 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b64 s[4:5], 0x400 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: s_ashr_i32 s3, s2, 31 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, s5 @@ -706,11 +706,11 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(float addrspace(1)* inreg %ptr, i32 inreg %soffset) { ; GFX6-LABEL: mubuf_load_sgpr_ptr_sgpr_offset: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_ashr_i32 s5, s4, 31 +; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000 -; GFX6-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 @@ -720,11 +720,11 @@ ; ; GFX7-LABEL: mubuf_load_sgpr_ptr_sgpr_offset: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_ashr_i32 s5, s4, 31 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000 -; GFX7-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -739,8 +739,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset(float addrspace(1)* %ptr, i32 inreg %soffset) { ; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX6-NEXT: s_ashr_i32 s3, s2, 31 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc @@ -749,8 +749,8 @@ ; ; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX7-NEXT: s_ashr_i32 s3, s2, 31 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc @@ -764,8 +764,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(float addrspace(1)* %ptr, i32 inreg %soffset) { ; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset_offset256: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX6-NEXT: s_ashr_i32 s3, s2, 31 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc @@ -774,8 +774,8 @@ ; ; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset_offset256: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX7-NEXT: s_ashr_i32 s3, s2, 31 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc @@ -792,11 +792,11 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b64 s[4:5], 0x400 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 +; GFX6-NEXT: s_ashr_i32 s3, s2, 31 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, s5 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc @@ -807,11 +807,11 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b64 s[4:5], 0x400 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 +; GFX7-NEXT: s_ashr_i32 s3, s2, 31 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, s5 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -96,14 +96,14 @@ ; GCN-LABEL: s_shl_i64_zext_i32_overflow: ; GCN: ; %bb.0: ; GCN-NEXT: s_bitset0_b32 s0, 31 -; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 +; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_shl_i64_zext_i32_overflow: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: s_bitset0_b32 s0, 31 -; GFX10-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX10-NEXT: ; return to shader part epilog %and = and i32 %x, 2147483647 @@ -155,14 +155,14 @@ ; GCN-LABEL: s_shl_i64_sext_i32_overflow: ; GCN: ; %bb.0: ; GCN-NEXT: s_bitset0_b32 s0, 31 -; GCN-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000 +; GCN-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_shl_i64_sext_i32_overflow: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_bitset0_b32 s0, 31 -; GFX10-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000 +; GFX10-NEXT: s_ashr_i32 s1, s0, 31 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX10-NEXT: ; return to shader part epilog %and = and i32 %x, 2147483647 @@ -373,9 +373,10 @@ ; GCN-NEXT: s_brev_b32 s2, -4 ; GCN-NEXT: s_mov_b32 s3, s2 ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: s_bfe_u64 s[2:3], s[0:1], 0x200000 -; GCN-NEXT: s_mov_b32 s0, s1 -; GCN-NEXT: s_bfe_u64 s[4:5], s[0:1], 0x200000 +; GCN-NEXT: s_mov_b32 s3, 0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: s_mov_b32 s4, s1 +; GCN-NEXT: s_mov_b32 s5, s3 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 2 ; GCN-NEXT: ; return to shader part epilog @@ -385,11 +386,12 @@ ; GFX10-NEXT: s_brev_b32 s2, -4 ; GFX10-NEXT: s_mov_b32 s3, s2 ; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_mov_b32 s2, s1 -; GFX10-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 -; GFX10-NEXT: s_bfe_u64 s[2:3], s[2:3], 0x200000 -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_mov_b32 s2, s0 +; GFX10-NEXT: s_mov_b32 s4, s1 +; GFX10-NEXT: s_mov_b32 s5, s3 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[4:5], 2 ; GFX10-NEXT: ; return to shader part epilog %and = and <2 x i32> %x, %ext = zext <2 x i32> %and to <2 x i64> @@ -458,9 +460,10 @@ ; GCN-NEXT: s_brev_b32 s2, -8 ; GCN-NEXT: s_mov_b32 s3, s2 ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x200000 -; GCN-NEXT: s_mov_b32 s0, s1 -; GCN-NEXT: s_bfe_i64 s[4:5], s[0:1], 0x200000 +; GCN-NEXT: s_ashr_i32 s3, s0, 31 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: s_ashr_i32 s5, s1, 31 +; GCN-NEXT: s_mov_b32 s4, s1 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 2 ; GCN-NEXT: ; return to shader part epilog @@ -470,11 +473,12 @@ ; GFX10-NEXT: s_brev_b32 s2, -8 ; GFX10-NEXT: s_mov_b32 s3, s2 ; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_mov_b32 s2, s1 -; GFX10-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000 -; GFX10-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x200000 -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX10-NEXT: s_ashr_i32 s3, s0, 31 +; GFX10-NEXT: s_mov_b32 s2, s0 +; GFX10-NEXT: s_ashr_i32 s5, s1, 31 +; GFX10-NEXT: s_mov_b32 s4, s1 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[4:5], 2 ; GFX10-NEXT: ; return to shader part epilog %and = and <2 x i32> %x, %ext = sext <2 x i32> %and to <2 x i64> diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -582,11 +582,11 @@ ; GFX10-GISEL-NEXT: s_clause 0x1 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c ; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 -; GFX10-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -452,10 +452,10 @@ ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; GFX9-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[4:5] -; GFX9-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -575,11 +575,11 @@ ; GFX10-GISEL-NEXT: s_clause 0x1 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c ; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[2:3] ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 -; GFX10-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -553,10 +553,10 @@ ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_ff1_i32_b64 s0, s[4:5] -; GFX9-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]