diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2267,6 +2267,29 @@ return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); } + // Using a single 32-bit SALU to calculate the high half is smaller than + // S_BFE with a literal constant operand. + if (DstSize > 32 && SrcSize == 32) { + Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister; + if (Signed) { + BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg) + .addReg(SrcReg, 0, SubReg) + .addImm(31); + } else { + BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg) + .addImm(0); + } + BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) + .addReg(SrcReg, 0, SubReg) + .addImm(AMDGPU::sub0) + .addReg(HiReg) + .addImm(AMDGPU::sub1); + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, + *MRI); + } + const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; @@ -2275,7 +2298,7 @@ // We need a 64-bit register source, but the high bits don't matter. Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - unsigned SubReg = InReg ? AMDGPU::sub0 : 0; + unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister; BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext-inreg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext-inreg.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext-inreg.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext-inreg.mir @@ -225,10 +225,9 @@ ; GCN: liveins: $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 - ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[DEF]], %subreg.sub1 - ; GCN-NEXT: [[S_BFE_I64_:%[0-9]+]]:sreg_64 = S_BFE_I64 [[REG_SEQUENCE]], 2097152, implicit-def $scc - ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[S_BFE_I64_]] + ; GCN-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[COPY]].sub0, 31, implicit-def $scc + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[S_ASHR_I32_]], %subreg.sub1 + ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[REG_SEQUENCE]] %0:sgpr(s64) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_SEXT_INREG %0, 32 $sgpr0_sgpr1 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir @@ -127,10 +127,9 @@ ; GCN: liveins: $sgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[DEF]], %subreg.sub1 - ; GCN-NEXT: [[S_BFE_I64_:%[0-9]+]]:sreg_64 = S_BFE_I64 [[REG_SEQUENCE]], 2097152, implicit-def $scc - ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[S_BFE_I64_]] + ; GCN-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[COPY]], 31, implicit-def $scc + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[S_ASHR_I32_]], %subreg.sub1 + ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[REG_SEQUENCE]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s64) = G_SEXT %0 $sgpr0_sgpr1 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir @@ -127,10 +127,9 @@ ; GCN: liveins: $sgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[DEF]], %subreg.sub1 - ; GCN-NEXT: [[S_BFE_U64_:%[0-9]+]]:sreg_64 = S_BFE_U64 [[REG_SEQUENCE]], 2097152, implicit-def $scc - ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[S_BFE_U64_]] + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[REG_SEQUENCE]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s64) = G_ZEXT %0 $sgpr0_sgpr1 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -252,11 +252,11 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_sgpr_offset(i32 addrspace(1)* inreg %ptr, i32 inreg %soffset) { ; GFX6-LABEL: mubuf_store_sgpr_ptr_sgpr_offset: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_ashr_i32 s5, s4, 31 +; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000 -; GFX6-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -266,11 +266,11 @@ ; ; GFX7-LABEL: mubuf_store_sgpr_ptr_sgpr_offset: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_ashr_i32 s5, s4, 31 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000 -; GFX7-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -285,8 +285,8 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset(i32 addrspace(1)* %ptr, i32 inreg %soffset) { ; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX6-NEXT: s_ashr_i32 s3, s2, 31 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -295,8 +295,8 @@ ; ; GFX7-LABEL: mubuf_store_vgpr_ptr_sgpr_offset: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX7-NEXT: s_ashr_i32 s3, s2, 31 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -310,8 +310,8 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset_offset256(i32 addrspace(1)* %ptr, i32 inreg %soffset) { ; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset_offset256: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX6-NEXT: s_ashr_i32 s3, s2, 31 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -320,8 +320,8 @@ ; ; GFX7-LABEL: mubuf_store_vgpr_ptr_sgpr_offset_offset256: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX7-NEXT: s_ashr_i32 s3, s2, 31 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -336,8 +336,8 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(i32 addrspace(1)* %ptr, i32 inreg %soffset) { ; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset256_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX6-NEXT: s_ashr_i32 s3, s2, 31 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -346,8 +346,8 @@ ; ; GFX7-LABEL: mubuf_store_vgpr_ptr_sgpr_offset256_offset: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX7-NEXT: s_ashr_i32 s3, s2, 31 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -698,11 +698,11 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(float addrspace(1)* inreg %ptr, i32 inreg %soffset) { ; GFX6-LABEL: mubuf_load_sgpr_ptr_sgpr_offset: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_ashr_i32 s5, s4, 31 +; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000 -; GFX6-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 @@ -712,11 +712,11 @@ ; ; GFX7-LABEL: mubuf_load_sgpr_ptr_sgpr_offset: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_ashr_i32 s5, s4, 31 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000 -; GFX7-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 @@ -731,8 +731,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset(float addrspace(1)* %ptr, i32 inreg %soffset) { ; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX6-NEXT: s_ashr_i32 s3, s2, 31 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc @@ -741,8 +741,8 @@ ; ; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX7-NEXT: s_ashr_i32 s3, s2, 31 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc @@ -756,8 +756,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(float addrspace(1)* %ptr, i32 inreg %soffset) { ; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset_offset256: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX6-NEXT: s_ashr_i32 s3, s2, 31 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc @@ -766,8 +766,8 @@ ; ; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset_offset256: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX7-NEXT: s_ashr_i32 s3, s2, 31 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc @@ -782,8 +782,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspace(1)* %ptr, i32 inreg %soffset) { ; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset256_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX6-NEXT: s_ashr_i32 s3, s2, 31 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc @@ -792,8 +792,8 @@ ; ; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset256_offset: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX7-NEXT: s_ashr_i32 s3, s2, 31 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -120,14 +120,14 @@ ; GCN-LABEL: s_shl_i64_zext_i32_overflow: ; GCN: ; %bb.0: ; GCN-NEXT: s_bitset0_b32 s0, 31 -; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 +; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_shl_i64_zext_i32_overflow: ; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s1, 0 ; GFX10PLUS-NEXT: s_bitset0_b32 s0, 31 -; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX10PLUS-NEXT: ; return to shader part epilog %and = and i32 %x, 2147483647 @@ -187,14 +187,14 @@ ; GCN-LABEL: s_shl_i64_sext_i32_overflow: ; GCN: ; %bb.0: ; GCN-NEXT: s_bitset0_b32 s0, 31 -; GCN-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000 +; GCN-NEXT: s_ashr_i32 s1, s0, 31 ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_shl_i64_sext_i32_overflow: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_bitset0_b32 s0, 31 -; GFX10PLUS-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000 +; GFX10PLUS-NEXT: s_ashr_i32 s1, s0, 31 ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX10PLUS-NEXT: ; return to shader part epilog %and = and i32 %x, 2147483647 @@ -434,9 +434,10 @@ ; GCN-NEXT: s_brev_b32 s2, -4 ; GCN-NEXT: s_mov_b32 s3, s2 ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: s_bfe_u64 s[2:3], s[0:1], 0x200000 -; GCN-NEXT: s_mov_b32 s0, s1 -; GCN-NEXT: s_bfe_u64 s[4:5], s[0:1], 0x200000 +; GCN-NEXT: s_mov_b32 s3, 0 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: s_mov_b32 s4, s1 +; GCN-NEXT: s_mov_b32 s5, s3 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 2 ; GCN-NEXT: ; return to shader part epilog @@ -446,11 +447,12 @@ ; GFX10PLUS-NEXT: s_brev_b32 s2, -4 ; GFX10PLUS-NEXT: s_mov_b32 s3, s2 ; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX10PLUS-NEXT: s_mov_b32 s2, s1 -; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 -; GFX10PLUS-NEXT: s_bfe_u64 s[2:3], s[2:3], 0x200000 -; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX10PLUS-NEXT: s_mov_b32 s3, 0 +; GFX10PLUS-NEXT: s_mov_b32 s2, s0 +; GFX10PLUS-NEXT: s_mov_b32 s4, s1 +; GFX10PLUS-NEXT: s_mov_b32 s5, s3 +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[4:5], 2 ; GFX10PLUS-NEXT: ; return to shader part epilog %and = and <2 x i32> %x, %ext = zext <2 x i32> %and to <2 x i64> @@ -525,9 +527,10 @@ ; GCN-NEXT: s_brev_b32 s2, -8 ; GCN-NEXT: s_mov_b32 s3, s2 ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x200000 -; GCN-NEXT: s_mov_b32 s0, s1 -; GCN-NEXT: s_bfe_i64 s[4:5], s[0:1], 0x200000 +; GCN-NEXT: s_ashr_i32 s3, s0, 31 +; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: s_ashr_i32 s5, s1, 31 +; GCN-NEXT: s_mov_b32 s4, s1 ; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 ; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 2 ; GCN-NEXT: ; return to shader part epilog @@ -537,11 +540,12 @@ ; GFX10PLUS-NEXT: s_brev_b32 s2, -8 ; GFX10PLUS-NEXT: s_mov_b32 s3, s2 ; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX10PLUS-NEXT: s_mov_b32 s2, s1 -; GFX10PLUS-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000 -; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x200000 -; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX10PLUS-NEXT: s_ashr_i32 s3, s0, 31 +; GFX10PLUS-NEXT: s_mov_b32 s2, s0 +; GFX10PLUS-NEXT: s_ashr_i32 s5, s1, 31 +; GFX10PLUS-NEXT: s_mov_b32 s4, s1 +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[4:5], 2 ; GFX10PLUS-NEXT: ; return to shader part epilog %and = and <2 x i32> %x, %ext = sext <2 x i32> %and to <2 x i64> diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -664,11 +664,11 @@ ; GFX10-GISEL-NEXT: s_clause 0x1 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c ; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 -; GFX10-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -448,10 +448,10 @@ ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] -; GFX9-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -572,11 +572,11 @@ ; GFX10-GISEL-NEXT: s_clause 0x1 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c ; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[2:3] ; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 -; GFX10-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -549,12 +549,12 @@ ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s2, s[2:3] -; GFX9-GISEL-NEXT: s_bfe_u64 s[2:3], s[2:3], 0x200000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s4, s[2:3] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone