diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1564,9 +1564,11 @@
 // Return a suitable opcode for extending the operands of Opc when widening.
 static unsigned getExtendOp(unsigned Opc) {
   switch (Opc) {
+  case TargetOpcode::G_ASHR:
   case TargetOpcode::G_SMIN:
   case TargetOpcode::G_SMAX:
     return TargetOpcode::G_SEXT;
+  case TargetOpcode::G_LSHR:
   case TargetOpcode::G_UMIN:
   case TargetOpcode::G_UMAX:
     return TargetOpcode::G_ZEXT;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -560,11 +560,13 @@
 ;
 ; GFX9-LABEL: s_ashr_v2i16:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshr_b32 s2, s0, 16
-; GFX9-NEXT: s_lshr_b32 s3, s1, 16
+; GFX9-NEXT: s_sext_i32_i16 s2, s0
+; GFX9-NEXT: s_sext_i32_i16 s3, s1
+; GFX9-NEXT: s_ashr_i32 s0, s0, 16
+; GFX9-NEXT: s_ashr_i32 s1, s1, 16
+; GFX9-NEXT: s_ashr_i32 s2, s2, s3
 ; GFX9-NEXT: s_ashr_i32 s0, s0, s1
-; GFX9-NEXT: s_ashr_i32 s1, s2, s3
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s0
 ; GFX9-NEXT: ; return to shader part epilog
   %result = ashr <2 x i16> %value, %amount
   %cast = bitcast <2 x i16> %result to i32
@@ -754,16 +756,20 @@
 ;
 ; GFX9-LABEL: s_ashr_v4i16:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshr_b32 s4, s0, 16
-; GFX9-NEXT: s_lshr_b32 s5, s2, 16
+; GFX9-NEXT: s_sext_i32_i16 s4, s0
+; GFX9-NEXT: s_sext_i32_i16 s5, s2
+; GFX9-NEXT: s_ashr_i32 s0, s0, 16
+; GFX9-NEXT: s_ashr_i32 s2, s2, 16
 ; GFX9-NEXT: s_ashr_i32 s0, s0, s2
-; GFX9-NEXT: s_ashr_i32 s2, s4, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX9-NEXT: s_lshr_b32 s2, s1, 16
-; GFX9-NEXT: s_lshr_b32 s4, s3, 16
-; GFX9-NEXT: s_ashr_i32 s1, s1, s3
+; GFX9-NEXT: s_ashr_i32 s4, s4, s5
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s0
+; GFX9-NEXT: s_sext_i32_i16 s2, s1
+; GFX9-NEXT: s_sext_i32_i16 s4, s3
+; GFX9-NEXT: s_ashr_i32 s1, s1, 16
+; GFX9-NEXT: s_ashr_i32 s3, s3, 16
 ; GFX9-NEXT: s_ashr_i32 s2, s2, s4
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
+; GFX9-NEXT: s_ashr_i32 s1, s1, s3
+; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s1
 ; GFX9-NEXT: ; return to shader part epilog
   %result = ashr <4 x i16> %value, %amount
   %cast = bitcast <4 x i16> %result to <2 x i32>
@@ -968,26 +974,34 @@
 ;
 ; GFX9-LABEL: s_ashr_v8i16:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshr_b32 s8, s0, 16
-; GFX9-NEXT: s_lshr_b32 s9, s4, 16
+; GFX9-NEXT: s_sext_i32_i16 s8, s0
+; GFX9-NEXT: s_sext_i32_i16 s9, s4
+; GFX9-NEXT: s_ashr_i32 s0, s0, 16
+; GFX9-NEXT: s_ashr_i32 s4, s4, 16
 ; GFX9-NEXT: s_ashr_i32 s0, s0, s4
-; GFX9-NEXT: s_ashr_i32 s4, s8, s9
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX9-NEXT: s_lshr_b32 s4, s1, 16
-; GFX9-NEXT: s_lshr_b32 s8, s5, 16
+; GFX9-NEXT: s_ashr_i32 s8, s8, s9
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s8, s0
+; GFX9-NEXT: s_sext_i32_i16 s4, s1
+; GFX9-NEXT: s_sext_i32_i16 s8, s5
+; GFX9-NEXT: s_ashr_i32 s1, s1, 16
+; GFX9-NEXT: s_ashr_i32 s5, s5, 16
 ; GFX9-NEXT: s_ashr_i32 s1, s1, s5
 ; GFX9-NEXT: s_ashr_i32 s4, s4, s8
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX9-NEXT: s_lshr_b32 s4, s2, 16
-; GFX9-NEXT: s_lshr_b32 s5, s6, 16
+; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s1
+; GFX9-NEXT: s_sext_i32_i16 s4, s2
+; GFX9-NEXT: s_sext_i32_i16 s5, s6
+; GFX9-NEXT: s_ashr_i32 s2, s2, 16
+; GFX9-NEXT: s_ashr_i32 s6, s6, 16
 ; GFX9-NEXT: s_ashr_i32 s4, s4, s5
 ; GFX9-NEXT: s_ashr_i32 s2, s2, s6
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
-; GFX9-NEXT: s_lshr_b32 s4, s3, 16
-; GFX9-NEXT: s_lshr_b32 s5, s7, 16
-; GFX9-NEXT: s_ashr_i32 s3, s3, s7
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s2
+; GFX9-NEXT: s_sext_i32_i16 s4, s3
+; GFX9-NEXT: s_sext_i32_i16 s5, s7
+; GFX9-NEXT: s_ashr_i32 s3, s3, 16
+; GFX9-NEXT: s_ashr_i32 s6, s7, 16
 ; GFX9-NEXT: s_ashr_i32 s4, s4, s5
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
+; GFX9-NEXT: s_ashr_i32 s3, s3, s6
+; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3
 ; GFX9-NEXT: ; return to shader part epilog
   %result = ashr <8 x i16> %value, %amount
   %cast = bitcast <8 x i16> %result to <4 x i32>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -569,10 +569,13 @@
 ;
 ; GFX9-LABEL: s_lshr_v2i16:
 ; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s3, 0xffff
 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16
-; GFX9-NEXT: s_lshr_b32 s3, s1, 16
+; GFX9-NEXT: s_lshr_b32 s4, s1, 16
+; GFX9-NEXT: s_and_b32 s0, s0, s3
+; GFX9-NEXT: s_and_b32 s1, s1, s3
 ; GFX9-NEXT: s_lshr_b32 s0, s0, s1
-; GFX9-NEXT: s_lshr_b32 s1, s2, s3
+; GFX9-NEXT: s_lshr_b32 s1, s2, s4
 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
 ; GFX9-NEXT: ; return to shader part epilog
   %result = lshr <2 x i16> %value, %amount
@@ -747,13 +750,18 @@
 ;
 ; GFX9-LABEL: s_lshr_v4i16:
 ; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s5, 0xffff
 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16
-; GFX9-NEXT: s_lshr_b32 s5, s2, 16
+; GFX9-NEXT: s_lshr_b32 s6, s2, 16
+; GFX9-NEXT: s_and_b32 s0, s0, s5
+; GFX9-NEXT: s_and_b32 s2, s2, s5
 ; GFX9-NEXT: s_lshr_b32 s0, s0, s2
-; GFX9-NEXT: s_lshr_b32 s2, s4, s5
+; GFX9-NEXT: s_lshr_b32 s2, s4, s6
 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
 ; GFX9-NEXT: s_lshr_b32 s2, s1, 16
 ; GFX9-NEXT: s_lshr_b32 s4, s3, 16
+; GFX9-NEXT: s_and_b32 s1, s1, s5
+; GFX9-NEXT: s_and_b32 s3, s3, s5
 ; GFX9-NEXT: s_lshr_b32 s1, s1, s3
 ; GFX9-NEXT: s_lshr_b32 s2, s2, s4
 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
@@ -937,24 +945,33 @@
 ;
 ; GFX9-LABEL: s_lshr_v8i16:
 ; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s9, 0xffff
 ; GFX9-NEXT: s_lshr_b32 s8, s0, 16
-; GFX9-NEXT: s_lshr_b32 s9, s4, 16
+; GFX9-NEXT: s_lshr_b32 s10, s4, 16
+; GFX9-NEXT: s_and_b32 s0, s0, s9
+; GFX9-NEXT: s_and_b32 s4, s4, s9
 ; GFX9-NEXT: s_lshr_b32 s0, s0, s4
-; GFX9-NEXT: s_lshr_b32 s4, s8, s9
+; GFX9-NEXT: s_lshr_b32 s4, s8, s10
 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
 ; GFX9-NEXT: s_lshr_b32 s4, s1, 16
 ; GFX9-NEXT: s_lshr_b32 s8, s5, 16
+; GFX9-NEXT: s_and_b32 s1, s1, s9
+; GFX9-NEXT: s_and_b32 s5, s5, s9
 ; GFX9-NEXT: s_lshr_b32 s1, s1, s5
 ; GFX9-NEXT: s_lshr_b32 s4, s4, s8
 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4
 ; GFX9-NEXT: s_lshr_b32 s4, s2, 16
 ; GFX9-NEXT: s_lshr_b32 s5, s6, 16
-; GFX9-NEXT: s_lshr_b32 s4, s4, s5
+; GFX9-NEXT: s_and_b32 s2, s2, s9
+; GFX9-NEXT: s_and_b32 s6, s6, s9
 ; GFX9-NEXT: s_lshr_b32 s2, s2, s6
+; GFX9-NEXT: s_lshr_b32 s4, s4, s5
 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
 ; GFX9-NEXT: s_lshr_b32 s4, s3, 16
 ; GFX9-NEXT: s_lshr_b32 s5, s7, 16
-; GFX9-NEXT: s_lshr_b32 s3, s3, s7
+; GFX9-NEXT: s_and_b32 s3, s3, s9
+; GFX9-NEXT: s_and_b32 s6, s7, s9
+; GFX9-NEXT: s_lshr_b32 s3, s3, s6
 ; GFX9-NEXT: s_lshr_b32 s4, s4, s5
 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
 ; GFX9-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ashr.mir
@@ -183,14 +183,16 @@
     ; CHECK: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr1
     ; CHECK: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; CHECK: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[BITCAST]], 16
     ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
-    ; CHECK: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; CHECK: [[ASHR:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST]], [[C]](s32)
     ; CHECK: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>)
+    ; CHECK: [[SEXT_INREG1:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[BITCAST1]], 16
     ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
-    ; CHECK: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
-    ; CHECK: [[ASHR:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST]], [[BITCAST1]](s32)
-    ; CHECK: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[LSHR]], [[LSHR1]](s32)
-    ; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ASHR]](s32), [[ASHR1]](s32)
+    ; CHECK: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C1]](s32)
+    ; CHECK: [[ASHR2:%[0-9]+]]:sgpr(s32) = G_ASHR [[SEXT_INREG]], [[SEXT_INREG1]](s32)
+    ; CHECK: [[ASHR3:%[0-9]+]]:sgpr(s32) = G_ASHR [[ASHR]], [[ASHR1]](s32)
+    ; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ASHR2]](s32), [[ASHR3]](s32)
     ; CHECK: S_ENDPGM 0, implicit [[BUILD_VECTOR_TRUNC]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $sgpr0
     %1:_(<2 x s16>) = COPY $sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-lshr.mir
@@ -185,10 +185,14 @@
     ; CHECK: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY]](<2 x s16>)
     ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
     ; CHECK: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+    ; CHECK: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C1]]
     ; CHECK: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
-    ; CHECK: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
-    ; CHECK: [[LSHR2:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[BITCAST1]](s32)
+    ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C2]](s32)
+    ; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+    ; CHECK: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C3]]
+    ; CHECK: [[LSHR2:%[0-9]+]]:sgpr(s32) = G_LSHR [[AND]], [[AND1]](s32)
     ; CHECK: [[LSHR3:%[0-9]+]]:sgpr(s32) = G_LSHR [[LSHR]], [[LSHR1]](s32)
     ; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32)
     ; CHECK: S_ENDPGM 0, implicit [[BUILD_VECTOR_TRUNC]](<2 x s16>)
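
Note on the getExtendOp change above (illustration only, not part of the patch): when a 16-bit shift is widened to a 32-bit SALU operation, the value operand of G_ASHR must be sign-extended and that of G_LSHR zero-extended, otherwise the bits shifted in from above bit 15 are wrong. That is what the new s_sext_i32_i16 and s_and_b32 ... 0xffff instructions in the GFX9 checks provide. A minimal C++ sketch of the invariant follows; the helper names ashr16_widened and lshr16_widened are made up for this example.

// Illustrative sketch only -- not part of the patch. Helper names are made up.
// Shows why a widened 16-bit arithmetic shift needs a sign-extended source
// (G_SEXT) while a widened 16-bit logical shift needs a zero-extended source
// (G_ZEXT), mirroring the new getExtendOp cases above.
#include <cassert>
#include <cstdint>

// Perform a 16-bit ashr in 32 bits: sign-extend so the bits shifted in from
// above bit 15 are copies of the original sign bit. (Assumes the usual
// arithmetic right shift of a signed int.)
static int16_t ashr16_widened(int16_t v, unsigned amt) {
  int32_t wide = v; // sign extension, as G_SEXT would produce
  return static_cast<int16_t>(wide >> amt);
}

// Perform a 16-bit lshr in 32 bits: zero-extend so the bits shifted in from
// above bit 15 are zeros rather than stale high bits.
static uint16_t lshr16_widened(uint16_t v, unsigned amt) {
  uint32_t wide = v; // zero extension, as G_ZEXT would produce
  return static_cast<uint16_t>(wide >> amt);
}

int main() {
  // 0x8000 >> 4: the arithmetic shift keeps the sign, the logical shift
  // fills with zeros.
  assert(ashr16_widened(INT16_MIN, 4) == -2048);  // 0xF800
  assert(lshr16_widened(0x8000u, 4) == 0x0800u);  // 0x0800
  return 0;
}

The same reasoning applies to G_SMIN/G_SMAX and G_UMIN/G_UMAX, which already map to G_SEXT and G_ZEXT respectively in getExtendOp.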