diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3237,7 +3237,7 @@
   //
   // i16 (trunc (srl i64:x, K)), K <= 16 ->
   //     i16 (trunc (srl (i32 (trunc x), K)))
-  if (VT.getScalarSizeInBits() < 32) {
+  if (VT.getScalarSizeInBits() == 16) {
     EVT SrcVT = Src.getValueType();
     if (SrcVT.getScalarSizeInBits() > 32 &&
         (Src.getOpcode() == ISD::SRL ||
diff --git a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
--- a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
@@ -149,3 +149,26 @@
   %trunc = trunc i64 %shift to i16
   ret i16 %trunc
 }
+
+; Checks that we don't blindly apply the combine on anything <32.
+; It's completely possible to trunc to weird integer types like i26
+; as an intermediate step of a bigger computation.
+;
+; Thus, we should have an alignbit here and not a lshrrev
+define i32 @trunc_srl_i64_25_to_i26(i64 %x) {
+; GCN-LABEL: trunc_srl_i64_25_to_i26:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 0xa000000, v0
+; GCN-NEXT:    v_alignbit_b32 v0, 0, v0, 25
+; GCN-NEXT:    v_add_u32_e32 v0, 55, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %value.knownbits2 = and i64 %x, 167772160 ; 0xA000000
+  %shift = lshr i64 %value.knownbits2, 25
+  %trunc = trunc i64 %shift to i26
+  %add = add i26 %trunc, 55
+  %ext = zext i26 %add to i32
+  ret i32 %ext
+}
+
+
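
Note for reviewers (not part of the patch): a minimal sketch of my reading of why the old `< 32` guard was unsound for result types wider than 16 bits. An iN trunc of a 64-bit shift by K needs bits [K, K+N) of the source, but once the shift is shrunk to 32 bits only bits [K, 32) survive, so the rewrite is only safe when K + N <= 32. The existing shift-amount check (K <= N) guarantees that for N == 16, but not for N == 26 with K == 25, which the combine would previously have accepted:

; Rewrite the old guard would have allowed (illustrative sketch, not taken from the patch):
;
;   %shift = lshr i64 %x, 25
;   %trunc = trunc i64 %shift to i26      ; needs bits 25..50 of %x
; =>
;   %lo     = trunc i64 %x to i32
;   %shift2 = lshr i32 %lo, 25
;   %trunc2 = trunc i32 %shift2 to i26    ; only bits 25..31 of %x remain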