Index: llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -137,9 +137,11 @@
   // about in practice.
   LLT Ty = MRI.getType(DstReg);
   if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
-    const APInt Mask = APInt::getHighBitsSet(32, 24);
-    return Helper.getKnownBits()->maskedValueIsZero(MI.getOperand(1).getReg(),
-                                                    Mask);
+    Register SrcReg = MI.getOperand(1).getReg();
+    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
+    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
+    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
+    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
   }
 
   return false;
@@ -151,14 +153,18 @@
   const LLT S32 = LLT::scalar(32);
 
   Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
   LLT Ty = B.getMRI()->getType(DstReg);
+  LLT SrcTy = B.getMRI()->getType(SrcReg);
+  if (SrcTy != S32)
+    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
 
   if (Ty == S32) {
     B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
-                 {MI.getOperand(1)}, MI.getFlags());
+                 {SrcReg}, MI.getFlags());
   } else {
     auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
-                             {MI.getOperand(1)}, MI.getFlags());
+                             {SrcReg}, MI.getFlags());
     B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
   }
 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir
@@ -173,3 +173,95 @@
     %4:_(s32) = G_ANYEXT %3
     $vgpr0 = COPY %4
 ...
+
+---
+name: uitofp_s64_char_to_f32
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: uitofp_s64_char_to_f32
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255
+    ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64)
+    ; CHECK: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[TRUNC]]
+    ; CHECK: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = G_CONSTANT i64 255
+    %2:_(s64) = G_AND %0, %1
+    %3:_(s32) = G_UITOFP %2
+    $vgpr0 = COPY %3
+...
+
+---
+name: sitofp_s64_char_to_f32
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: sitofp_s64_char_to_f32
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255
+    ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64)
+    ; CHECK: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[TRUNC]]
+    ; CHECK: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = G_CONSTANT i64 255
+    %2:_(s64) = G_AND %0, %1
+    %3:_(s32) = G_SITOFP %2
+    $vgpr0 = COPY %3
+...
+
+---
+name: uitofp_s16_char_to_f32
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: uitofp_s16_char_to_f32
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; CHECK: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]]
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AND]](s16)
+    ; CHECK: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[ANYEXT]]
+    ; CHECK: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s16) = G_TRUNC %0
+    %2:_(s16) = G_CONSTANT i16 255
+    %3:_(s16) = G_AND %1, %2
+    %4:_(s32) = G_UITOFP %3
+    $vgpr0 = COPY %4
+...
+
+---
+name: sitofp_s16_char_to_f32
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: sitofp_s16_char_to_f32
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; CHECK: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]]
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AND]](s16)
+    ; CHECK: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[ANYEXT]]
+    ; CHECK: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s16) = G_TRUNC %0
+    %2:_(s16) = G_CONSTANT i16 255
+    %3:_(s16) = G_AND %1, %2
+    %4:_(s32) = G_SITOFP %3
+    $vgpr0 = COPY %4
+...
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -1142,3 +1142,187 @@
   store float %add, float addrspace(1)* %out
   ret void
 }
+
+define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
+; SI-LABEL: v_test_sitofp_i64_byte_to_f32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_movk_i32 s6, 0xff
+; SI-NEXT:    v_and_b32_e32 v2, s6, v0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0, v2
+; SI-NEXT:    v_ffbh_u32_e32 v4, v2
+; SI-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
+; SI-NEXT:    v_ffbh_u32_e32 v5, v3
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; SI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; SI-NEXT:    v_mov_b32_e32 v5, 0xbe
+; SI-NEXT:    v_sub_i32_e32 v6, vcc, v5, v4
+; SI-NEXT:    v_lshl_b64 v[4:5], v[2:3], v4
+; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SI-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v5
+; SI-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
+; SI-NEXT:    v_and_b32_e32 v5, s6, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 23, v2
+; SI-NEXT:    s_mov_b32 s4, 0
+; SI-NEXT:    s_movk_i32 s5, 0x80
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
+; SI-NEXT:    v_and_b32_e32 v3, 1, v2
+; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
+; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[4:5]
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    v_cndmask_b32_e64 v3, v3, 1, vcc
+; SI-NEXT:    v_mov_b32_e32 v1, v0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SI-NEXT:    v_cndmask_b32_e64 v0, v2, -v2, vcc
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_test_sitofp_i64_byte_to_f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_movk_i32 s6, 0xff
+; VI-NEXT:    v_and_b32_e32 v2, s6, v0
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 0, v2
+; VI-NEXT:    v_ffbh_u32_e32 v4, v2
+; VI-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v4
+; VI-NEXT:    v_ffbh_u32_e32 v5, v3
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; VI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; VI-NEXT:    v_mov_b32_e32 v5, 0xbe
+; VI-NEXT:    v_sub_u32_e32 v6, vcc, v5, v4
+; VI-NEXT:    v_lshlrev_b64 v[4:5], v4, v[2:3]
+; VI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; VI-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v5
+; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
+; VI-NEXT:    v_and_b32_e32 v5, s6, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 23, v2
+; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:    s_movk_i32 s5, 0x80
+; VI-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
+; VI-NEXT:    v_and_b32_e32 v3, 1, v2
+; VI-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
+; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[4:5]
+; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    v_cndmask_b32_e64 v3, v3, 1, vcc
+; VI-NEXT:    v_mov_b32_e32 v1, v0
+; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
+; VI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, -v2, vcc
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %masked = and i64 %arg0, 255
+  %itofp = sitofp i64 %masked to float
+  ret float %itofp
+}
+
+define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
+; SI-LABEL: v_test_uitofp_i64_byte_to_f32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_movk_i32 s4, 0xff
+; SI-NEXT:    v_and_b32_e32 v0, s4, v0
+; SI-NEXT:    v_ffbh_u32_e32 v2, v0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
+; SI-NEXT:    v_ffbh_u32_e32 v3, 0
+; SI-NEXT:    v_cmp_eq_u32_e64 vcc, 0, 0
+; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; SI-NEXT:    v_mov_b32_e32 v3, 0xbe
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    v_sub_i32_e32 v4, vcc, v3, v2
+; SI-NEXT:    v_lshl_b64 v[2:3], v[0:1], v2
+; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v3
+; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; SI-NEXT:    v_and_b32_e32 v3, s4, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
+; SI-NEXT:    s_mov_b32 s4, 0
+; SI-NEXT:    s_movk_i32 s5, 0x80
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
+; SI-NEXT:    v_and_b32_e32 v1, 1, v0
+; SI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e64 v1, v1, 1, vcc
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_test_uitofp_i64_byte_to_f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_movk_i32 s4, 0xff
+; VI-NEXT:    v_and_b32_e32 v0, s4, v0
+; VI-NEXT:    v_ffbh_u32_e32 v2, v0
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 32, v2
+; VI-NEXT:    v_ffbh_u32_e32 v3, 0
+; VI-NEXT:    v_cmp_eq_u32_e64 vcc, 0, 0
+; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, 0xbe
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_sub_u32_e32 v4, vcc, v3, v2
+; VI-NEXT:    v_lshlrev_b64 v[2:3], v2, v[0:1]
+; VI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; VI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v3
+; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; VI-NEXT:    v_and_b32_e32 v3, s4, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
+; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:    s_movk_i32 s5, 0x80
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
+; VI-NEXT:    v_and_b32_e32 v1, 1, v0
+; VI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
+; VI-NEXT:    v_cndmask_b32_e64 v1, v1, 1, vcc
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %masked = and i64 %arg0, 255
+  %itofp = uitofp i64 %masked to float
+  ret float %itofp
+}
+
+define float @v_test_sitofp_i16_byte_to_f32(i16 %arg0) {
+; SI-LABEL: v_test_sitofp_i16_byte_to_f32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_test_sitofp_i16_byte_to_f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %masked = and i16 %arg0, 255
+  %itofp = sitofp i16 %masked to float
+  ret float %itofp
+}
+
+define float @v_test_uitofp_i16_byte_to_f32(i16 %arg0) {
+; SI-LABEL: v_test_uitofp_i16_byte_to_f32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    v_bfe_u32 v0, v0, 0, 16
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_test_uitofp_i16_byte_to_f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %masked = and i16 %arg0, 255
+  %itofp = uitofp i16 %masked to float
+  ret float %itofp
+}
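
A minimal before/after sketch of the rewrite the updated combine performs, based on the CHECK lines in the new combine-itofp.mir tests; it assumes the mask has already established that all but the low 8 bits of the source are zero, and the virtual register names (%src, %mask, %masked, %trunc, %fp) are illustrative, not taken from the patch:

  ; before the post-legalizer combine
  %mask:_(s64) = G_CONSTANT i64 255
  %masked:_(s64) = G_AND %src, %mask
  %fp:_(s32) = G_UITOFP %masked(s64)

  ; after: the non-s32 source is truncated (or any-extended, for s16) to s32
  ; and fed directly to the byte-conversion pseudo
  %trunc:_(s32) = G_TRUNC %masked(s64)
  %fp:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %trunc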