Index: llvm/lib/Target/AMDGPU/AMDGPUCombine.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -20,6 +20,13 @@
   (apply [{ applySelectFCmpToFMinToFMaxLegacy(*${select}, ${matchinfo}); }])>;
 
+def uchar_to_float : GICombineRule<
+  (defs root:$itofp),
+  (match (wip_match_opcode G_UITOFP, G_SITOFP):$itofp,
+    [{ return matchUCharToFloat(*${itofp}, MRI, *MF, Helper); }]),
+  (apply [{ applyUCharToFloat(*${itofp}); }])>;
+
+
 // Combines which should only apply on SI/VI
 def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
 
@@ -32,6 +39,6 @@
 
 def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
   "AMDGPUGenPostLegalizerCombinerHelper", [all_combines,
-  gfx6gfx7_combines]> {
+  gfx6gfx7_combines, uchar_to_float]> {
   let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
 }
Index: llvm/lib/Target/AMDGPU/AMDGPUGISel.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -153,6 +153,11 @@
 def : GINodeEquiv<G_AMDGPU_FMIN_LEGACY, AMDGPUfmin_legacy>;
 def : GINodeEquiv<G_AMDGPU_FMAX_LEGACY, AMDGPUfmax_legacy>;
+def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE0, AMDGPUcvt_f32_ubyte0>;
+def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE1, AMDGPUcvt_f32_ubyte1>;
+def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE2, AMDGPUcvt_f32_ubyte2>;
+def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>;
+
 def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>;
 def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, SIatomic_inc>;
 def : GINodeEquiv<G_AMDGPU_ATOMIC_DEC, SIatomic_dec>;
 
Index: llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -127,6 +127,43 @@
   MI.eraseFromParent();
 }
 
+static bool matchUCharToFloat(MachineInstr &MI, MachineRegisterInfo &MRI,
+                              MachineFunction &MF, CombinerHelper &Helper) {
+  Register DstReg = MI.getOperand(0).getReg();
+
+  // TODO: We could try to match extracting the higher bytes, which would be
+  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
+  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
+  // about in practice.
+  LLT Ty = MRI.getType(DstReg);
+  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
+    const APInt Mask = APInt::getHighBitsSet(32, 24);
+    return Helper.getKnownBits()->maskedValueIsZero(MI.getOperand(1).getReg(),
+                                                    Mask);
+  }
+
+  return false;
+}
+
+static void applyUCharToFloat(MachineInstr &MI) {
+  MachineIRBuilder B(MI);
+
+  const LLT S32 = LLT::scalar(32);
+
+  Register DstReg = MI.getOperand(0).getReg();
+  LLT Ty = B.getMRI()->getType(DstReg);
+
+  if (Ty == S32) {
+    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
+                 {MI.getOperand(1)}, MI.getFlags());
+  } else {
+    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
+                             {MI.getOperand(1)}, MI.getFlags());
+    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
+  }
+
+  MI.eraseFromParent();
+}
+
 
 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
 #include "AMDGPUGenPostLegalizeGICombiner.inc"
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3288,6 +3288,10 @@
   case AMDGPU::G_AMDGPU_FMIN_LEGACY:
   case AMDGPU::G_AMDGPU_FMAX_LEGACY:
   case AMDGPU::G_AMDGPU_RCP_IFLAG:
+  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
+  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
+  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
+  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
     return getDefaultMappingVOP(MI);
   case AMDGPU::G_UMULH:
   case AMDGPU::G_SMULH: {
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2297,6 +2297,14 @@
   let hasSideEffects = 0;
 }
 
+foreach N = 0-3 in {
+def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src0);
+  let hasSideEffects = 0;
+}
+}
+
 // Atomic cmpxchg. $cmpval ad $newval are packed in a single vector
 // operand Expects a MachineMemOperand in addition to explicit
 // operands.
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir
@@ -0,0 +1,175 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: uitofp_char_to_f32
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: uitofp_char_to_f32
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
+    ; CHECK: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]]
+    ; CHECK: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_CONSTANT i32 255
+    %2:_(s32) = G_AND %0, %1
+    %3:_(s32) = G_UITOFP %2
+    $vgpr0 = COPY %3
+...
+
+---
+name: uitofp_too_many_bits_to_f32
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: uitofp_too_many_bits_to_f32
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 256
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
+    ; CHECK: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND]](s32)
+    ; CHECK: $vgpr0 = COPY [[UITOFP]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_CONSTANT i32 256
+    %2:_(s32) = G_AND %0, %1
+    %3:_(s32) = G_UITOFP %2
+    $vgpr0 = COPY %3
+...
+
+---
+name: sitofp_char_to_f32
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: sitofp_char_to_f32
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
+    ; CHECK: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]]
+    ; CHECK: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_CONSTANT i32 255
+    %2:_(s32) = G_AND %0, %1
+    %3:_(s32) = G_SITOFP %2
+    $vgpr0 = COPY %3
+...
+
+---
+name: sitofp_bits127_to_f32
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: sitofp_bits127_to_f32
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
+    ; CHECK: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]]
+    ; CHECK: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_CONSTANT i32 127
+    %2:_(s32) = G_AND %0, %1
+    %3:_(s32) = G_SITOFP %2
+    $vgpr0 = COPY %3
+...
+
+---
+name: sitofp_bits128_to_f32
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: sitofp_bits128_to_f32
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 128
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
+    ; CHECK: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]]
+    ; CHECK: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_CONSTANT i32 128
+    %2:_(s32) = G_AND %0, %1
+    %3:_(s32) = G_SITOFP %2
+    $vgpr0 = COPY %3
+...
+
+---
+name: sitofp_too_many_bits_to_f32
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: sitofp_too_many_bits_to_f32
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 256
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
+    ; CHECK: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[AND]](s32)
+    ; CHECK: $vgpr0 = COPY [[SITOFP]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_CONSTANT i32 256
+    %2:_(s32) = G_AND %0, %1
+    %3:_(s32) = G_SITOFP %2
+    $vgpr0 = COPY %3
+...
+
+---
+name: uitofp_char_to_f16
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: uitofp_char_to_f16
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
+    ; CHECK: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]]
+    ; CHECK: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[AMDGPU_CVT_F32_UBYTE0_]](s32)
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
+    ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_CONSTANT i32 255
+    %2:_(s32) = G_AND %0, %1
+    %3:_(s16) = G_UITOFP %2
+    %4:_(s32) = G_ANYEXT %3
+    $vgpr0 = COPY %4
+...
+
+---
+name: sitofp_char_to_f16
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: sitofp_char_to_f16
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
+    ; CHECK: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]]
+    ; CHECK: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[AMDGPU_CVT_F32_UBYTE0_]](s32)
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
+    ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_CONSTANT i32 255
+    %2:_(s32) = G_AND %0, %1
+    %3:_(s16) = G_SITOFP %2
+    %4:_(s32) = G_ANYEXT %3
+    $vgpr0 = COPY %4
+...