Index: include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
===================================================================
--- include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -215,6 +215,10 @@
   LegalizeResult lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
 
+  LegalizeResult lowerU64ToF32BitOps(MachineInstr &MI);
+  LegalizeResult lowerUITOFP(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
+  LegalizeResult lowerSITOFP(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
+
   MachineRegisterInfo &MRI;
   const LegalizerInfo &LI;
 
   /// To keep track of changes made by the LegalizerHelper.
Index: lib/CodeGen/GlobalISel/LegalizerHelper.cpp
===================================================================
--- lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1596,6 +1596,10 @@
     MI.eraseFromParent();
     return Legalized;
   }
+  case G_UITOFP:
+    return lowerUITOFP(MI, TypeIdx, Ty);
+  case G_SITOFP:
+    return lowerSITOFP(MI, TypeIdx, Ty);
   }
 }
 
@@ -2969,3 +2973,122 @@
     }
   }
 }
+
+// Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
+// representation.
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Src = MI.getOperand(1).getReg();
+  const LLT S64 = LLT::scalar(64);
+  const LLT S32 = LLT::scalar(32);
+  const LLT S1 = LLT::scalar(1);
+
+  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
+
+  // unsigned cul2f(ulong u) {
+  //   uint lz = clz(u);
+  //   uint e = (u != 0) ? 127U + 63U - lz : 0;
+  //   u = (u << lz) & 0x7fffffffffffffffUL;
+  //   ulong t = u & 0xffffffffffUL;
+  //   uint v = (e << 23) | (uint)(u >> 40);
+  //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
+  //   return as_float(v + r);
+  // }
+
+  auto Zero32 = MIRBuilder.buildConstant(S32, 0);
+  auto Zero64 = MIRBuilder.buildConstant(S64, 0);
+
+  auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);
+
+  auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
+  auto Sub = MIRBuilder.buildSub(S32, K, LZ);
+
+  auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
+  auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);
+
+  auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
+  auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);
+
+  auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);
+
+  auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
+  auto T = MIRBuilder.buildAnd(S64, U, Mask1);
+
+  auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
+  auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
+  auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));
+
+  auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
+  auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
+  auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
+  auto One = MIRBuilder.buildConstant(S32, 1);
+
+  auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
+  auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
+  auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
+  MIRBuilder.buildAdd(Dst, V, R);
+
+  MI.eraseFromParent();
+  return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerUITOFP(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Src = MI.getOperand(1).getReg();
+  LLT DstTy = MRI.getType(Dst);
+  LLT SrcTy = MRI.getType(Src);
+
+  if (SrcTy != LLT::scalar(64))
+    return UnableToLegalize;
+
+  if (DstTy == LLT::scalar(32)) {
+    // TODO: SelectionDAG has several alternative expansions to port which may
+    // be more reasonable depending on the available instructions. If a target
+    // has sitofp, does not have CTLZ, or can efficiently use f64 as an
+    // intermediate type, this is probably worse.
+    return lowerU64ToF32BitOps(MI);
+  }
+
+  return UnableToLegalize;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerSITOFP(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Src = MI.getOperand(1).getReg();
+  LLT DstTy = MRI.getType(Dst);
+  LLT SrcTy = MRI.getType(Src);
+
+  const LLT S64 = LLT::scalar(64);
+  const LLT S32 = LLT::scalar(32);
+  const LLT S1 = LLT::scalar(1);
+
+  if (SrcTy != S64)
+    return UnableToLegalize;
+
+  if (DstTy == S32) {
+    // signed cl2f(long l) {
+    //   long s = l >> 63;
+    //   float r = cul2f((l + s) ^ s);
+    //   return s ? -r : r;
+    // }
+    unsigned L = Src;
+    auto SignBit = MIRBuilder.buildConstant(S64, 63);
+    auto S = MIRBuilder.buildAShr(S64, L, SignBit);
+
+    auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
+    auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
+    auto R = MIRBuilder.buildUITOFP(S32, Xor);
+
+    auto RNeg = MIRBuilder.buildFNeg(S32, R);
+    auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
+                                            MIRBuilder.buildConstant(S64, 0));
+    MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
+    MI.eraseFromParent();
+    return Legalized;
+  }
+
+  return UnableToLegalize;
+}
Index: lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -283,6 +283,7 @@
 
   getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
     .legalFor({{S32, S32}, {S64, S32}})
+    .lowerFor({{S32, S64}})
     .scalarize(0);
 
   getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
Index: test/CodeGen/AMDGPU/GlobalISel/legalize-sitofp.mir
===================================================================
--- test/CodeGen/AMDGPU/GlobalISel/legalize-sitofp.mir
+++ test/CodeGen/AMDGPU/GlobalISel/legalize-sitofp.mir
@@ -66,3 +66,57 @@
     %1:_(<2 x s64>) = G_SITOFP %0
    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
 ...
+
+---
+name: test_sitofp_s64_to_s32
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: test_sitofp_s64_to_s32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 63
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
+    ; CHECK: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY]], [[TRUNC]](s32)
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64)
+    ; CHECK: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+    ; CHECK: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV]], [[UV2]], [[C1]]
+    ; CHECK: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDE1]]
+    ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDE]](s32), [[UADDE2]](s32)
+    ; CHECK: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[MV]], [[ASHR]]
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[XOR]](s64)
+    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 190
+    ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C4]], [[CTLZ_ZERO_UNDEF]]
+    ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[XOR]](s64), [[C3]]
+    ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[C2]]
+    ; CHECK: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 9223372036854775807
+    ; CHECK: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[XOR]], [[CTLZ_ZERO_UNDEF]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[SHL]], [[C5]]
+    ; CHECK: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1099511627775
+    ; CHECK: [[AND1:%[0-9]+]]:_(s64) = G_AND [[AND]], [[C6]]
+    ; CHECK: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 40
+    ; CHECK: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[C7]](s64)
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[AND]], [[TRUNC1]](s32)
+    ; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 23
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[SELECT]], [[C8]](s32)
+    ; CHECK: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[TRUNC2]]
+    ; CHECK: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 549755813888
+    ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ugt), [[AND1]](s64), [[C9]]
+    ; CHECK: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND1]](s64), [[C9]]
+    ; CHECK: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[OR]], [[C10]]
+    ; CHECK: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[AND2]], [[C2]]
+    ; CHECK: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C10]], [[SELECT1]]
+    ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[OR]], [[SELECT2]]
+    ; CHECK: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[ADD]]
+    ; CHECK: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[ASHR]](s64), [[C3]]
+    ; CHECK: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[FNEG]], [[ADD]]
+    ; CHECK: $vgpr0 = COPY [[SELECT3]](s32)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_SITOFP %0
+    $vgpr0 = COPY %1
+...
Index: test/CodeGen/AMDGPU/GlobalISel/legalize-uitofp.mir
===================================================================
--- test/CodeGen/AMDGPU/GlobalISel/legalize-uitofp.mir
+++ test/CodeGen/AMDGPU/GlobalISel/legalize-uitofp.mir
@@ -31,3 +31,61 @@
     %1:_(s64) = G_UITOFP %0
     $vgpr0_vgpr1 = COPY %1
 ...
+---
+name: test_uitofp_v2s32_to_v2s32
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: test_uitofp_v2s32_to_v2s32
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+    ; CHECK: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[UV]](s32)
+    ; CHECK: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[UV1]](s32)
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UITOFP]](s32), [[UITOFP1]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    %1:_(<2 x s32>) = G_UITOFP %0
+    $vgpr0_vgpr1 = COPY %1
+...
+
+---
+name: test_uitofp_s64_to_s32
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: test_uitofp_s64_to_s32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[COPY]](s64)
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 190
+    ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[CTLZ_ZERO_UNDEF]]
+    ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](s64), [[C1]]
+    ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB]], [[C]]
+    ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 9223372036854775807
+    ; CHECK: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[CTLZ_ZERO_UNDEF]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[SHL]], [[C3]]
+    ; CHECK: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 1099511627775
+    ; CHECK: [[AND1:%[0-9]+]]:_(s64) = G_AND [[AND]], [[C4]]
+    ; CHECK: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 40
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C5]](s64)
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[AND]], [[TRUNC]](s32)
+    ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 23
+    ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[SELECT]], [[C6]](s32)
+    ; CHECK: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64)
+    ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[TRUNC1]]
+    ; CHECK: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 549755813888
+    ; CHECK: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ugt), [[AND1]](s64), [[C7]]
+    ; CHECK: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND1]](s64), [[C7]]
+    ; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[OR]], [[C8]]
+    ; CHECK: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[AND2]], [[C]]
+    ; CHECK: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C8]], [[SELECT1]]
+    ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[OR]], [[SELECT2]]
+    ; CHECK: $vgpr0 = COPY [[ADD]](s32)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_UITOFP %0
+    $vgpr0 = COPY %1
+...
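
For reference, here is a standalone C++ sketch of the conversion the new
lowering performs, modeled on the cul2f()/cl2f() pseudocode quoted in the
comments above. It is illustrative only and not part of the patch: clz64 and
the main() harness are made-up names, and the explicit u == 0 guard stands in
for the G_CTLZ_ZERO_UNDEF + G_SELECT sequence in the MIR (a 64-bit shift by 64
is undefined in C++). It cross-checks the bit-twiddled result against the
compiler's native conversions, including the round-to-nearest-even tie cases.

// cul2f-sketch.cpp -- reference model for the s32 = G_UITOFP/G_SITOFP s64
// expansion. Build: c++ -std=c++17 cul2f-sketch.cpp && ./a.out
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Portable count-leading-zeros; the MIR uses G_CTLZ_ZERO_UNDEF and handles
// the zero input by selecting a zero exponent instead.
static uint32_t clz64(uint64_t u) {
  uint32_t n = 0;
  for (uint64_t m = 1ull << 63; m != 0 && (u & m) == 0; m >>= 1)
    ++n;
  return n;
}

// Unsigned u64 -> f32, following the cul2f() comment in lowerU64ToF32BitOps.
static float cul2f(uint64_t u) {
  if (u == 0) // Guard: u << 64 is undefined in C++; the MIR also yields 0.
    return 0.0f;
  uint32_t lz = clz64(u);
  uint32_t e = 127u + 63u - lz;          // Biased exponent of the result.
  u = (u << lz) & 0x7fffffffffffffffull; // Normalize; drop the implicit bit.
  uint64_t t = u & 0xffffffffffull;      // Low 40 bits to be rounded off.
  uint32_t v = (e << 23) | (uint32_t)(u >> 40); // Exponent | 23 mantissa bits.
  // Round to nearest, ties to even: 0x8000000000 (1 << 39) is exactly half.
  uint32_t r = t > 0x8000000000ull ? 1u : (t == 0x8000000000ull ? v & 1u : 0u);
  uint32_t bits = v + r;
  float f;
  std::memcpy(&f, &bits, sizeof(f)); // as_float(v + r)
  return f;
}

// Signed s64 -> f32, following the cl2f() comment in lowerSITOFP: convert
// the absolute value, then restore the sign. The add is done in uint64_t
// because l + s overflows signed arithmetic when l == INT64_MIN.
static float cl2f(int64_t l) {
  uint64_t s = (uint64_t)(l >> 63); // All ones if l is negative, else zero.
  float r = cul2f(((uint64_t)l + s) ^ s);
  return s ? -r : r;
}

int main() {
  const int64_t tests[] = {0, 1, -1, 40, (1ll << 40) + 1,
                           INT64_MAX, INT64_MIN};
  for (int64_t t : tests) {
    assert(cul2f((uint64_t)t) == (float)(uint64_t)t);
    assert(cl2f(t) == (float)t);
  }
  std::puts("ok");
}

Note that the rounding increment r is added directly to the packed word v:
when the 23 mantissa bits are all ones, the carry propagates into the exponent
field, which is exactly the IEEE-754 result of rounding up to the next power
of two. The G_ADD at the end of the MIR expansion relies on the same property.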