diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -158,11 +158,22 @@ // 8-bit floating point number following IEEE-754 conventions with bit // layout S1E5M2 as described in https://arxiv.org/abs/2209.05433. S_Float8E5M2, + // 8-bit floating point number mostly following IEEE-754 conventions + // and bit layout S1E5M2 described in [[TODO where did we describe this]] + // with expanded range and with no infinity or signed zero. + // NaN is represented as negative zero. (FN -> Finite, UZ -> unsigned zero) + S_Float8E5M2FNUZ, // 8-bit floating point number mostly following IEEE-754 conventions with // bit layout S1E4M3 as described in https://arxiv.org/abs/2209.05433. // Unlike IEEE-754 types, there are no infinity values, and NaN is // represented with the exponent and mantissa bits set to all 1s. S_Float8E4M3FN, + // 8-bit floating point number mostly following IEEE-754 conventions + // and bit layout S1E4M3 described in [[TODO where did we describe this]] + // with expanded range and with no infinity or signed zero. + // NaN is represented as negative zero. 
(FN -> Finite, UZ -> unsigned zero) + S_Float8E4M3FNUZ, + S_x87DoubleExtended, S_MaxSemantics = S_x87DoubleExtended, }; @@ -177,7 +188,9 @@ static const fltSemantics &IEEEquad() LLVM_READNONE; static const fltSemantics &PPCDoubleDouble() LLVM_READNONE; static const fltSemantics &Float8E5M2() LLVM_READNONE; + static const fltSemantics &Float8E5M2FNUZ() LLVM_READNONE; static const fltSemantics &Float8E4M3FN() LLVM_READNONE; + static const fltSemantics &Float8E4M3FNUZ() LLVM_READNONE; static const fltSemantics &x87DoubleExtended() LLVM_READNONE; /// A Pseudo fltsemantic used to construct APFloats that cannot conflict with @@ -569,7 +582,9 @@ APInt convertF80LongDoubleAPFloatToAPInt() const; APInt convertPPCDoubleDoubleAPFloatToAPInt() const; APInt convertFloat8E5M2APFloatToAPInt() const; + APInt convertFloat8E5M2FNUZAPFloatToAPInt() const; APInt convertFloat8E4M3FNAPFloatToAPInt() const; + APInt convertFloat8E4M3FNUZAPFloatToAPInt() const; void initFromAPInt(const fltSemantics *Sem, const APInt &api); void initFromHalfAPInt(const APInt &api); void initFromBFloatAPInt(const APInt &api); @@ -579,7 +594,9 @@ void initFromF80LongDoubleAPInt(const APInt &api); void initFromPPCDoubleDoubleAPInt(const APInt &api); void initFromFloat8E5M2APInt(const APInt &api); + void initFromFloat8E5M2FNUZAPInt(const APInt &api); void initFromFloat8E4M3FNAPInt(const APInt &api); + void initFromFloat8E4M3FNUZAPInt(const APInt &api); void assign(const IEEEFloat &); void copySignificand(const IEEEFloat &); diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/StringExtras.h" @@ -67,6 +68,21 @@ NanOnly, }; + enum class fltNanEncoding { + // Represents the standard IEEE behavior 
where a value is NaN if its + // exponent is all 1s and the significand is non-zero. + IEEE, + + // Represents the behavior in some 8-bit floating point types where NaN is + // represented by the all-1s value + AllOnes, + + // Represents the behavior in some 8-bit floating point types where NaN is + // represented by a sign bit of 1 and all 0s in the exponent and mantissa + // (i.e. the negative zero encoding in an IEEE float). + NegativeZero, + }; + /* Represents floating point arithmetic semantics. */ struct fltSemantics { /* The largest E such that 2^E is representable; this matches the @@ -86,6 +102,7 @@ fltNonfiniteBehavior nonFiniteBehavior = fltNonfiniteBehavior::IEEE754; + fltNanEncoding nanEncoding = fltNanEncoding::IEEE; // Returns true if any number described by this semantics can be precisely // represented by the specified semantics. Does not take into account // the value of fltNonfiniteBehavior. @@ -101,8 +118,16 @@ static const fltSemantics semIEEEdouble = {1023, -1022, 53, 64}; static const fltSemantics semIEEEquad = {16383, -16382, 113, 128}; static const fltSemantics semFloat8E5M2 = {15, -14, 3, 8}; - static const fltSemantics semFloat8E4M3FN = {8, -6, 4, 8, - fltNonfiniteBehavior::NanOnly}; + static const fltSemantics semFloat8E5M2FNUZ = {15, + -15, + 3, + 8, + fltNonfiniteBehavior::NanOnly, + fltNanEncoding::NegativeZero}; + static const fltSemantics semFloat8E4M3FN = { + 8, -6, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::AllOnes}; + static const fltSemantics semFloat8E4M3FNUZ = { + 7, -7, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero}; static const fltSemantics semX87DoubleExtended = {16383, -16382, 64, 80}; static const fltSemantics semBogus = {0, 0, 0, 0}; @@ -160,8 +185,12 @@ return PPCDoubleDouble(); case S_Float8E5M2: return Float8E5M2(); + case S_Float8E5M2FNUZ: + return Float8E5M2FNUZ(); case S_Float8E4M3FN: return Float8E4M3FN(); + case S_Float8E4M3FNUZ: + return Float8E4M3FNUZ(); case S_x87DoubleExtended: return 
x87DoubleExtended(); } @@ -184,8 +213,12 @@ return S_PPCDoubleDouble; else if (&Sem == &llvm::APFloat::Float8E5M2()) return S_Float8E5M2; + else if (&Sem == &llvm::APFloat::Float8E5M2FNUZ()) + return S_Float8E5M2FNUZ; else if (&Sem == &llvm::APFloat::Float8E4M3FN()) return S_Float8E4M3FN; + else if (&Sem == &llvm::APFloat::Float8E4M3FNUZ()) + return S_Float8E4M3FNUZ; else if (&Sem == &llvm::APFloat::x87DoubleExtended()) return S_x87DoubleExtended; else @@ -209,7 +242,13 @@ return semPPCDoubleDouble; } const fltSemantics &APFloatBase::Float8E5M2() { return semFloat8E5M2; } + const fltSemantics &APFloatBase::Float8E5M2FNUZ() { + return semFloat8E5M2FNUZ; + } const fltSemantics &APFloatBase::Float8E4M3FN() { return semFloat8E4M3FN; } + const fltSemantics &APFloatBase::Float8E4M3FNUZ() { + return semFloat8E4M3FNUZ; + } const fltSemantics &APFloatBase::x87DoubleExtended() { return semX87DoubleExtended; } @@ -801,7 +840,12 @@ // The only NaN representation is where the mantissa is all 1s, which is // non-signalling. SNaN = false; - fill_storage = APInt::getAllOnes(semantics->precision - 1); + if (semantics->nanEncoding == fltNanEncoding::NegativeZero) { + sign = true; + fill_storage = APInt::getZero(semantics->precision - 1); + } else { + fill_storage = APInt::getAllOnes(semantics->precision - 1); + } fill = &fill_storage; } @@ -832,6 +876,9 @@ // conventionally, this is the next bit down from the QNaN bit. if (APInt::tcIsZero(significand, numParts)) APInt::tcSetBit(significand, QNaNBit - 1); + } else if (semantics->nanEncoding == fltNanEncoding::NegativeZero) { + // The only NaN is a quiet NaN, and it has no bits set in the significand. + // Do nothing. } else { // We always have to set the QNaN bit to make it a QNaN. 
APInt::tcSetBit(significand, QNaNBit); @@ -976,7 +1023,8 @@ } bool IEEEFloat::isLargest() const { - if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) { + if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly && + semantics->nanEncoding == fltNanEncoding::AllOnes) { // The largest number by magnitude in our format will be the floating point // number with maximum exponent and with significand that is all ones except // the LSB. @@ -1418,7 +1466,8 @@ exponent = semantics->maxExponent; tcSetLeastSignificantBits(significandParts(), partCount(), semantics->precision); - if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) + if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly && + semantics->nanEncoding == fltNanEncoding::AllOnes) APInt::tcClearBit(significandParts(), 0); return opInexact; @@ -1519,7 +1568,10 @@ } } + // The all-ones value is an overflow if NaN is all ones. If NaN is + // represented by negative zero, then it is a valid finite value. if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly && + semantics->nanEncoding == fltNanEncoding::AllOnes && exponent == semantics->maxExponent && isSignificandAllOnes()) return handleOverflow(rounding_mode); @@ -1530,8 +1582,11 @@ underflow for exact results. */ if (lost_fraction == lfExactlyZero) { /* Canonicalize zeroes. */ - if (omsb == 0) + if (omsb == 0) { category = fcZero; + if (semantics->nanEncoding == fltNanEncoding::NegativeZero) + sign = false; + } return opOK; } @@ -1549,18 +1604,22 @@ /* Renormalize by incrementing the exponent and shifting our significand right one. However if we already have the maximum exponent we overflow to infinity. 
*/ - if (exponent == semantics->maxExponent) { - category = fcInfinity; - - return (opStatus) (opOverflow | opInexact); - } + if (exponent == semantics->maxExponent) + // Invoke overflow handling with a rounding mode that will guarantee + // that the result gets turned into the correct infinity representation. + // This is needed instead of just setting the category to infinity to + // account for 8-bit floating point types that have no inf, only NaN. + return handleOverflow(sign ? rmTowardNegative : rmTowardPositive); shiftSignificandRight(1); return opInexact; } + // The all-ones value is an overflow if NaN is all ones. If NaN is + // represented by negative zero, then it is a valid finite value. if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly && + semantics->nanEncoding == fltNanEncoding::AllOnes && exponent == semantics->maxExponent && isSignificandAllOnes()) return handleOverflow(rounding_mode); } @@ -1574,8 +1633,11 @@ assert(omsb < semantics->precision); /* Canonicalize zeroes. */ - if (omsb == 0) + if (omsb == 0) { category = fcZero; + if (semantics->nanEncoding == fltNanEncoding::NegativeZero) + sign = false; + } /* The fcZero case is a denormal that underflowed to zero. */ return (opStatus) (opUnderflow | opInexact); @@ -1877,6 +1939,11 @@ /* Change sign. */ void IEEEFloat::changeSign() { + // With NaN-as-negative-zero, neither NaN nor zero can change + // their signs. + if (semantics->nanEncoding == fltNanEncoding::NegativeZero && + (isZero() || isNaN())) + return; /* Look mummy, this one's easy. */ sign = !sign; } @@ -1906,6 +1973,9 @@ if (category == fcZero) { if (rhs.category != fcZero || (sign == rhs.sign) == subtract) sign = (rounding_mode == rmTowardNegative); + // NaN-in-negative-zero means zeros need to be normalized to +0. 
+ if (semantics->nanEncoding == fltNanEncoding::NegativeZero) + sign = false; } return fs; @@ -1931,6 +2001,8 @@ sign ^= rhs.sign; fs = multiplySpecials(rhs); + if (isZero() && semantics->nanEncoding == fltNanEncoding::NegativeZero) + sign = false; if (isFiniteNonZero()) { lostFraction lost_fraction = multiplySignificand(rhs); fs = normalize(rounding_mode, lost_fraction); @@ -1949,6 +2021,8 @@ sign ^= rhs.sign; fs = divideSpecials(rhs); + if (isZero() && semantics->nanEncoding == fltNanEncoding::NegativeZero) + sign = false; if (isFiniteNonZero()) { lostFraction lost_fraction = divideSignificand(rhs); fs = normalize(rounding_mode, lost_fraction); @@ -2057,8 +2131,13 @@ } } - if (isZero()) + if (isZero()) { sign = origSign; // IEEE754 requires this + if (semantics->nanEncoding == fltNanEncoding::NegativeZero) + // But some 8-bit floats only have positive 0. + sign = false; + } + else sign ^= origSign; return fs; @@ -2083,8 +2162,11 @@ fs = subtract(V, rmNearestTiesToEven); assert(fs==opOK); } - if (isZero()) + if (isZero()) { sign = origSign; // fmod requires this + if (semantics->nanEncoding == fltNanEncoding::NegativeZero) + sign = false; + } return fs; } @@ -2112,8 +2194,11 @@ /* If two numbers add (exactly) to zero, IEEE 754 decrees it is a positive zero unless rounding to minus infinity, except that adding two like-signed zeroes gives that zero. */ - if (category == fcZero && !(fs & opUnderflow) && sign != addend.sign) + if (category == fcZero && !(fs & opUnderflow) && sign != addend.sign) { sign = (rounding_mode == rmTowardNegative); + if (semantics->nanEncoding == fltNanEncoding::NegativeZero) + sign = false; + } } else { fs = multiplySpecials(multiplicand); @@ -2389,6 +2474,12 @@ return is_signaling ? opInvalidOp : opOK; } + // If NaN is negative zero, we need to create a new NaN to avoid converting + // NaN to -Inf. 
+ if (fromSemantics.nanEncoding == fltNanEncoding::NegativeZero && + semantics->nanEncoding != fltNanEncoding::NegativeZero) + makeNaN(false, false); + *losesInfo = lostFraction != lfExactlyZero || X86SpecialNan; // For x87 extended precision, we want to make a NaN, not a special NaN if @@ -2410,6 +2501,12 @@ makeNaN(false, sign); *losesInfo = true; fs = opInexact; + } else if (category == fcZero && + semantics->nanEncoding == fltNanEncoding::NegativeZero) { + *losesInfo = fromSemantics.nanEncoding != fltNanEncoding::NegativeZero; + fs = *losesInfo ? opInexact : opOK; + // NaN is negative zero means -0 -> +0, which can lose information + sign = false; } else { *losesInfo = false; fs = opOK; @@ -2877,9 +2974,11 @@ if (D.firstSigDigit == str.end() || decDigitValue(*D.firstSigDigit) >= 10U) { category = fcZero; fs = opOK; + if (semantics->nanEncoding == fltNanEncoding::NegativeZero) + sign = false; - /* Check whether the normalized exponent is high enough to overflow - max during the log-rebasing in the max-exponent check below. */ + /* Check whether the normalized exponent is high enough to overflow + max during the log-rebasing in the max-exponent check below. 
*/ } else if (D.normalizedExponent - 1 > INT_MAX / 42039) { fs = handleOverflow(rounding_mode); @@ -3507,6 +3606,33 @@ (mysignificand & 0x3))); } +APInt IEEEFloat::convertFloat8E5M2FNUZAPFloatToAPInt() const { + assert(semantics == (const llvm::fltSemantics *)&semFloat8E5M2FNUZ); + assert(partCount() == 1); + + uint32_t myexponent, mysignificand; + + if (isFiniteNonZero()) { + myexponent = exponent + 16; // bias + mysignificand = (uint32_t)*significandParts(); + if (myexponent == 1 && !(mysignificand & 0x4)) + myexponent = 0; // denormal + } else if (category == fcZero) { + myexponent = 0; + mysignificand = 0; + } else if (category == fcInfinity) { + myexponent = 0; + mysignificand = 0; + } else { + assert(category == fcNaN && "Unknown category!"); + myexponent = 0; + mysignificand = (uint32_t)*significandParts(); + } + + return APInt(8, (((sign & 1) << 7) | ((myexponent & 0x1f) << 2) | + (mysignificand & 0x3))); +} + APInt IEEEFloat::convertFloat8E4M3FNAPFloatToAPInt() const { assert(semantics == (const llvm::fltSemantics *)&semFloat8E4M3FN); assert(partCount() == 1); @@ -3534,6 +3660,33 @@ (mysignificand & 0x7))); } +APInt IEEEFloat::convertFloat8E4M3FNUZAPFloatToAPInt() const { + assert(semantics == (const llvm::fltSemantics *)&semFloat8E4M3FNUZ); + assert(partCount() == 1); + + uint32_t myexponent, mysignificand; + + if (isFiniteNonZero()) { + myexponent = exponent + 8; // bias + mysignificand = (uint32_t)*significandParts(); + if (myexponent == 1 && !(mysignificand & 0x8)) + myexponent = 0; // denormal + } else if (category == fcZero) { + myexponent = 0; + mysignificand = 0; + } else if (category == fcInfinity) { + myexponent = 0; + mysignificand = 0; + } else { + assert(category == fcNaN && "Unknown category!"); + myexponent = 0; + mysignificand = (uint32_t)*significandParts(); + } + + return APInt(8, (((sign & 1) << 7) | ((myexponent & 0xf) << 3) | + (mysignificand & 0x7))); +} + // This function creates an APInt that is just a bit map of the floating // 
point constant as it would appear in memory. It is not a conversion, // and treating the result as a normal integer is unlikely to be useful. @@ -3560,9 +3713,15 @@ if (semantics == (const llvm::fltSemantics *)&semFloat8E5M2) return convertFloat8E5M2APFloatToAPInt(); + if (semantics == (const llvm::fltSemantics *)&semFloat8E5M2FNUZ) + return convertFloat8E5M2FNUZAPFloatToAPInt(); + if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3FN) return convertFloat8E4M3FNAPFloatToAPInt(); + if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3FNUZ) + return convertFloat8E4M3FNUZAPFloatToAPInt(); + assert(semantics == (const llvm::fltSemantics*)&semX87DoubleExtended && "unknown format!"); return convertF80LongDoubleAPFloatToAPInt(); @@ -3818,6 +3977,32 @@ } } +void IEEEFloat::initFromFloat8E5M2FNUZAPInt(const APInt &api) { + uint32_t i = (uint32_t)*api.getRawData(); + uint32_t myexponent = (i >> 2) & 0x1f; + uint32_t mysignificand = i & 0x3; + + initialize(&semFloat8E5M2FNUZ); + assert(partCount() == 1); + + sign = i >> 7; + if (myexponent == 0 && mysignificand == 0 && sign == 0) { + makeZero(sign); + } else if (myexponent == 0 && mysignificand == 0 && sign == 1) { + category = fcNaN; + exponent = exponentNaN(); + *significandParts() = mysignificand; + } else { + category = fcNormal; + exponent = myexponent - 16; // bias + *significandParts() = mysignificand; + if (myexponent == 0) // denormal + exponent = -15; + else + *significandParts() |= 0x4; // integer bit + } +} + void IEEEFloat::initFromFloat8E4M3FNAPInt(const APInt &api) { uint32_t i = (uint32_t)*api.getRawData(); uint32_t myexponent = (i >> 3) & 0xf; @@ -3844,6 +4029,32 @@ } } +void IEEEFloat::initFromFloat8E4M3FNUZAPInt(const APInt &api) { + uint32_t i = (uint32_t)*api.getRawData(); + uint32_t myexponent = (i >> 3) & 0xf; + uint32_t mysignificand = i & 0x7; + + initialize(&semFloat8E4M3FNUZ); + assert(partCount() == 1); + + sign = i >> 7; + if (myexponent == 0 && mysignificand == 0 && sign == 0) { + 
makeZero(sign); + } else if (myexponent == 0 && mysignificand == 0 && sign == 1) { + category = fcNaN; + exponent = exponentNaN(); + *significandParts() = mysignificand; + } else { + category = fcNormal; + exponent = myexponent - 8; // bias + *significandParts() = mysignificand; + if (myexponent == 0) // denormal + exponent = -7; + else + *significandParts() |= 0x8; // integer bit + } +} + /// Treat api as containing the bits of a floating point number. void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) { assert(api.getBitWidth() == Sem->sizeInBits); @@ -3863,8 +4074,12 @@ return initFromPPCDoubleDoubleAPInt(api); if (Sem == &semFloat8E5M2) return initFromFloat8E5M2APInt(api); + if (Sem == &semFloat8E5M2FNUZ) + return initFromFloat8E5M2FNUZAPInt(api); if (Sem == &semFloat8E4M3FN) return initFromFloat8E4M3FNAPInt(api); + if (Sem == &semFloat8E4M3FNUZ) + return initFromFloat8E4M3FNUZAPInt(api); llvm_unreachable(nullptr); } @@ -3893,7 +4108,8 @@ ? (~integerPart(0) >> NumUnusedHighBits) : 0; - if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) + if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly && + semantics->nanEncoding == fltNanEncoding::AllOnes) significand[0] &= ~integerPart(1); } @@ -4321,6 +4537,8 @@ APInt::tcSet(significandParts(), 0, partCount()); category = fcZero; exponent = 0; + if (semantics->nanEncoding == fltNanEncoding::NegativeZero) + sign = false; break; } @@ -4407,8 +4625,11 @@ } APFloatBase::ExponentType IEEEFloat::exponentNaN() const { - if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) + if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) { + if (semantics->nanEncoding == fltNanEncoding::NegativeZero) + return semantics->minExponent; return semantics->maxExponent; + } return semantics->maxExponent + 1; } @@ -4435,6 +4656,10 @@ void IEEEFloat::makeZero(bool Negative) { category = fcZero; sign = Negative; + if (semantics->nanEncoding == 
fltNanEncoding::NegativeZero) { + // Merge negative zero to positive because 0b10000...000 is used for NaN + sign = false; + } exponent = exponentZero(); APInt::tcSet(significandParts(), 0, partCount()); } diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp --- a/llvm/unittests/ADT/APFloatTest.cpp +++ b/llvm/unittests/ADT/APFloatTest.cpp @@ -9,6 +9,7 @@ #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Error.h" #include "llvm/Support/FormatVariadic.h" @@ -1291,6 +1292,7 @@ bool Negative; uint64_t payload; } tests[] = { + // clang-format off /* expected semantics SNaN Neg payload */ { 0x7fc00000ULL, APFloat::IEEEsingle(), false, false, 0x00000000ULL }, { 0xffc00000ULL, APFloat::IEEEsingle(), false, true, 0x00000000ULL }, @@ -1312,6 +1314,15 @@ { 0x7ff000000000ae72ULL, APFloat::IEEEdouble(), true, false, 0x000000000000ae72ULL }, { 0x7ff7ffffffffae72ULL, APFloat::IEEEdouble(), true, false, 0xffffffffffffae72ULL }, { 0x7ff1aaaaaaaaae72ULL, APFloat::IEEEdouble(), true, false, 0x0001aaaaaaaaae72ULL }, + { 0x80ULL, APFloat::Float8E5M2FNUZ(), false, false, 0xaaULL }, + { 0x80ULL, APFloat::Float8E5M2FNUZ(), false, true, 0xaaULL }, + { 0x80ULL, APFloat::Float8E5M2FNUZ(), true, false, 0xaaULL }, + { 0x80ULL, APFloat::Float8E5M2FNUZ(), true, true, 0xaaULL }, + { 0x80ULL, APFloat::Float8E4M3FNUZ(), false, false, 0xaaULL }, + { 0x80ULL, APFloat::Float8E4M3FNUZ(), false, true, 0xaaULL }, + { 0x80ULL, APFloat::Float8E4M3FNUZ(), true, false, 0xaaULL }, + { 0x80ULL, APFloat::Float8E4M3FNUZ(), true, true, 0xaaULL }, + // clang-format on }; for (const auto &t : tests) { @@ -1735,6 +1746,10 @@ EXPECT_EQ(3.402823466e+38f, APFloat::getLargest(APFloat::IEEEsingle()).convertToFloat()); EXPECT_EQ(1.7976931348623158e+308, APFloat::getLargest(APFloat::IEEEdouble()).convertToDouble()); EXPECT_EQ(448, 
APFloat::getLargest(APFloat::Float8E4M3FN()).convertToDouble()); + EXPECT_EQ(240, + APFloat::getLargest(APFloat::Float8E4M3FNUZ()).convertToDouble()); + EXPECT_EQ(57344, + APFloat::getLargest(APFloat::Float8E5M2FNUZ()).convertToDouble()); } TEST(APFloatTest, getSmallest) { @@ -1765,6 +1780,20 @@ EXPECT_TRUE(test.isFiniteNonZero()); EXPECT_TRUE(test.isDenormal()); EXPECT_TRUE(test.bitwiseIsEqual(expected)); + + test = APFloat::getSmallest(APFloat::Float8E5M2FNUZ(), false); + expected = APFloat(APFloat::Float8E5M2FNUZ(), "0x0.4p-15"); + EXPECT_FALSE(test.isNegative()); + EXPECT_TRUE(test.isFiniteNonZero()); + EXPECT_TRUE(test.isDenormal()); + EXPECT_TRUE(test.bitwiseIsEqual(expected)); + + test = APFloat::getSmallest(APFloat::Float8E4M3FNUZ(), false); + expected = APFloat(APFloat::Float8E4M3FNUZ(), "0x0.2p-7"); + EXPECT_FALSE(test.isNegative()); + EXPECT_TRUE(test.isFiniteNonZero()); + EXPECT_TRUE(test.isDenormal()); + EXPECT_TRUE(test.bitwiseIsEqual(expected)); } TEST(APFloatTest, getSmallestNormalized) { @@ -1821,27 +1850,31 @@ struct { const fltSemantics *semantics; const bool sign; + const bool signedZero; const unsigned long long bitPattern[2]; - const unsigned bitPatternLength; + const unsigned bitPatternLength; // todo preserve sign flag } const GetZeroTest[] = { - {&APFloat::IEEEhalf(), false, {0, 0}, 1}, - {&APFloat::IEEEhalf(), true, {0x8000ULL, 0}, 1}, - {&APFloat::IEEEsingle(), false, {0, 0}, 1}, - {&APFloat::IEEEsingle(), true, {0x80000000ULL, 0}, 1}, - {&APFloat::IEEEdouble(), false, {0, 0}, 1}, - {&APFloat::IEEEdouble(), true, {0x8000000000000000ULL, 0}, 1}, - {&APFloat::IEEEquad(), false, {0, 0}, 2}, - {&APFloat::IEEEquad(), true, {0, 0x8000000000000000ULL}, 2}, - {&APFloat::PPCDoubleDouble(), false, {0, 0}, 2}, - {&APFloat::PPCDoubleDouble(), true, {0x8000000000000000ULL, 0}, 2}, - {&APFloat::x87DoubleExtended(), false, {0, 0}, 2}, - {&APFloat::x87DoubleExtended(), true, {0, 0x8000ULL}, 2}, - {&APFloat::Float8E5M2(), false, {0, 0}, 1}, - 
{&APFloat::Float8E5M2(), true, {0x80ULL, 0}, 1}, - {&APFloat::Float8E4M3FN(), false, {0, 0}, 1}, - {&APFloat::Float8E4M3FN(), true, {0x80ULL, 0}, 1}, - }; - const unsigned NumGetZeroTests = 12; + {&APFloat::IEEEhalf(), false, true, {0, 0}, 1}, + {&APFloat::IEEEhalf(), true, true, {0x8000ULL, 0}, 1}, + {&APFloat::IEEEsingle(), false, true, {0, 0}, 1}, + {&APFloat::IEEEsingle(), true, true, {0x80000000ULL, 0}, 1}, + {&APFloat::IEEEdouble(), false, true, {0, 0}, 1}, + {&APFloat::IEEEdouble(), true, true, {0x8000000000000000ULL, 0}, 1}, + {&APFloat::IEEEquad(), false, true, {0, 0}, 2}, + {&APFloat::IEEEquad(), true, true, {0, 0x8000000000000000ULL}, 2}, + {&APFloat::PPCDoubleDouble(), false, true, {0, 0}, 2}, + {&APFloat::PPCDoubleDouble(), true, true, {0x8000000000000000ULL, 0}, 2}, + {&APFloat::x87DoubleExtended(), false, true, {0, 0}, 2}, + {&APFloat::x87DoubleExtended(), true, true, {0, 0x8000ULL}, 2}, + {&APFloat::Float8E5M2(), false, true, {0, 0}, 1}, + {&APFloat::Float8E5M2(), true, true, {0x80ULL, 0}, 1}, + {&APFloat::Float8E5M2FNUZ(), false, false, {0, 0}, 1}, + {&APFloat::Float8E5M2FNUZ(), true, false, {0, 0}, 1}, + {&APFloat::Float8E4M3FN(), false, true, {0, 0}, 1}, + {&APFloat::Float8E4M3FN(), true, true, {0x80ULL, 0}, 1}, + {&APFloat::Float8E4M3FNUZ(), false, false, {0, 0}, 1}, + {&APFloat::Float8E4M3FNUZ(), true, false, {0, 0}, 1}}; + const unsigned NumGetZeroTests = sizeof(GetZeroTest) / sizeof(GetZeroTest[0]); for (unsigned i = 0; i < NumGetZeroTests; ++i) { APFloat test = APFloat::getZero(*GetZeroTest[i].semantics, GetZeroTest[i].sign); @@ -1849,7 +1882,10 @@ APFloat expected = APFloat(*GetZeroTest[i].semantics, pattern); EXPECT_TRUE(test.isZero()); - EXPECT_TRUE(GetZeroTest[i].sign? test.isNegative() : !test.isNegative()); + if (GetZeroTest[i].signedZero) + EXPECT_TRUE(GetZeroTest[i].sign ? 
test.isNegative() : !test.isNegative()); + else + EXPECT_TRUE(!test.isNegative()); EXPECT_TRUE(test.bitwiseIsEqual(expected)); for (unsigned j = 0, je = GetZeroTest[i].bitPatternLength; j < je; ++j) { EXPECT_EQ(GetZeroTest[i].bitPattern[j], @@ -1867,6 +1903,15 @@ APFloat::copySign(APFloat(-42.0), APFloat(-1.0)))); EXPECT_TRUE(APFloat(42.0).bitwiseIsEqual( APFloat::copySign(APFloat(42.0), APFloat(1.0)))); + // For floating-point formats with unsigned 0, copySign() to a zero is a noop + EXPECT_TRUE( + APFloat::getZero(APFloat::Float8E4M3FNUZ()) + .bitwiseIsEqual(APFloat::copySign( + APFloat::getZero(APFloat::Float8E4M3FNUZ()), APFloat(-1.0)))); + EXPECT_TRUE( + APFloat::getNaN(APFloat::Float8E4M3FNUZ(), true) + .bitwiseIsEqual(APFloat::copySign( + APFloat::getNaN(APFloat::Float8E4M3FNUZ(), true), APFloat(1.0)))); } TEST(APFloatTest, convert) { @@ -1979,6 +2024,67 @@ EXPECT_TRUE(losesInfo); } +TEST(APFloatTest, Float8UZConvert) { + bool losesInfo = false; + std::pair<APFloat, APFloat::opStatus> toNaNTests[] = { + {APFloat::getQNaN(APFloat::IEEEsingle(), false), APFloat::opOK}, + {APFloat::getQNaN(APFloat::IEEEsingle(), true), APFloat::opOK}, + {APFloat::getSNaN(APFloat::IEEEsingle(), false), APFloat::opInvalidOp}, + {APFloat::getSNaN(APFloat::IEEEsingle(), true), APFloat::opInvalidOp}, + {APFloat::getInf(APFloat::IEEEsingle(), false), APFloat::opInexact}, + {APFloat::getInf(APFloat::IEEEsingle(), true), APFloat::opInexact}}; + for (auto [toTest, expectedRes] : toNaNTests) { + llvm::SmallString<16> value; + toTest.toString(value); + SCOPED_TRACE("toTest = " + value); + for (const fltSemantics *sem : + {&APFloat::Float8E4M3FNUZ(), &APFloat::Float8E5M2FNUZ()}) { + SCOPED_TRACE("Semantics = " + + std::to_string(APFloat::SemanticsToEnum(*sem))); + losesInfo = false; + APFloat test = toTest; + EXPECT_EQ(test.convert(*sem, APFloat::rmNearestTiesToAway, &losesInfo), + expectedRes); + EXPECT_TRUE(test.isNaN()); + EXPECT_TRUE(test.isNegative()); + EXPECT_FALSE(test.isSignaling()); + 
EXPECT_FALSE(test.isInfinity()); + EXPECT_EQ(0x80, test.bitcastToAPInt()); + EXPECT_TRUE(losesInfo); + } + } + + // Zero conversions are information losing. + losesInfo = false; + APFloat test = APFloat::getZero(APFloat::IEEEsingle(), true); + EXPECT_EQ(test.convert(APFloat::Float8E5M2FNUZ(), + APFloat::rmNearestTiesToAway, &losesInfo), + APFloat::opInexact); + EXPECT_TRUE(test.isZero()); + EXPECT_FALSE(test.isNegative()); + EXPECT_TRUE(losesInfo); + EXPECT_EQ(0x0, test.bitcastToAPInt()); + + losesInfo = false; + test = APFloat::getZero(APFloat::IEEEsingle(), false); + EXPECT_EQ(test.convert(APFloat::Float8E5M2FNUZ(), + APFloat::rmNearestTiesToAway, &losesInfo), + APFloat::opInexact); + EXPECT_TRUE(test.isZero()); + EXPECT_FALSE(test.isNegative()); + EXPECT_TRUE(losesInfo); + EXPECT_EQ(0x0, test.bitcastToAPInt()); + + // Except in casts between the unsigned-zero formats themselves + losesInfo = true; + test = APFloat::getZero(APFloat::Float8E5M2FNUZ()); + EXPECT_EQ(test.convert(APFloat::Float8E4M3FNUZ(), + APFloat::rmNearestTiesToAway, &losesInfo), + APFloat::opOK); + EXPECT_FALSE(losesInfo); + EXPECT_EQ(0x0, test.bitcastToAPInt()); +} + TEST(APFloatTest, PPCDoubleDouble) { APFloat test(APFloat::PPCDoubleDouble(), "1.0"); EXPECT_EQ(0x3ff0000000000000ull, test.bitcastToAPInt().getRawData()[0]); @@ -5143,11 +5249,11 @@ // convert to BFloat APFloat test2 = test; - bool loses_info; + bool losesInfo; APFloat::opStatus status = test2.convert( - APFloat::BFloat(), APFloat::rmNearestTiesToEven, &loses_info); + APFloat::BFloat(), APFloat::rmNearestTiesToEven, &losesInfo); EXPECT_EQ(status, APFloat::opOK); - EXPECT_FALSE(loses_info); + EXPECT_FALSE(losesInfo); if (i == 127 || i == 255) EXPECT_TRUE(test2.isNaN()); else @@ -5158,95 +5264,395 @@ } } -TEST(APFloatTest, Float8E4M3FNExhaustivePair) { - // Test each pair of Float8E4M3FN values. 
+TEST(APFloatTest, Float8E5M2FNUZNext) { + APFloat test(APFloat::Float8E5M2FNUZ(), APFloat::uninitialized); + APFloat expected(APFloat::Float8E5M2FNUZ(), APFloat::uninitialized); + + // 1. nextUp of largest value is NaN + test = APFloat::getLargest(APFloat::Float8E5M2FNUZ()); + expected = APFloat::getNaN(APFloat::Float8E5M2FNUZ()); + EXPECT_EQ(test.next(false), APFloat::opOK); + EXPECT_FALSE(test.isInfinity()); + EXPECT_FALSE(test.isZero()); + EXPECT_TRUE(test.isNaN()); + EXPECT_TRUE(test.bitwiseIsEqual(expected)); + + // 2. nextUp of smallest negative denormal is +0 + test = APFloat::getSmallest(APFloat::Float8E5M2FNUZ(), true); + expected = APFloat::getZero(APFloat::Float8E5M2FNUZ(), false); + EXPECT_EQ(test.next(false), APFloat::opOK); + EXPECT_FALSE(test.isNegZero()); + EXPECT_TRUE(test.isPosZero()); + EXPECT_TRUE(test.bitwiseIsEqual(expected)); + + // 3. nextDown of negative of largest value is NaN + test = APFloat::getLargest(APFloat::Float8E5M2FNUZ(), true); + expected = APFloat::getNaN(APFloat::Float8E5M2FNUZ()); + EXPECT_EQ(test.next(true), APFloat::opOK); + EXPECT_FALSE(test.isInfinity()); + EXPECT_FALSE(test.isZero()); + EXPECT_TRUE(test.isNaN()); + EXPECT_TRUE(test.bitwiseIsEqual(expected)); + + // 4. nextDown of +0 is smallest negative denormal + test = APFloat::getZero(APFloat::Float8E5M2FNUZ(), false); + expected = APFloat::getSmallest(APFloat::Float8E5M2FNUZ(), true); + EXPECT_EQ(test.next(true), APFloat::opOK); + EXPECT_FALSE(test.isZero()); + EXPECT_TRUE(test.isDenormal()); + EXPECT_TRUE(test.bitwiseIsEqual(expected)); +} + +TEST(APFloatTest, UnsignedZeroArithmeticSpecial) { + // Float semantics with only unsigned zero (e.g. Float8E4M3FNUZ) violate the + // IEEE rules about signs in arithmetic operations when producing zeros, + // because they only have one zero. Most of the rest of the complexities of + // arithmetic on these values are covered by the other Float8 types' test + // cases and so are not repeated here. 
+ + // The IEEE round towards negative rule doesn't apply + APFloat test = APFloat::getSmallest(APFloat::Float8E4M3FNUZ()); + APFloat rhs = test; + EXPECT_EQ(test.subtract(rhs, APFloat::rmTowardNegative), APFloat::opOK); + EXPECT_TRUE(test.isZero()); + EXPECT_FALSE(test.isNegative()); + + // Multiplication of (small) * (-small) is +0 + test = APFloat::getSmallestNormalized(APFloat::Float8E4M3FNUZ()); + rhs = -test; + EXPECT_EQ(test.multiply(rhs, APFloat::rmNearestTiesToAway), + APFloat::opInexact | APFloat::opUnderflow); + EXPECT_TRUE(test.isZero()); + EXPECT_FALSE(test.isNegative()); + + // Dividing the negative smallest denormal by 2 underflows to +0 + test = APFloat::getSmallest(APFloat::Float8E4M3FNUZ(), true); + rhs = APFloat(APFloat::Float8E4M3FNUZ(), "2.0"); + EXPECT_EQ(test.divide(rhs, APFloat::rmNearestTiesToEven), + APFloat::opInexact | APFloat::opUnderflow); + EXPECT_TRUE(test.isZero()); + EXPECT_FALSE(test.isNegative()); + + // Remainder can't copy sign because there's only one zero + test = APFloat(APFloat::Float8E4M3FNUZ(), "-4.0"); + rhs = APFloat(APFloat::Float8E4M3FNUZ(), "2.0"); + EXPECT_EQ(test.remainder(rhs), APFloat::opOK); + EXPECT_TRUE(test.isZero()); + EXPECT_FALSE(test.isNegative()); + + // And same for mod + test = APFloat(APFloat::Float8E4M3FNUZ(), "-4.0"); + rhs = APFloat(APFloat::Float8E4M3FNUZ(), "2.0"); + EXPECT_EQ(test.mod(rhs), APFloat::opOK); + EXPECT_TRUE(test.isZero()); + EXPECT_FALSE(test.isNegative()); + + // FMA correctly handles both the multiply and add parts of all this + test = APFloat(APFloat::Float8E4M3FNUZ(), "2.0"); + rhs = test; + APFloat addend = APFloat(APFloat::Float8E4M3FNUZ(), "-4.0"); + EXPECT_EQ(test.fusedMultiplyAdd(rhs, addend, APFloat::rmTowardNegative), + APFloat::opOK); + EXPECT_TRUE(test.isZero()); + EXPECT_FALSE(test.isNegative()); +} + +TEST(APFloatTest, Float8E5M2FNUZExhaustive) { + // Test each of the 256 Float8E5M2FNUZ values. 
+ for (int i = 0; i < 256; i++) { + APFloat test(APFloat::Float8E5M2FNUZ(), APInt(8, i)); + SCOPED_TRACE("i=" + std::to_string(i)); + + // isLargest + if (i == 127 || i == 255) { + EXPECT_TRUE(test.isLargest()); + EXPECT_EQ(abs(test).convertToDouble(), 57344.); + } else { + EXPECT_FALSE(test.isLargest()); + } + + // isSmallest + if (i == 1 || i == 129) { + EXPECT_TRUE(test.isSmallest()); + EXPECT_EQ(abs(test).convertToDouble(), 0x1p-17); + } else { + EXPECT_FALSE(test.isSmallest()); + } + + // convert to BFloat + APFloat test2 = test; + bool losesInfo; + APFloat::opStatus status = test2.convert( + APFloat::BFloat(), APFloat::rmNearestTiesToEven, &losesInfo); + EXPECT_EQ(status, APFloat::opOK); + EXPECT_FALSE(losesInfo); + if (i == 128) + EXPECT_TRUE(test2.isNaN()); + else + EXPECT_EQ(test.convertToFloat(), test2.convertToFloat()); + + // bitcastToAPInt + EXPECT_EQ(i, test.bitcastToAPInt()); + } +} + +TEST(APFloatTest, Float8E4M3FNUZExhaustive) { + // Test each of the 256 Float8E4M3FNUZ values. 
for (int i = 0; i < 256; i++) { - for (int j = 0; j < 256; j++) { - SCOPED_TRACE("i=" + std::to_string(i) + ",j=" + std::to_string(j)); - APFloat x(APFloat::Float8E4M3FN(), APInt(8, i)); - APFloat y(APFloat::Float8E4M3FN(), APInt(8, j)); - - bool losesInfo; - APFloat x16 = x; - x16.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, - &losesInfo); - EXPECT_FALSE(losesInfo); - APFloat y16 = y; - y16.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, - &losesInfo); - EXPECT_FALSE(losesInfo); - - // Add - APFloat z = x; - z.add(y, APFloat::rmNearestTiesToEven); - APFloat z16 = x16; - z16.add(y16, APFloat::rmNearestTiesToEven); - z16.convert(APFloat::Float8E4M3FN(), APFloat::rmNearestTiesToEven, - &losesInfo); - EXPECT_TRUE(z.bitwiseIsEqual(z16)); - - // Subtract - z = x; - z.subtract(y, APFloat::rmNearestTiesToEven); - z16 = x16; - z16.subtract(y16, APFloat::rmNearestTiesToEven); - z16.convert(APFloat::Float8E4M3FN(), APFloat::rmNearestTiesToEven, - &losesInfo); - EXPECT_TRUE(z.bitwiseIsEqual(z16)); - - // Multiply - z = x; - z.multiply(y, APFloat::rmNearestTiesToEven); - z16 = x16; - z16.multiply(y16, APFloat::rmNearestTiesToEven); - z16.convert(APFloat::Float8E4M3FN(), APFloat::rmNearestTiesToEven, - &losesInfo); - EXPECT_TRUE(z.bitwiseIsEqual(z16)) << "i=" << i << ", j=" << j; - - // Divide - z = x; - z.divide(y, APFloat::rmNearestTiesToEven); - z16 = x16; - z16.divide(y16, APFloat::rmNearestTiesToEven); - z16.convert(APFloat::Float8E4M3FN(), APFloat::rmNearestTiesToEven, - &losesInfo); - EXPECT_TRUE(z.bitwiseIsEqual(z16)) << "i=" << i << ", j=" << j; - - // Mod - z = x; - z.mod(y); - z16 = x16; - z16.mod(y16); - z16.convert(APFloat::Float8E4M3FN(), APFloat::rmNearestTiesToEven, - &losesInfo); - EXPECT_TRUE(z.bitwiseIsEqual(z16)) << "i=" << i << ", j=" << j; - - // Remainder - z = x; - z.remainder(y); - z16 = x16; - z16.remainder(y16); - z16.convert(APFloat::Float8E4M3FN(), APFloat::rmNearestTiesToEven, - &losesInfo); - 
EXPECT_TRUE(z.bitwiseIsEqual(z16)) << "i=" << i << ", j=" << j; + APFloat test(APFloat::Float8E4M3FNUZ(), APInt(8, i)); + SCOPED_TRACE("i=" + std::to_string(i)); + + // isLargest + if (i == 127 || i == 255) { + EXPECT_TRUE(test.isLargest()); + EXPECT_EQ(abs(test).convertToDouble(), 240.); + } else { + EXPECT_FALSE(test.isLargest()); + } + + // isSmallest + if (i == 1 || i == 129) { + EXPECT_TRUE(test.isSmallest()); + EXPECT_EQ(abs(test).convertToDouble(), 0x1p-10); + } else { + EXPECT_FALSE(test.isSmallest()); + } + + // convert to BFloat + APFloat test2 = test; + bool losesInfo; + APFloat::opStatus status = test2.convert( + APFloat::BFloat(), APFloat::rmNearestTiesToEven, &losesInfo); + EXPECT_EQ(status, APFloat::opOK); + EXPECT_FALSE(losesInfo); + if (i == 128) + EXPECT_TRUE(test2.isNaN()); + else + EXPECT_EQ(test.convertToFloat(), test2.convertToFloat()); + + // bitcastToAPInt + EXPECT_EQ(i, test.bitcastToAPInt()); + } +} + +TEST(APFloatTest, Float8ExhaustivePair) { + // Test each pair of 8-bit floats with non-standard semantics + for (APFloat::Semantics Sem : + {APFloat::S_Float8E4M3FN, APFloat::S_Float8E5M2FNUZ, + APFloat::S_Float8E4M3FNUZ}) { + const llvm::fltSemantics &S = APFloat::EnumToSemantics(Sem); + for (int i = 0; i < 256; i++) { + for (int j = 0; j < 256; j++) { + SCOPED_TRACE("sem=" + std::to_string(Sem) + ",i=" + std::to_string(i) + + ",j=" + std::to_string(j)); + APFloat x(S, APInt(8, i)); + APFloat y(S, APInt(8, j)); + + bool losesInfo; + APFloat x16 = x; + x16.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_FALSE(losesInfo); + APFloat y16 = y; + y16.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_FALSE(losesInfo); + + // Add + APFloat z = x; + z.add(y, APFloat::rmNearestTiesToEven); + APFloat z16 = x16; + z16.add(y16, APFloat::rmNearestTiesToEven); + z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo); + EXPECT_TRUE(z.bitwiseIsEqual(z16)) + << "sem=" << Sem << ", i=" << i 
<< ", j=" << j; + + // Subtract + z = x; + z.subtract(y, APFloat::rmNearestTiesToEven); + z16 = x16; + z16.subtract(y16, APFloat::rmNearestTiesToEven); + z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo); + EXPECT_TRUE(z.bitwiseIsEqual(z16)) + << "sem=" << Sem << ", i=" << i << ", j=" << j; + + // Multiply + z = x; + z.multiply(y, APFloat::rmNearestTiesToEven); + z16 = x16; + z16.multiply(y16, APFloat::rmNearestTiesToEven); + z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo); + EXPECT_TRUE(z.bitwiseIsEqual(z16)) + << "sem=" << Sem << ", i=" << i << ", j=" << j; + + // Divide + z = x; + z.divide(y, APFloat::rmNearestTiesToEven); + z16 = x16; + z16.divide(y16, APFloat::rmNearestTiesToEven); + z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo); + EXPECT_TRUE(z.bitwiseIsEqual(z16)) + << "sem=" << Sem << ", i=" << i << ", j=" << j; + + // Mod + z = x; + z.mod(y); + z16 = x16; + z16.mod(y16); + z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo); + EXPECT_TRUE(z.bitwiseIsEqual(z16)) + << "sem=" << Sem << ", i=" << i << ", j=" << j; + + // Remainder + z = x; + z.remainder(y); + z16 = x16; + z16.remainder(y16); + z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo); + EXPECT_TRUE(z.bitwiseIsEqual(z16)) + << "sem=" << Sem << ", i=" << i << ", j=" << j; + } } } } +TEST(APFloatTest, ConvertE5M2FNUZToE4M3FNUZ) { + bool losesInfo; + APFloat test(APFloat::Float8E5M2FNUZ(), "1.0"); + APFloat::opStatus status = test.convert( + APFloat::Float8E4M3FNUZ(), APFloat::rmNearestTiesToEven, &losesInfo); + EXPECT_EQ(1.0f, test.convertToFloat()); + EXPECT_FALSE(losesInfo); + EXPECT_EQ(status, APFloat::opOK); + + losesInfo = true; + test = APFloat(APFloat::Float8E5M2FNUZ(), "0.0"); + status = test.convert(APFloat::Float8E4M3FNUZ(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_EQ(0.0f, test.convertToFloat()); + EXPECT_FALSE(losesInfo); + EXPECT_EQ(status, APFloat::opOK); + + losesInfo = true; + test = APFloat(APFloat::Float8E5M2FNUZ(), "0x1.Cp7"); // 224 
+ status = test.convert(APFloat::Float8E4M3FNUZ(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_EQ(0x1.Cp7 /* 224 */, test.convertToFloat()); + EXPECT_FALSE(losesInfo); + EXPECT_EQ(status, APFloat::opOK); + + // Test overflow + losesInfo = false; + test = APFloat(APFloat::Float8E5M2FNUZ(), "0x1.0p8"); // 256 + status = test.convert(APFloat::Float8E4M3FNUZ(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_TRUE(std::isnan(test.convertToFloat())); + EXPECT_TRUE(losesInfo); + EXPECT_EQ(status, APFloat::opOverflow | APFloat::opInexact); + + // Test underflow (NOTE(review): losesInfo is not reset to false first) + test = APFloat(APFloat::Float8E5M2FNUZ(), "0x1.0p-11"); + status = test.convert(APFloat::Float8E4M3FNUZ(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_EQ(0., test.convertToFloat()); + EXPECT_TRUE(losesInfo); + EXPECT_EQ(status, APFloat::opUnderflow | APFloat::opInexact); + + // Test rounding up to smallest denormal number + losesInfo = false; + test = APFloat(APFloat::Float8E5M2FNUZ(), "0x1.8p-11"); + status = test.convert(APFloat::Float8E4M3FNUZ(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_EQ(0x1.0p-10, test.convertToFloat()); + EXPECT_TRUE(losesInfo); + EXPECT_EQ(status, APFloat::opUnderflow | APFloat::opInexact); + + // Testing inexact rounding to denormal number + losesInfo = false; + test = APFloat(APFloat::Float8E5M2FNUZ(), "0x1.8p-10"); + status = test.convert(APFloat::Float8E4M3FNUZ(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_EQ(0x1.0p-9, test.convertToFloat()); + EXPECT_TRUE(losesInfo); + EXPECT_EQ(status, APFloat::opUnderflow | APFloat::opInexact); +} + +TEST(APFloatTest, ConvertE4M3FNUZToE5M2FNUZ) { + bool losesInfo; + APFloat test(APFloat::Float8E4M3FNUZ(), "1.0"); + APFloat::opStatus status = test.convert( + APFloat::Float8E5M2FNUZ(), APFloat::rmNearestTiesToEven, &losesInfo); + EXPECT_EQ(1.0f, test.convertToFloat()); + EXPECT_FALSE(losesInfo); + EXPECT_EQ(status, APFloat::opOK); + + losesInfo = true; + test = APFloat(APFloat::Float8E4M3FNUZ(), 
"0.0"); + status = test.convert(APFloat::Float8E5M2FNUZ(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_EQ(0.0f, test.convertToFloat()); + EXPECT_FALSE(losesInfo); + EXPECT_EQ(status, APFloat::opOK); + + losesInfo = false; + test = APFloat(APFloat::Float8E4M3FNUZ(), "0x1.2p0"); // 1.125 + status = test.convert(APFloat::Float8E5M2FNUZ(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_EQ(0x1.0p0 /* 1.0 */, test.convertToFloat()); + EXPECT_TRUE(losesInfo); + EXPECT_EQ(status, APFloat::opInexact); + + losesInfo = false; + test = APFloat(APFloat::Float8E4M3FNUZ(), "0x1.6p0"); // 1.375 + status = test.convert(APFloat::Float8E5M2FNUZ(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_EQ(0x1.8p0 /* 1.5 */, test.convertToFloat()); + EXPECT_TRUE(losesInfo); + EXPECT_EQ(status, APFloat::opInexact); + + // Convert E4M3 denormal to E5M2 normal. Should not be truncated, despite the + // destination format having one fewer significand bit + losesInfo = true; + test = APFloat(APFloat::Float8E4M3FNUZ(), "0x1.Cp-8"); + status = test.convert(APFloat::Float8E5M2FNUZ(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_EQ(0x1.Cp-8, test.convertToFloat()); + EXPECT_FALSE(losesInfo); + EXPECT_EQ(status, APFloat::opOK); +} + +TEST(APFloatTest, Float8E4M3FNUZFromString) { + // Exactly representable + EXPECT_EQ(240, APFloat(APFloat::Float8E4M3FNUZ(), "240").convertToDouble()); + // Round down to maximum value + EXPECT_EQ(240, APFloat(APFloat::Float8E4M3FNUZ(), "247").convertToDouble()); + // Round up, causing overflow to NaN + EXPECT_TRUE(APFloat(APFloat::Float8E4M3FNUZ(), "248").isNaN()); + // Overflow without rounding + EXPECT_TRUE(APFloat(APFloat::Float8E4M3FNUZ(), "256").isNaN()); + // Inf converted to NaN + EXPECT_TRUE(APFloat(APFloat::Float8E4M3FNUZ(), "inf").isNaN()); + // NaN converted to NaN + EXPECT_TRUE(APFloat(APFloat::Float8E4M3FNUZ(), "nan").isNaN()); + // Negative zero converted to positive zero + EXPECT_TRUE(APFloat(APFloat::Float8E4M3FNUZ(), 
"-0").isPosZero()); +} + TEST(APFloatTest, F8ToString) { for (APFloat::Semantics S : - {APFloat::S_Float8E5M2, APFloat::S_Float8E4M3FN}) { + {APFloat::S_Float8E5M2, APFloat::S_Float8E4M3FN, + APFloat::S_Float8E5M2FNUZ, APFloat::S_Float8E4M3FNUZ}) { SCOPED_TRACE("Semantics=" + std::to_string(S)); for (int i = 0; i < 256; i++) { SCOPED_TRACE("i=" + std::to_string(i)); - APFloat test(APFloat::Float8E5M2(), APInt(8, i)); + APFloat test(APFloat::EnumToSemantics(S), APInt(8, i)); llvm::SmallString<128> str; test.toString(str); if (test.isNaN()) { EXPECT_EQ(str, "NaN"); } else { - APFloat test2(APFloat::Float8E5M2(), str); + APFloat test2(APFloat::EnumToSemantics(S), str); EXPECT_TRUE(test.bitwiseIsEqual(test2)); } } @@ -5458,6 +5864,56 @@ EXPECT_TRUE(std::isnan(QNaN.convertToDouble())); } +TEST(APFloatTest, Float8E5M2FNUZToDouble) { + APFloat One(APFloat::Float8E5M2FNUZ(), "1.0"); + EXPECT_EQ(1.0, One.convertToDouble()); + APFloat Two(APFloat::Float8E5M2FNUZ(), "2.0"); + EXPECT_EQ(2.0, Two.convertToDouble()); + APFloat PosLargest = APFloat::getLargest(APFloat::Float8E5M2FNUZ(), false); + EXPECT_EQ(57344., PosLargest.convertToDouble()); + APFloat NegLargest = APFloat::getLargest(APFloat::Float8E5M2FNUZ(), true); + EXPECT_EQ(-57344., NegLargest.convertToDouble()); + APFloat PosSmallest = + APFloat::getSmallestNormalized(APFloat::Float8E5M2FNUZ(), false); + EXPECT_EQ(0x1.p-15, PosSmallest.convertToDouble()); + APFloat NegSmallest = + APFloat::getSmallestNormalized(APFloat::Float8E5M2FNUZ(), true); + EXPECT_EQ(-0x1.p-15, NegSmallest.convertToDouble()); + + APFloat SmallestDenorm = + APFloat::getSmallest(APFloat::Float8E5M2FNUZ(), false); + EXPECT_TRUE(SmallestDenorm.isDenormal()); + EXPECT_EQ(0x1p-17, SmallestDenorm.convertToDouble()); + + APFloat QNaN = APFloat::getQNaN(APFloat::Float8E5M2FNUZ()); + EXPECT_TRUE(std::isnan(QNaN.convertToDouble())); +} + +TEST(APFloatTest, Float8E4M3FNUZToDouble) { + APFloat One(APFloat::Float8E4M3FNUZ(), "1.0"); + EXPECT_EQ(1.0, 
One.convertToDouble()); + APFloat Two(APFloat::Float8E4M3FNUZ(), "2.0"); + EXPECT_EQ(2.0, Two.convertToDouble()); + APFloat PosLargest = APFloat::getLargest(APFloat::Float8E4M3FNUZ(), false); + EXPECT_EQ(240., PosLargest.convertToDouble()); + APFloat NegLargest = APFloat::getLargest(APFloat::Float8E4M3FNUZ(), true); + EXPECT_EQ(-240., NegLargest.convertToDouble()); + APFloat PosSmallest = + APFloat::getSmallestNormalized(APFloat::Float8E4M3FNUZ(), false); + EXPECT_EQ(0x1.p-7, PosSmallest.convertToDouble()); + APFloat NegSmallest = + APFloat::getSmallestNormalized(APFloat::Float8E4M3FNUZ(), true); + EXPECT_EQ(-0x1.p-7, NegSmallest.convertToDouble()); + + APFloat SmallestDenorm = + APFloat::getSmallest(APFloat::Float8E4M3FNUZ(), false); + EXPECT_TRUE(SmallestDenorm.isDenormal()); + EXPECT_EQ(0x1p-10, SmallestDenorm.convertToDouble()); + + APFloat QNaN = APFloat::getQNaN(APFloat::Float8E4M3FNUZ()); + EXPECT_TRUE(std::isnan(QNaN.convertToDouble())); +} + TEST(APFloatTest, IEEEsingleToFloat) { APFloat FPosZero(0.0F); APFloat FPosZeroToFloat(FPosZero.convertToFloat());