extended-0.1.0/.cargo_vcs_info.json0000644000000001360000000000100126320ustar { "git": { "sha1": "dc18803769dd6702c9cf91b033f91e200c98a684" }, "path_in_vcs": "" }extended-0.1.0/.gitignore000064400000000000000000000000240072674642500134360ustar 00000000000000/target /Cargo.lock extended-0.1.0/Cargo.toml0000644000000015260000000000100106340ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" name = "extended" version = "0.1.0" authors = ["Dietrich Epp "] description = """ Extended precision 80-bit floating-point numbers (f80). """ homepage = "https://github.com/depp/extended-rs" readme = "README.md" keywords = ["float"] categories = ["mathematics"] license = "MIT" repository = "https://github.com/depp/extended-rs" resolver = "2" extended-0.1.0/Cargo.toml.orig000064400000000000000000000005640072674642500143460ustar 00000000000000[package] name = "extended" version = "0.1.0" edition = "2021" authors = ["Dietrich Epp "] readme = "README.md" repository = "https://github.com/depp/extended-rs" homepage = "https://github.com/depp/extended-rs" license = "MIT" description = """ Extended precision 80-bit floating-point numbers (f80). 
""" keywords = ["float"] categories = ["mathematics"] extended-0.1.0/LICENSE.txt000064400000000000000000000020340072674642500132740ustar 00000000000000Copyright 2022 Dietrich Epp Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. extended-0.1.0/README.md000064400000000000000000000016500072674642500127330ustar 00000000000000# 80-bit Extended-Precision Floating-Point Numbers This is a Rust library that provides a type for representing 80-bit extended-precision floating-point numbers. It is licensed under the terms of the MIT license, see [LICENSE.txt](LICENSE.txt) for details. ## Rounding, Infinity, and NaN This library uses round-to-even when converting from 80-bit floats to 64-bit floats. This should be what you’re used to, and what you expect! In round-to-even, when an 80-bit float is exactly half-way between two possible `float64` values, the value with a zero in the least-significant bit is chosen (or the value with the larger exponent is chosen, if the values have different exponents). 
//! Extended-precision 80-bit floating-point numbers (f80).
//!
//! # Rounding, infinity, and NaN
//!
//! Conversion from 80-bit floats to 64-bit floats rounds to nearest, with ties
//! rounded to even. Values which are outside the range of possible `f64`
//! values are rounded to infinity. Infinity and NaN are preserved. Different
//! types of NaN values are not distinguished from each other, but the sign of
//! NaN values is preserved during conversion.

#![warn(missing_docs)]

use std::convert::From;

/// An 80-bit extended floating-point number.
///
/// See Apple Numerics Manual, 2nd edition (1988), p. 18 "SANE Data Types".
#[derive(Debug, Copy, Clone, PartialEq)]
pub struct Extended {
    /// Sign and exponent. The sign is stored as the high bit. The low 15 bits
    /// contain the exponent, with a bias of 16383.
    pub sign_exponent: u16,
    /// Fraction. It includes a ones place as the high bit; the value in the
    /// ones place may be zero.
    pub fraction: u64,
}

// Largest biased exponent of a 64-bit float (all 11 exponent bits set), which
// encodes infinity and NaN.
const MAX_EXPONENT_64: u32 = (1 << 11) - 1;

impl Extended {
    /// Create an extended 80-bit floating-point number from its big endian
    /// representation.
    ///
    /// Bytes 0..2 hold the sign and exponent, bytes 2..10 hold the fraction.
    pub fn from_be_bytes(b: [u8; 10]) -> Self {
        let [s0, s1, f0, f1, f2, f3, f4, f5, f6, f7] = b;
        Extended {
            sign_exponent: u16::from_be_bytes([s0, s1]),
            fraction: u64::from_be_bytes([f0, f1, f2, f3, f4, f5, f6, f7]),
        }
    }

    /// Create an extended 80-bit floating-point number from its little endian
    /// representation.
    ///
    /// Bytes 0..8 hold the fraction, bytes 8..10 hold the sign and exponent.
    pub fn from_le_bytes(b: [u8; 10]) -> Self {
        let [f0, f1, f2, f3, f4, f5, f6, f7, s0, s1] = b;
        Extended {
            sign_exponent: u16::from_le_bytes([s0, s1]),
            fraction: u64::from_le_bytes([f0, f1, f2, f3, f4, f5, f6, f7]),
        }
    }

    /// Convert an 80-bit floating-point number to its big endian
    /// representation.
    pub fn to_be_bytes(&self) -> [u8; 10] {
        let mut b = [0u8; 10];
        b[0..2].copy_from_slice(&self.sign_exponent.to_be_bytes());
        b[2..10].copy_from_slice(&self.fraction.to_be_bytes());
        b
    }

    /// Convert an 80-bit floating-point number to its little endian
    /// representation.
    pub fn to_le_bytes(&self) -> [u8; 10] {
        let mut b = [0u8; 10];
        b[0..8].copy_from_slice(&self.fraction.to_le_bytes());
        b[8..10].copy_from_slice(&self.sign_exponent.to_le_bytes());
        b
    }

    /// Convert to a 64-bit floating-point number.
    ///
    /// The result is rounded to nearest, ties to even. Values too large for
    /// `f64` become infinity; values too small become a subnormal or zero.
    /// The sign is always preserved, including for zero, infinity, and NaN.
    ///
    /// When the exponent field is all ones, the value is infinity if the 63
    /// fraction bits below the ones place are all zero, and NaN otherwise.
    /// The ones place itself is ignored in that case, so both
    /// `7fff:0000000000000000` and `7fff:8000000000000000` are infinity.
    pub fn to_f64(&self) -> f64 {
        const INFINITY: u64 = (MAX_EXPONENT_64 as u64) << 52;
        const NAN: u64 = u64::MAX >> 1;
        // Selects the 63 fraction bits below the explicit ones place.
        const FRACTION_MASK: u64 = u64::MAX >> 1;

        let exponent = i32::from(self.sign_exponent) & 0x7fff;
        let bits = if exponent == 0x7fff {
            if self.fraction & FRACTION_MASK == 0 {
                INFINITY
            } else {
                NAN
            }
        } else if self.fraction == 0 {
            0
        } else {
            // Normalize so that the ones place is set:
            //
            // 2^(e64 - 1023) * 1.fraction
            //     = 2^(e80 - 16383) * 1.fraction / 2^nzero
            // e64 - 1023 = e80 - 16383 - nzero
            // e64 = e80 - 16383 + 1023 - nzero
            let nzero = self.fraction.leading_zeros();
            let exponent = exponent - 16383 + 1023 - (nzero as i32);
            let fraction = self.fraction << nzero;
            // Fraction is now of the form 1.xxxxx.
            if exponent <= 0 {
                // Subnormal numbers. The implicit leading one of a normal
                // number must be shifted into the 52-bit mantissa, and once
                // more for each step the exponent is below 1.
                let shift = 12 - exponent;
                // `rem` holds the discarded bits, left-aligned, so that
                // exactly one half is 1 << 63.
                let (fraction, rem) = if shift > 64 {
                    (0, 0)
                } else if shift == 64 {
                    (0, fraction)
                } else {
                    (fraction >> shift, fraction << (64 - shift))
                };
                // The (fraction & 1) makes this round to even. A carry out of
                // the mantissa produces the smallest normal number, which has
                // exactly the right bit pattern.
                if (rem | (fraction & 1)) <= (1 << 63) {
                    fraction
                } else {
                    fraction + 1
                }
            } else {
                // Round it to 52 bits. The addition of ((fraction >> 11) & 1)
                // makes this round to even.
                let rem = (fraction & ((1 << 11) - 1)) | ((fraction >> 11) & 1);
                let fraction = (fraction >> 11) & ((1 << 52) - 1);
                let (exponent, fraction) = if rem <= (1 << 10) {
                    (exponent, fraction)
                } else if fraction < (1 << 52) - 1 {
                    (exponent, fraction + 1)
                } else {
                    // Mantissa overflowed: carry into the exponent.
                    (exponent + 1, 0)
                };
                if exponent >= (MAX_EXPONENT_64 as i32) {
                    // Out of range.
                    INFINITY
                } else {
                    fraction | ((exponent as u64) << 52)
                }
            }
        };
        let sign = (u64::from(self.sign_exponent) & 0x8000) << 48;
        f64::from_bits(bits | sign)
    }
}

/// Exact conversion from a 64-bit float. Infinity is encoded with a zero
/// fraction and every NaN is encoded with an all-ones fraction; the sign is
/// preserved.
impl From<f64> for Extended {
    fn from(x: f64) -> Self {
        let bits = x.to_bits();
        // Move the sign from bit 63 down to bit 15.
        let sign = ((bits >> (63 - 15)) as u32) & 0x8000;
        let exponent = ((bits >> 52) as u32) & MAX_EXPONENT_64;
        let mantissa = bits & ((1 << 52) - 1);
        if exponent == 0 {
            // Zero or subnormal.
            // Number is (-1)^sign * 2^-1022 * 0.mantissa.
            if mantissa == 0 {
                Extended {
                    sign_exponent: sign as u16,
                    fraction: 0,
                }
            } else {
                // Normalize: shift the highest set bit of the 52-bit mantissa
                // into the ones place (bit 63). A mantissa with only its top
                // bit set has nzero = 12 and represents 2^-1023, so
                // e80 = 16383 - 1022 + 11 - nzero.
                let nzero = mantissa.leading_zeros();
                let exponent = 16383 - 1022 + 11 - nzero;
                Extended {
                    sign_exponent: (sign | exponent) as u16,
                    fraction: mantissa << nzero,
                }
            }
        } else if exponent == MAX_EXPONENT_64 {
            // Infinity or NaN.
            Extended {
                sign_exponent: (sign | 0x7fff) as u16,
                fraction: if mantissa == 0 { 0 } else { u64::MAX },
            }
        } else {
            // 2^(e64 - 1023) * 1.fraction = 2^(e80 - 16383) * 1.fraction
            // e64 - 1023 = e80 - 16383
            // e80 = e64 + 16383 - 1023
            let exponent = exponent + 16383 - 1023;
            Extended {
                sign_exponent: (sign | exponent) as u16,
                // Make the implicit leading one explicit.
                fraction: (1 << 63) | (mantissa << 11),
            }
        }
    }
}

/// Exact conversion, performed by widening to `f64` first.
impl From<f32> for Extended {
    fn from(x: f32) -> Self {
        f64::from(x).into()
    }
}

/// Exact conversion, performed by widening to `f64` first.
impl From<i32> for Extended {
    fn from(x: i32) -> Self {
        f64::from(x).into()
    }
}

/// Exact conversion, performed by widening to `f64` first.
impl From<u32> for Extended {
    fn from(x: u32) -> Self {
        f64::from(x).into()
    }
}

#[cfg(test)]
mod test {
    use super::*;

    // Like ==, except that any NaN is considered equal to any other NaN.
    fn equal_f64(x: f64, y: f64) -> bool {
        if x.is_nan() {
            y.is_nan()
        } else {
            x == y
        }
    }

    #[test]
    fn test_to_f64() {
        const CASES: &[(u16, u64, f64)] = &[
            // Easy.
            (16383, 1 << 63, 1.0),
            (16384, 1 << 63, 2.0),
            (16382, 1 << 63, 0.5),
            // Next after 1.0.
            (16383, (1 << 63) + (1 << 11), 1.0000000000000002),
            // Rounds to even.
            (16383, (1 << 63) + (1 << 10), 1.0),
            (16383, (1 << 63) + (1 << 10) + 1, 1.0000000000000002),
            (16383, (1 << 63) + (1 << 11), 1.0000000000000002),
            (16383, (1 << 63) + (3 << 10) - 1, 1.0000000000000002),
            (16383, (1 << 63) + (3 << 10), 1.0000000000000004),
            // Rounds to next exponent.
            (16381, u64::MAX, 0.5),
            // Is infinity, with or without the ones place set.
            (32767, 0, f64::INFINITY),
            (32767, 1 << 63, f64::INFINITY),
            // Out of range.
            (32000, 1 << 63, f64::INFINITY),
            (32000, u64::MAX, f64::INFINITY),
            (17406, 0xfffffffffffff800, 1.7976931348623157e+308),
            (17406, 0xfffffffffffffbff, 1.7976931348623157e+308),
            (17406, 0xfffffffffffffc00, f64::INFINITY),
            // Zero.
            (0, 0, 0.0),
            // NaN.
            (32767, 1, f64::NAN),
            (32767, (1 << 63) | 1, f64::NAN),
            (32767, 3 << 62, f64::NAN),
            (32767, u64::MAX, f64::NAN),
            // Smallest normal.
            (15361, 1 << 63, 2.2250738585072014e-308),
            // Subnormal.
            (15360, 1 << 63, 1.1125369292536007e-308),
            // Smallest subnormal.
            (15309, 1 << 63, 5e-324),
            // Rounds up to smallest subnormal.
            (15308, (1 << 63) + 1, 5e-324),
            (15308, 1 << 63, 0.0),
            // Very small.
            (10000, 1 << 63, 0.0),
        ];
        let mut failed = false;
        for (n, &(exponent, fraction, expect)) in CASES.iter().enumerate() {
            for sign in 0..2 {
                let exponent = exponent | ((sign as u16) << 15);
                let fin = Extended {
                    sign_exponent: exponent,
                    fraction,
                };
                let fout = fin.to_f64();
                let expect = if sign == 0 { expect } else { -expect };
                if !equal_f64(fout, expect) {
                    failed = true;
                    eprintln!(
                        "Case {}: Input = {:04x}:{:016x}, Output = {:?}, Expected = {:?}",
                        n, exponent, fraction, fout, expect
                    );
                }
            }
        }
        if failed {
            panic!("test failed");
        }
    }

    #[test]
    fn test_from_f64() {
        const CASES: &[(u16, u64, f64)] = &[
            // Easy.
            (16383, 1 << 63, 1.0),
            (16384, 1 << 63, 2.0),
            (16382, 1 << 63, 0.5),
            (16383 - 10, 1 << 63, 0.0009765625),
            (16383 - 100, 1 << 63, 7.888609052210118e-31),
            // Next after 1.0.
            (16383, (1 << 63) + (1 << 11), 1.0000000000000002),
            // Is infinity.
            (32767, 0, f64::INFINITY),
            // Zero.
            (0, 0, 0.0),
            // NaN.
            (32767, u64::MAX, f64::NAN),
            // Smallest normal.
            (15361, 1 << 63, 2.2250738585072014e-308),
            // Subnormal.
            (15360, 1 << 63, 1.1125369292536007e-308),
            // Smallest subnormal.
            (15309, 1 << 63, 5e-324),
        ];
        let mut failed = false;
        for (n, &(exponent, fraction, fin)) in CASES.iter().enumerate() {
            for sign in 0..2 {
                let exponent = exponent | ((sign as u16) << 15);
                let fin = if sign == 0 { fin } else { -fin };
                let fout = Extended::from(fin);
                let expect = Extended {
                    sign_exponent: exponent,
                    fraction,
                };
                if fout != expect {
                    failed = true;
                    eprintln!(
                        "Case {}: Input = {:?}, Output = {:04x}:{:016x}, Expected = {:04x}:{:016x}",
                        n,
                        fin,
                        fout.sign_exponent,
                        fout.fraction,
                        expect.sign_exponent,
                        expect.fraction
                    );
                    continue;
                }
                // Round-trip sanity check.
                let rev = fout.to_f64();
                if !equal_f64(fin, rev) {
                    failed = true;
                    eprintln!(
                        "Case {}: Round trip faied: {:?} -> {:04x}:{:016x} -> {:?}",
                        n, fin, fout.sign_exponent, fout.fraction, rev
                    );
                }
            }
        }
        if failed {
            panic!("test failed");
        }
    }
}