blob: 9504d0daa0edb87e5afc7d7dd126d44b024b54f7 [file]
/*
* Floating point intermediate representation
*
* The code in this source file is derived from release 2a of the SoftFloat
* IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
* some later contributions) are provided under that license, as detailed below.
* It has subsequently been modified by contributors to the QEMU Project,
* so some portions are provided under:
* the SoftFloat-2a license
* the BSD license
* GPL-v2-or-later
*
* Any future contributions to this file after December 1st 2014 will be
* taken to be licensed under the Softfloat-2a license unless specifically
* indicated otherwise.
*/
#ifndef SOFTFLOAT_PARTS_H
#define SOFTFLOAT_PARTS_H
/* Format-specific handling of exp == exp_max */
typedef enum __attribute__((__packed__)) {
/* exp==max, frac==0 ? infinity : nan; this is ieee standard. */
float_expmax_ieee,
/* exp==max is a normal number; no infinity or nan representation. */
float_expmax_normal,
/* exp==max, frac==max ? nan : normal; no infinity representation. */
float_expmax_e4m3,
} FloatFmtExpMaxKind;
/*
* Structure holding all of the relevant parameters for a format.
* exp_size: the size of the exponent field
* exp_bias: the offset applied to the exponent field
* exp_max: the maximum normalised exponent
* frac_size: the size of the fraction field
* frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
* The following are computed based the size of fraction
* round_mask: bits below lsb which must be rounded
* The following optional modifiers are available:
* exp_max_kind: affects how exp == exp_max is interpreted
* has_explicit_bit: has an explicit integer bit; this affects whether
* the float_status floatx80_behaviour handling applies
* overflow_raises_invalid: for float_expmax_normal, raise invalid
* instead of overflow.
*/
typedef struct {
int exp_size;
int exp_bias;
int exp_re_bias;
int exp_max;
int frac_size;
int frac_shift;
FloatFmtExpMaxKind exp_max_kind;
bool has_explicit_bit;
bool overflow_raises_invalid;
uint64_t round_mask;
} FloatFmt;
extern const FloatFmt float4_e2m1_params;
extern const FloatFmt float8_e4m3_params;
extern const FloatFmt float8_e5m2_params;
extern const FloatFmt float16_params;
extern const FloatFmt bfloat16_params;
extern const FloatFmt float32_params;
extern const FloatFmt float64_params;
extern const FloatFmt float128_params;
/*
* Classify a floating point number. Everything above float_class_qnan
* is a NaN so cls >= float_class_qnan is any NaN.
*
* Note that we canonicalize denormals, so most code should treat
* class_normal and class_denormal identically.
*/
typedef enum __attribute__ ((__packed__)) {
float_class_unclassified,
float_class_zero,
float_class_normal,
float_class_denormal, /* input was a non-squashed denormal */
float_class_inf,
float_class_qnan, /* all NaNs from here */
float_class_snan,
} FloatClass;
#define float_cmask(bit) (1u << (bit))
enum {
float_cmask_zero = float_cmask(float_class_zero),
float_cmask_normal = float_cmask(float_class_normal),
float_cmask_denormal = float_cmask(float_class_denormal),
float_cmask_inf = float_cmask(float_class_inf),
float_cmask_qnan = float_cmask(float_class_qnan),
float_cmask_snan = float_cmask(float_class_snan),
float_cmask_infzero = float_cmask_zero | float_cmask_inf,
float_cmask_anynan = float_cmask_qnan | float_cmask_snan,
float_cmask_anynorm = float_cmask_normal | float_cmask_denormal,
};
/*
* Structure holding all of the decomposed parts of a float.
* The exponent is unbiased and the fraction is normalized.
*
* The fraction words are stored in big-endian word ordering,
* so that truncation from a larger format to a smaller format
* can be done simply by ignoring subsequent elements.
*/
typedef struct {
FloatClass cls;
bool sign;
int32_t exp;
union {
/* Routines that know the structure may reference the singular name. */
uint64_t frac;
/*
* Routines expanded with multiple structures reference "hi" and "lo"
* depending on the operation. In FloatParts64, "hi" and "lo" are
* both the same word and aliased here.
*/
uint64_t frac_hi;
uint64_t frac_lo;
};
} FloatParts64;
typedef struct {
FloatClass cls;
bool sign;
int32_t exp;
uint64_t frac_hi;
uint64_t frac_lo;
} FloatParts128;
/*
* Unpack routines from a specific floating-point format.
*/
FloatParts64 float4_e2m1_unpack_canonical(float4_e2m1 f, float_status *s);
FloatParts64 float8_e4m3_unpack_canonical(float8_e4m3 f, float_status *s);
FloatParts64 float8_e5m2_unpack_canonical(float8_e5m2 f, float_status *s);
FloatParts64 float16_unpack_canonical(float16 f, float_status *s);
FloatParts64 bfloat16_unpack_canonical(bfloat16 f, float_status *s);
FloatParts64 float32_unpack_canonical(float32 f, float_status *s);
FloatParts64 float64_unpack_canonical(float64 f, float_status *s);
FloatParts128 float128_unpack_canonical(float128 f, float_status *s);
/* Returns false if the encoding is invalid. */
bool floatx80_unpack_canonical(FloatParts128 *p, floatx80 f, float_status *s);
/*
* Pack routines to a specific floating-point format.
*/
float8_e4m3 float8_e4m3_round_pack_canonical(FloatParts64 *p, float_status *s,
bool saturate);
float8_e5m2 float8_e5m2_round_pack_canonical(FloatParts64 *p, float_status *s,
bool saturate);
float16 float16_round_pack_canonical(FloatParts64 *p, float_status *s);
bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p, float_status *s);
float32 float32_round_pack_canonical(FloatParts64 *p, float_status *s);
float64 float64_round_pack_canonical(FloatParts64 *p, float_status *s);
float128 float128_round_pack_canonical(FloatParts128 *p, float_status *s);
floatx80 floatx80_round_pack_canonical(FloatParts128 *p, float_status *s);
/*
* NaN handling
*/
FloatParts64 parts64_default_nan(float_status *status);
FloatParts128 parts128_default_nan(float_status *status);
FloatParts64 parts64_pick_nan(const FloatParts64 *, const FloatParts64 *,
float_status *);
FloatParts128 parts128_pick_nan(const FloatParts128 *, const FloatParts128 *,
float_status *);
FloatParts64 parts64_return_nan(const FloatParts64 *a, float_status *s);
FloatParts128 parts128_return_nan(const FloatParts128 *a, float_status *s);
/*
* Operations
*/
FloatParts64 parts64_addsub(const FloatParts64 *a, const FloatParts64 *b,
float_status *s, bool subtract);
FloatParts128 parts128_addsub(const FloatParts128 *a, const FloatParts128 *b,
float_status *s, bool subtract);
FloatRelation parts64_compare(const FloatParts64 *a, const FloatParts64 *b,
float_status *s, bool quiet);
FloatRelation parts128_compare(const FloatParts128 *a, const FloatParts128 *b,
float_status *s, bool quiet);
FloatParts64 parts64_div(const FloatParts64 *a, const FloatParts64 *b,
float_status *s);
FloatParts128 parts128_div(const FloatParts128 *a, const FloatParts128 *b,
float_status *s);
FloatParts64 parts64_mul(const FloatParts64 *a, const FloatParts64 *b,
float_status *s);
FloatParts128 parts128_mul(const FloatParts128 *a, const FloatParts128 *b,
float_status *s);
FloatParts64 parts64_round_to_int(const FloatParts64 *a,
FloatRoundMode rmode,
int scale, float_status *s,
const FloatFmt *fmt);
FloatParts128 parts128_round_to_int(const FloatParts128 *a,
FloatRoundMode rmode,
int scale, float_status *s,
const FloatFmt *fmt);
FloatParts64 parts64_round_to_fmt(const FloatParts64 *p, float_status *s,
const FloatFmt *fmt);
#endif