| /* |
| * Tiny arbitrary precision floating point library |
| * |
| * Copyright (c) 2017-2021 Fabrice Bellard |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to deal |
| * in the Software without restriction, including without limitation the rights |
| * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| * copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included in |
| * all copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| * THE SOFTWARE. |
| */ |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <inttypes.h> |
| #include <math.h> |
| #include <string.h> |
| #include <assert.h> |
| |
| #ifdef __AVX2__ |
| #include <immintrin.h> |
| #endif |
| |
| #include "cutils.h" |
| #include "libbf.h" |
| |
| /* enable it to check the multiplication result */ |
| //#define USE_MUL_CHECK |
| /* enable it to use FFT/NTT multiplication */ |
| #define USE_FFT_MUL |
| /* enable decimal floating point support */ |
| #define USE_BF_DEC |
| |
| //#define inline __attribute__((always_inline)) |
| |
#define FFT_MUL_THRESHOLD 100 /* in limbs of the smallest factor */
| |
| /* XXX: adjust */ |
| #define DIVNORM_LARGE_THRESHOLD 50 |
| #define UDIV1NORM_THRESHOLD 3 |
| |
| #if LIMB_BITS == 64 |
| #define FMT_LIMB1 "%" PRIx64 |
| #define FMT_LIMB "%016" PRIx64 |
| #define PRId_LIMB PRId64 |
| #define PRIu_LIMB PRIu64 |
| |
| #else |
| |
| #define FMT_LIMB1 "%x" |
| #define FMT_LIMB "%08x" |
| #define PRId_LIMB "d" |
| #define PRIu_LIMB "u" |
| |
| #endif |
| |
| typedef intptr_t mp_size_t; |
| |
| typedef int bf_op2_func_t(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, |
| bf_flags_t flags); |
| |
| #ifdef USE_FFT_MUL |
| |
| #define FFT_MUL_R_OVERLAP_A (1 << 0) |
| #define FFT_MUL_R_OVERLAP_B (1 << 1) |
| #define FFT_MUL_R_NORESIZE (1 << 2) |
| |
| static no_inline int fft_mul(bf_context_t *s, |
| bf_t *res, limb_t *a_tab, limb_t a_len, |
| limb_t *b_tab, limb_t b_len, int mul_flags); |
| static void fft_clear_cache(bf_context_t *s); |
| #endif |
| #ifdef USE_BF_DEC |
| static limb_t get_digit(const limb_t *tab, limb_t len, slimb_t pos); |
| #endif |
| |
| |
/* count leading zeros */
| static inline int clz(limb_t a) |
| { |
| if (a == 0) { |
| return LIMB_BITS; |
| } else { |
| #if LIMB_BITS == 64 |
| return clz64(a); |
| #else |
| return clz32(a); |
| #endif |
| } |
| } |
| |
| static inline int ctz(limb_t a) |
| { |
| if (a == 0) { |
| return LIMB_BITS; |
| } else { |
| #if LIMB_BITS == 64 |
| return ctz64(a); |
| #else |
| return ctz32(a); |
| #endif |
| } |
| } |
| |
| static inline int ceil_log2(limb_t a) |
| { |
| if (a <= 1) |
| return 0; |
| else |
| return LIMB_BITS - clz(a - 1); |
| } |
| |
| /* b must be >= 1 */ |
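/* e.g. ceil_div(7, 2) = 4 and ceil_div(-7, 2) = -3 */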
| static inline slimb_t ceil_div(slimb_t a, slimb_t b) |
| { |
| if (a >= 0) |
| return (a + b - 1) / b; |
| else |
| return a / b; |
| } |
| |
| /* b must be >= 1 */ |
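/* e.g. floor_div(-7, 2) = -4, whereas -7 / 2 = -3 in C */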
| static inline slimb_t floor_div(slimb_t a, slimb_t b) |
| { |
| if (a >= 0) { |
| return a / b; |
| } else { |
| return (a - b + 1) / b; |
| } |
| } |
| |
/* return r = a modulo b (0 <= r <= b - 1). b must be >= 1 */
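/* e.g. smod(-7, 2) = 1 */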
| static inline limb_t smod(slimb_t a, slimb_t b) |
| { |
| a = a % (slimb_t)b; |
| if (a < 0) |
| a += b; |
| return a; |
| } |
| |
| /* signed addition with saturation */ |
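/* e.g. with 64-bit limbs, sat_add(INT64_MAX, 1) = INT64_MAX and
   sat_add(INT64_MIN, -1) = INT64_MIN */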
| static inline slimb_t sat_add(slimb_t a, slimb_t b) |
| { |
| slimb_t r; |
| r = a + b; |
| /* overflow ? */ |
| if (((a ^ r) & (b ^ r)) < 0) |
| r = (a >> (LIMB_BITS - 1)) ^ (((limb_t)1 << (LIMB_BITS - 1)) - 1); |
| return r; |
| } |
| |
| #define malloc(s) malloc_is_forbidden(s) |
| #define free(p) free_is_forbidden(p) |
| #define realloc(p, s) realloc_is_forbidden(p, s) |
| |
| void bf_context_init(bf_context_t *s, bf_realloc_func_t *realloc_func, |
| void *realloc_opaque) |
| { |
| memset(s, 0, sizeof(*s)); |
| s->realloc_func = realloc_func; |
| s->realloc_opaque = realloc_opaque; |
| } |
| |
| void bf_context_end(bf_context_t *s) |
| { |
| bf_clear_cache(s); |
| } |
| |
| void bf_init(bf_context_t *s, bf_t *r) |
| { |
| r->ctx = s; |
| r->sign = 0; |
| r->expn = BF_EXP_ZERO; |
| r->len = 0; |
| r->tab = NULL; |
| } |
| |
| /* return 0 if OK, -1 if alloc error */ |
| int bf_resize(bf_t *r, limb_t len) |
| { |
| limb_t *tab; |
| |
| if (len != r->len) { |
| tab = bf_realloc(r->ctx, r->tab, len * sizeof(limb_t)); |
| if (!tab && len != 0) |
| return -1; |
| r->tab = tab; |
| r->len = len; |
| } |
| return 0; |
| } |
| |
| /* return 0 or BF_ST_MEM_ERROR */ |
| int bf_set_ui(bf_t *r, uint64_t a) |
| { |
| r->sign = 0; |
| if (a == 0) { |
| r->expn = BF_EXP_ZERO; |
| bf_resize(r, 0); /* cannot fail */ |
| } |
| #if LIMB_BITS == 32 |
| else if (a <= 0xffffffff) |
| #else |
| else |
| #endif |
| { |
| int shift; |
| if (bf_resize(r, 1)) |
| goto fail; |
| shift = clz(a); |
| r->tab[0] = a << shift; |
| r->expn = LIMB_BITS - shift; |
| } |
| #if LIMB_BITS == 32 |
| else { |
| uint32_t a1, a0; |
| int shift; |
| if (bf_resize(r, 2)) |
| goto fail; |
| a0 = a; |
| a1 = a >> 32; |
| shift = clz(a1); |
| r->tab[0] = a0 << shift; |
| r->tab[1] = (a1 << shift) | (a0 >> (LIMB_BITS - shift)); |
| r->expn = 2 * LIMB_BITS - shift; |
| } |
| #endif |
| return 0; |
| fail: |
| bf_set_nan(r); |
| return BF_ST_MEM_ERROR; |
| } |
| |
| /* return 0 or BF_ST_MEM_ERROR */ |
| int bf_set_si(bf_t *r, int64_t a) |
| { |
| int ret; |
| |
| if (a < 0) { |
| ret = bf_set_ui(r, -a); |
| r->sign = 1; |
| } else { |
| ret = bf_set_ui(r, a); |
| } |
| return ret; |
| } |
| |
| void bf_set_nan(bf_t *r) |
| { |
| bf_resize(r, 0); /* cannot fail */ |
| r->expn = BF_EXP_NAN; |
| r->sign = 0; |
| } |
| |
| void bf_set_zero(bf_t *r, int is_neg) |
| { |
| bf_resize(r, 0); /* cannot fail */ |
| r->expn = BF_EXP_ZERO; |
| r->sign = is_neg; |
| } |
| |
| void bf_set_inf(bf_t *r, int is_neg) |
| { |
| bf_resize(r, 0); /* cannot fail */ |
| r->expn = BF_EXP_INF; |
| r->sign = is_neg; |
| } |
| |
| /* return 0 or BF_ST_MEM_ERROR */ |
| int bf_set(bf_t *r, const bf_t *a) |
| { |
| if (r == a) |
| return 0; |
| if (bf_resize(r, a->len)) { |
| bf_set_nan(r); |
| return BF_ST_MEM_ERROR; |
| } |
| r->sign = a->sign; |
| r->expn = a->expn; |
| memcpy(r->tab, a->tab, a->len * sizeof(limb_t)); |
| return 0; |
| } |
| |
| /* equivalent to bf_set(r, a); bf_delete(a) */ |
| void bf_move(bf_t *r, bf_t *a) |
| { |
| bf_context_t *s = r->ctx; |
| if (r == a) |
| return; |
| bf_free(s, r->tab); |
| *r = *a; |
| } |
| |
| static limb_t get_limbz(const bf_t *a, limb_t idx) |
| { |
| if (idx >= a->len) |
| return 0; |
| else |
| return a->tab[idx]; |
| } |
| |
/* get LIMB_BITS bits at bit position 'pos' in tab */
| static inline limb_t get_bits(const limb_t *tab, limb_t len, slimb_t pos) |
| { |
| limb_t i, a0, a1; |
| int p; |
| |
| i = pos >> LIMB_LOG2_BITS; |
| p = pos & (LIMB_BITS - 1); |
| if (i < len) |
| a0 = tab[i]; |
| else |
| a0 = 0; |
| if (p == 0) { |
| return a0; |
| } else { |
| i++; |
| if (i < len) |
| a1 = tab[i]; |
| else |
| a1 = 0; |
| return (a0 >> p) | (a1 << (LIMB_BITS - p)); |
| } |
| } |
| |
| static inline limb_t get_bit(const limb_t *tab, limb_t len, slimb_t pos) |
| { |
| slimb_t i; |
| i = pos >> LIMB_LOG2_BITS; |
| if (i < 0 || i >= len) |
| return 0; |
| return (tab[i] >> (pos & (LIMB_BITS - 1))) & 1; |
| } |
| |
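/* mask with bits start..last set, e.g. limb_mask(4, 7) = 0xf0 */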
| static inline limb_t limb_mask(int start, int last) |
| { |
| limb_t v; |
| int n; |
| n = last - start + 1; |
| if (n == LIMB_BITS) |
| v = -1; |
| else |
| v = (((limb_t)1 << n) - 1) << start; |
| return v; |
| } |
| |
| static limb_t mp_scan_nz(const limb_t *tab, mp_size_t n) |
| { |
| mp_size_t i; |
| for(i = 0; i < n; i++) { |
| if (tab[i] != 0) |
| return 1; |
| } |
| return 0; |
| } |
| |
/* return != 0 if at least one bit between positions 0 and bit_pos
   (inclusive) is non-zero. */
| static inline limb_t scan_bit_nz(const bf_t *r, slimb_t bit_pos) |
| { |
| slimb_t pos; |
| limb_t v; |
| |
| pos = bit_pos >> LIMB_LOG2_BITS; |
| if (pos < 0) |
| return 0; |
| v = r->tab[pos] & limb_mask(0, bit_pos & (LIMB_BITS - 1)); |
| if (v != 0) |
| return 1; |
| pos--; |
| while (pos >= 0) { |
| if (r->tab[pos] != 0) |
| return 1; |
| pos--; |
| } |
| return 0; |
| } |
| |
| /* return the addend for rounding. Note that prec can be <= 0 (for |
| BF_FLAG_RADPNT_PREC) */ |
| static int bf_get_rnd_add(int *pret, const bf_t *r, limb_t l, |
| slimb_t prec, int rnd_mode) |
| { |
| int add_one, inexact; |
| limb_t bit1, bit0; |
| |
| if (rnd_mode == BF_RNDF) { |
| bit0 = 1; /* faithful rounding does not honor the INEXACT flag */ |
| } else { |
| /* starting limb for bit 'prec + 1' */ |
| bit0 = scan_bit_nz(r, l * LIMB_BITS - 1 - bf_max(0, prec + 1)); |
| } |
| |
| /* get the bit at 'prec' */ |
| bit1 = get_bit(r->tab, l, l * LIMB_BITS - 1 - prec); |
| inexact = (bit1 | bit0) != 0; |
| |
| add_one = 0; |
| switch(rnd_mode) { |
| case BF_RNDZ: |
| break; |
| case BF_RNDN: |
| if (bit1) { |
| if (bit0) { |
| add_one = 1; |
| } else { |
| /* round to even */ |
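                /* e.g. keeping 4 bits: 1011|10 is a tie and rounds up
                   to 1100 (kept part odd), while 1010|10 stays 1010 */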
| add_one = |
| get_bit(r->tab, l, l * LIMB_BITS - 1 - (prec - 1)); |
| } |
| } |
| break; |
| case BF_RNDD: |
| case BF_RNDU: |
| if (r->sign == (rnd_mode == BF_RNDD)) |
| add_one = inexact; |
| break; |
| case BF_RNDA: |
| add_one = inexact; |
| break; |
| case BF_RNDNA: |
| case BF_RNDF: |
| add_one = bit1; |
| break; |
| default: |
| abort(); |
| } |
| |
| if (inexact) |
| *pret |= BF_ST_INEXACT; |
| return add_one; |
| } |
| |
| static int bf_set_overflow(bf_t *r, int sign, limb_t prec, bf_flags_t flags) |
| { |
| slimb_t i, l, e_max; |
| int rnd_mode; |
| |
| rnd_mode = flags & BF_RND_MASK; |
| if (prec == BF_PREC_INF || |
| rnd_mode == BF_RNDN || |
| rnd_mode == BF_RNDNA || |
| rnd_mode == BF_RNDA || |
| (rnd_mode == BF_RNDD && sign == 1) || |
| (rnd_mode == BF_RNDU && sign == 0)) { |
| bf_set_inf(r, sign); |
| } else { |
| /* set to maximum finite number */ |
| l = (prec + LIMB_BITS - 1) / LIMB_BITS; |
| if (bf_resize(r, l)) { |
| bf_set_nan(r); |
| return BF_ST_MEM_ERROR; |
| } |
| r->tab[0] = limb_mask((-prec) & (LIMB_BITS - 1), |
| LIMB_BITS - 1); |
| for(i = 1; i < l; i++) |
| r->tab[i] = (limb_t)-1; |
| e_max = (limb_t)1 << (bf_get_exp_bits(flags) - 1); |
| r->expn = e_max; |
| r->sign = sign; |
| } |
| return BF_ST_OVERFLOW | BF_ST_INEXACT; |
| } |
| |
| /* round to prec1 bits assuming 'r' is non zero and finite. 'r' is |
| assumed to have length 'l' (1 <= l <= r->len). Note: 'prec1' can be |
| infinite (BF_PREC_INF). 'ret' is 0 or BF_ST_INEXACT if the result |
| is known to be inexact. Can fail with BF_ST_MEM_ERROR in case of |
| overflow not returning infinity. */ |
| static int __bf_round(bf_t *r, limb_t prec1, bf_flags_t flags, limb_t l, |
| int ret) |
| { |
| limb_t v, a; |
| int shift, add_one, rnd_mode; |
| slimb_t i, bit_pos, pos, e_min, e_max, e_range, prec; |
| |
| /* e_min and e_max are computed to match the IEEE 754 conventions */ |
| e_range = (limb_t)1 << (bf_get_exp_bits(flags) - 1); |
| e_min = -e_range + 3; |
| e_max = e_range; |
| |
| if (flags & BF_FLAG_RADPNT_PREC) { |
| /* 'prec' is the precision after the radix point */ |
| if (prec1 != BF_PREC_INF) |
| prec = r->expn + prec1; |
| else |
| prec = prec1; |
| } else if (unlikely(r->expn < e_min) && (flags & BF_FLAG_SUBNORMAL)) { |
| /* restrict the precision in case of potentially subnormal |
| result */ |
| assert(prec1 != BF_PREC_INF); |
| prec = prec1 - (e_min - r->expn); |
| } else { |
| prec = prec1; |
| } |
| |
| /* round to prec bits */ |
| rnd_mode = flags & BF_RND_MASK; |
| add_one = bf_get_rnd_add(&ret, r, l, prec, rnd_mode); |
| |
| if (prec <= 0) { |
| if (add_one) { |
| bf_resize(r, 1); /* cannot fail */ |
| r->tab[0] = (limb_t)1 << (LIMB_BITS - 1); |
| r->expn += 1 - prec; |
| ret |= BF_ST_UNDERFLOW | BF_ST_INEXACT; |
| return ret; |
| } else { |
| goto underflow; |
| } |
| } else if (add_one) { |
| limb_t carry; |
| |
| /* add one starting at digit 'prec - 1' */ |
| bit_pos = l * LIMB_BITS - 1 - (prec - 1); |
| pos = bit_pos >> LIMB_LOG2_BITS; |
| carry = (limb_t)1 << (bit_pos & (LIMB_BITS - 1)); |
| |
| for(i = pos; i < l; i++) { |
| v = r->tab[i] + carry; |
| carry = (v < carry); |
| r->tab[i] = v; |
| if (carry == 0) |
| break; |
| } |
| if (carry) { |
| /* shift right by one digit */ |
| v = 1; |
| for(i = l - 1; i >= pos; i--) { |
| a = r->tab[i]; |
| r->tab[i] = (a >> 1) | (v << (LIMB_BITS - 1)); |
| v = a; |
| } |
| r->expn++; |
| } |
| } |
| |
| /* check underflow */ |
| if (unlikely(r->expn < e_min)) { |
| if (flags & BF_FLAG_SUBNORMAL) { |
| /* if inexact, also set the underflow flag */ |
| if (ret & BF_ST_INEXACT) |
| ret |= BF_ST_UNDERFLOW; |
| } else { |
| underflow: |
| ret |= BF_ST_UNDERFLOW | BF_ST_INEXACT; |
| bf_set_zero(r, r->sign); |
| return ret; |
| } |
| } |
| |
| /* check overflow */ |
| if (unlikely(r->expn > e_max)) |
| return bf_set_overflow(r, r->sign, prec1, flags); |
| |
| /* keep the bits starting at 'prec - 1' */ |
| bit_pos = l * LIMB_BITS - 1 - (prec - 1); |
| i = bit_pos >> LIMB_LOG2_BITS; |
| if (i >= 0) { |
| shift = bit_pos & (LIMB_BITS - 1); |
| if (shift != 0) |
| r->tab[i] &= limb_mask(shift, LIMB_BITS - 1); |
| } else { |
| i = 0; |
| } |
| /* remove trailing zeros */ |
| while (r->tab[i] == 0) |
| i++; |
| if (i > 0) { |
| l -= i; |
| memmove(r->tab, r->tab + i, l * sizeof(limb_t)); |
| } |
| bf_resize(r, l); /* cannot fail */ |
| return ret; |
| } |
| |
| /* 'r' must be a finite number. */ |
| int bf_normalize_and_round(bf_t *r, limb_t prec1, bf_flags_t flags) |
| { |
| limb_t l, v, a; |
| int shift, ret; |
| slimb_t i; |
| |
| // bf_print_str("bf_renorm", r); |
| l = r->len; |
| while (l > 0 && r->tab[l - 1] == 0) |
| l--; |
| if (l == 0) { |
| /* zero */ |
| r->expn = BF_EXP_ZERO; |
| bf_resize(r, 0); /* cannot fail */ |
| ret = 0; |
| } else { |
| r->expn -= (r->len - l) * LIMB_BITS; |
| /* shift to have the MSB set to '1' */ |
| v = r->tab[l - 1]; |
| shift = clz(v); |
| if (shift != 0) { |
| v = 0; |
| for(i = 0; i < l; i++) { |
| a = r->tab[i]; |
| r->tab[i] = (a << shift) | (v >> (LIMB_BITS - shift)); |
| v = a; |
| } |
| r->expn -= shift; |
| } |
| ret = __bf_round(r, prec1, flags, l, 0); |
| } |
| // bf_print_str("r_final", r); |
| return ret; |
| } |
| |
| /* return true if rounding can be done at precision 'prec' assuming |
| the exact result r is such that |r-a| <= 2^(EXP(a)-k). */ |
| /* XXX: check the case where the exponent would be incremented by the |
| rounding */ |
| int bf_can_round(const bf_t *a, slimb_t prec, bf_rnd_t rnd_mode, slimb_t k) |
| { |
| BOOL is_rndn; |
| slimb_t bit_pos, n; |
| limb_t bit; |
| |
| if (a->expn == BF_EXP_INF || a->expn == BF_EXP_NAN) |
| return FALSE; |
| if (rnd_mode == BF_RNDF) { |
| return (k >= (prec + 1)); |
| } |
| if (a->expn == BF_EXP_ZERO) |
| return FALSE; |
| is_rndn = (rnd_mode == BF_RNDN || rnd_mode == BF_RNDNA); |
| if (k < (prec + 2)) |
| return FALSE; |
| bit_pos = a->len * LIMB_BITS - 1 - prec; |
| n = k - prec; |
| /* bit pattern for RNDN or RNDNA: 0111.. or 1000... |
| for other rounding modes: 000... or 111... |
| */ |
| bit = get_bit(a->tab, a->len, bit_pos); |
| bit_pos--; |
| n--; |
| bit ^= is_rndn; |
| /* XXX: slow, but a few iterations on average */ |
| while (n != 0) { |
| if (get_bit(a->tab, a->len, bit_pos) != bit) |
| return TRUE; |
| bit_pos--; |
| n--; |
| } |
| return FALSE; |
| } |
| |
| /* Cannot fail with BF_ST_MEM_ERROR. */ |
| int bf_round(bf_t *r, limb_t prec, bf_flags_t flags) |
| { |
| if (r->len == 0) |
| return 0; |
| return __bf_round(r, prec, flags, r->len, 0); |
| } |
| |
| /* for debugging */ |
| static __maybe_unused void dump_limbs(const char *str, const limb_t *tab, limb_t n) |
| { |
| limb_t i; |
| printf("%s: len=%" PRId_LIMB "\n", str, n); |
| for(i = 0; i < n; i++) { |
| printf("%" PRId_LIMB ": " FMT_LIMB "\n", |
| i, tab[i]); |
| } |
| } |
| |
| void mp_print_str(const char *str, const limb_t *tab, limb_t n) |
| { |
| slimb_t i; |
| printf("%s= 0x", str); |
| for(i = n - 1; i >= 0; i--) { |
| if (i != (n - 1)) |
| printf("_"); |
| printf(FMT_LIMB, tab[i]); |
| } |
| printf("\n"); |
| } |
| |
| static __maybe_unused void mp_print_str_h(const char *str, |
| const limb_t *tab, limb_t n, |
| limb_t high) |
| { |
| slimb_t i; |
| printf("%s= 0x", str); |
| printf(FMT_LIMB, high); |
| for(i = n - 1; i >= 0; i--) { |
| printf("_"); |
| printf(FMT_LIMB, tab[i]); |
| } |
| printf("\n"); |
| } |
| |
| /* for debugging */ |
| void bf_print_str(const char *str, const bf_t *a) |
| { |
| slimb_t i; |
| printf("%s=", str); |
| |
| if (a->expn == BF_EXP_NAN) { |
| printf("NaN"); |
| } else { |
| if (a->sign) |
| putchar('-'); |
| if (a->expn == BF_EXP_ZERO) { |
| putchar('0'); |
| } else if (a->expn == BF_EXP_INF) { |
| printf("Inf"); |
| } else { |
| printf("0x0."); |
| for(i = a->len - 1; i >= 0; i--) |
| printf(FMT_LIMB, a->tab[i]); |
| printf("p%" PRId_LIMB, a->expn); |
| } |
| } |
| printf("\n"); |
| } |
| |
| /* compare the absolute value of 'a' and 'b'. Return < 0 if a < b, 0 |
| if a = b and > 0 otherwise. */ |
| int bf_cmpu(const bf_t *a, const bf_t *b) |
| { |
| slimb_t i; |
| limb_t len, v1, v2; |
| |
| if (a->expn != b->expn) { |
| if (a->expn < b->expn) |
| return -1; |
| else |
| return 1; |
| } |
| len = bf_max(a->len, b->len); |
| for(i = len - 1; i >= 0; i--) { |
| v1 = get_limbz(a, a->len - len + i); |
| v2 = get_limbz(b, b->len - len + i); |
| if (v1 != v2) { |
| if (v1 < v2) |
| return -1; |
| else |
| return 1; |
| } |
| } |
| return 0; |
| } |
| |
| /* Full order: -0 < 0, NaN == NaN and NaN is larger than all other numbers */ |
| int bf_cmp_full(const bf_t *a, const bf_t *b) |
| { |
| int res; |
| |
| if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) { |
| if (a->expn == b->expn) |
| res = 0; |
| else if (a->expn == BF_EXP_NAN) |
| res = 1; |
| else |
| res = -1; |
| } else if (a->sign != b->sign) { |
| res = 1 - 2 * a->sign; |
| } else { |
| res = bf_cmpu(a, b); |
| if (a->sign) |
| res = -res; |
| } |
| return res; |
| } |
| |
| /* Standard floating point comparison: return 2 if one of the operands |
| is NaN (unordered) or -1, 0, 1 depending on the ordering assuming |
| -0 == +0 */ |
| int bf_cmp(const bf_t *a, const bf_t *b) |
| { |
| int res; |
| |
| if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) { |
| res = 2; |
| } else if (a->sign != b->sign) { |
| if (a->expn == BF_EXP_ZERO && b->expn == BF_EXP_ZERO) |
| res = 0; |
| else |
| res = 1 - 2 * a->sign; |
| } else { |
| res = bf_cmpu(a, b); |
| if (a->sign) |
| res = -res; |
| } |
| return res; |
| } |
| |
| /* Compute the number of bits 'n' matching the pattern: |
| a= X1000..0 |
| b= X0111..1 |
| |
| When computing a-b, the result will have at least n leading zero |
| bits. |
| |
| Precondition: a > b and a.expn - b.expn = 0 or 1 |
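
   Example (8-bit view, with X a single '1' bit): a = 11000000 and
   b = 10111111 give a - b = 00000001: all the matched bits cancel.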
| */ |
| static limb_t count_cancelled_bits(const bf_t *a, const bf_t *b) |
| { |
| slimb_t bit_offset, b_offset, n; |
| int p, p1; |
| limb_t v1, v2, mask; |
| |
| bit_offset = a->len * LIMB_BITS - 1; |
| b_offset = (b->len - a->len) * LIMB_BITS - (LIMB_BITS - 1) + |
| a->expn - b->expn; |
| n = 0; |
| |
    /* first search the equal bits */
| for(;;) { |
| v1 = get_limbz(a, bit_offset >> LIMB_LOG2_BITS); |
| v2 = get_bits(b->tab, b->len, bit_offset + b_offset); |
| // printf("v1=" FMT_LIMB " v2=" FMT_LIMB "\n", v1, v2); |
| if (v1 != v2) |
| break; |
| n += LIMB_BITS; |
| bit_offset -= LIMB_BITS; |
| } |
| /* find the position of the first different bit */ |
| p = clz(v1 ^ v2) + 1; |
| n += p; |
| /* then search for '0' in a and '1' in b */ |
| p = LIMB_BITS - p; |
| if (p > 0) { |
| /* search in the trailing p bits of v1 and v2 */ |
| mask = limb_mask(0, p - 1); |
| p1 = bf_min(clz(v1 & mask), clz((~v2) & mask)) - (LIMB_BITS - p); |
| n += p1; |
| if (p1 != p) |
| goto done; |
| } |
| bit_offset -= LIMB_BITS; |
| for(;;) { |
| v1 = get_limbz(a, bit_offset >> LIMB_LOG2_BITS); |
| v2 = get_bits(b->tab, b->len, bit_offset + b_offset); |
| // printf("v1=" FMT_LIMB " v2=" FMT_LIMB "\n", v1, v2); |
| if (v1 != 0 || v2 != -1) { |
| /* different: count the matching bits */ |
| p1 = bf_min(clz(v1), clz(~v2)); |
| n += p1; |
| break; |
| } |
| n += LIMB_BITS; |
| bit_offset -= LIMB_BITS; |
| } |
| done: |
| return n; |
| } |
| |
| static int bf_add_internal(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, |
| bf_flags_t flags, int b_neg) |
| { |
| const bf_t *tmp; |
| int is_sub, ret, cmp_res, a_sign, b_sign; |
| |
| a_sign = a->sign; |
| b_sign = b->sign ^ b_neg; |
| is_sub = a_sign ^ b_sign; |
| cmp_res = bf_cmpu(a, b); |
| if (cmp_res < 0) { |
| tmp = a; |
| a = b; |
| b = tmp; |
| a_sign = b_sign; /* b_sign is never used later */ |
| } |
| /* abs(a) >= abs(b) */ |
| if (cmp_res == 0 && is_sub && a->expn < BF_EXP_INF) { |
| /* zero result */ |
| bf_set_zero(r, (flags & BF_RND_MASK) == BF_RNDD); |
| ret = 0; |
| } else if (a->len == 0 || b->len == 0) { |
| ret = 0; |
| if (a->expn >= BF_EXP_INF) { |
| if (a->expn == BF_EXP_NAN) { |
| /* at least one operand is NaN */ |
| bf_set_nan(r); |
| } else if (b->expn == BF_EXP_INF && is_sub) { |
| /* infinities with different signs */ |
| bf_set_nan(r); |
| ret = BF_ST_INVALID_OP; |
| } else { |
| bf_set_inf(r, a_sign); |
| } |
| } else { |
            /* at least one operand is zero: the result is 'a' */
| bf_set(r, a); |
| r->sign = a_sign; |
| goto renorm; |
| } |
| } else { |
| slimb_t d, a_offset, b_bit_offset, i, cancelled_bits; |
| limb_t carry, v1, v2, u, r_len, carry1, precl, tot_len, z, sub_mask; |
| |
| r->sign = a_sign; |
| r->expn = a->expn; |
| d = a->expn - b->expn; |
| /* must add more precision for the leading cancelled bits in |
| subtraction */ |
| if (is_sub) { |
| if (d <= 1) |
| cancelled_bits = count_cancelled_bits(a, b); |
| else |
| cancelled_bits = 1; |
| } else { |
| cancelled_bits = 0; |
| } |
| |
| /* add two extra bits for rounding */ |
| precl = (cancelled_bits + prec + 2 + LIMB_BITS - 1) / LIMB_BITS; |
| tot_len = bf_max(a->len, b->len + (d + LIMB_BITS - 1) / LIMB_BITS); |
| r_len = bf_min(precl, tot_len); |
| if (bf_resize(r, r_len)) |
| goto fail; |
| a_offset = a->len - r_len; |
| b_bit_offset = (b->len - r_len) * LIMB_BITS + d; |
| |
        /* compute the bits below the result limbs (only needed for the
           rounding) */
| carry = is_sub; |
| z = 0; |
| sub_mask = -is_sub; |
| i = r_len - tot_len; |
| while (i < 0) { |
| slimb_t ap, bp; |
| BOOL inflag; |
| |
| ap = a_offset + i; |
| bp = b_bit_offset + i * LIMB_BITS; |
| inflag = FALSE; |
| if (ap >= 0 && ap < a->len) { |
| v1 = a->tab[ap]; |
| inflag = TRUE; |
| } else { |
| v1 = 0; |
| } |
| if (bp + LIMB_BITS > 0 && bp < (slimb_t)(b->len * LIMB_BITS)) { |
| v2 = get_bits(b->tab, b->len, bp); |
| inflag = TRUE; |
| } else { |
| v2 = 0; |
| } |
| if (!inflag) { |
| /* outside 'a' and 'b': go directly to the next value |
| inside a or b so that the running time does not |
| depend on the exponent difference */ |
| i = 0; |
| if (ap < 0) |
| i = bf_min(i, -a_offset); |
| /* b_bit_offset + i * LIMB_BITS + LIMB_BITS >= 1 |
| equivalent to |
| i >= ceil(-b_bit_offset + 1 - LIMB_BITS) / LIMB_BITS) |
| */ |
| if (bp + LIMB_BITS <= 0) |
| i = bf_min(i, (-b_bit_offset) >> LIMB_LOG2_BITS); |
| } else { |
| i++; |
| } |
| v2 ^= sub_mask; |
| u = v1 + v2; |
| carry1 = u < v1; |
| u += carry; |
| carry = (u < carry) | carry1; |
| z |= u; |
| } |
| /* and the result */ |
| for(i = 0; i < r_len; i++) { |
| v1 = get_limbz(a, a_offset + i); |
| v2 = get_bits(b->tab, b->len, b_bit_offset + i * LIMB_BITS); |
| v2 ^= sub_mask; |
| u = v1 + v2; |
| carry1 = u < v1; |
| u += carry; |
| carry = (u < carry) | carry1; |
| r->tab[i] = u; |
| } |
| /* set the extra bits for the rounding */ |
| r->tab[0] |= (z != 0); |
| |
| /* carry is only possible in add case */ |
| if (!is_sub && carry) { |
| if (bf_resize(r, r_len + 1)) |
| goto fail; |
| r->tab[r_len] = 1; |
| r->expn += LIMB_BITS; |
| } |
| renorm: |
| ret = bf_normalize_and_round(r, prec, flags); |
| } |
| return ret; |
| fail: |
| bf_set_nan(r); |
| return BF_ST_MEM_ERROR; |
| } |
| |
| static int __bf_add(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, |
| bf_flags_t flags) |
| { |
| return bf_add_internal(r, a, b, prec, flags, 0); |
| } |
| |
| static int __bf_sub(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, |
| bf_flags_t flags) |
| { |
| return bf_add_internal(r, a, b, prec, flags, 1); |
| } |
| |
| limb_t mp_add(limb_t *res, const limb_t *op1, const limb_t *op2, |
| limb_t n, limb_t carry) |
| { |
| slimb_t i; |
| limb_t k, a, v, k1; |
| |
| k = carry; |
| for(i=0;i<n;i++) { |
| v = op1[i]; |
| a = v + op2[i]; |
| k1 = a < v; |
| a = a + k; |
| k = (a < k) | k1; |
| res[i] = a; |
| } |
| return k; |
| } |
| |
| limb_t mp_add_ui(limb_t *tab, limb_t b, size_t n) |
| { |
| size_t i; |
| limb_t k, a; |
| |
| k=b; |
| for(i=0;i<n;i++) { |
| if (k == 0) |
| break; |
| a = tab[i] + k; |
| k = (a < k); |
| tab[i] = a; |
| } |
| return k; |
| } |
| |
| limb_t mp_sub(limb_t *res, const limb_t *op1, const limb_t *op2, |
| mp_size_t n, limb_t carry) |
| { |
    mp_size_t i;
| limb_t k, a, v, k1; |
| |
| k = carry; |
| for(i=0;i<n;i++) { |
| v = op1[i]; |
| a = v - op2[i]; |
| k1 = a > v; |
| v = a - k; |
| k = (v > a) | k1; |
| res[i] = v; |
| } |
| return k; |
| } |
| |
| /* compute 0 - op2 */ |
| static limb_t mp_neg(limb_t *res, const limb_t *op2, mp_size_t n, limb_t carry) |
| { |
    mp_size_t i;
| limb_t k, a, v, k1; |
| |
| k = carry; |
| for(i=0;i<n;i++) { |
| v = 0; |
| a = v - op2[i]; |
| k1 = a > v; |
| v = a - k; |
| k = (v > a) | k1; |
| res[i] = v; |
| } |
| return k; |
| } |
| |
| limb_t mp_sub_ui(limb_t *tab, limb_t b, mp_size_t n) |
| { |
| mp_size_t i; |
| limb_t k, a, v; |
| |
| k=b; |
| for(i=0;i<n;i++) { |
| v = tab[i]; |
| a = v - k; |
| k = a > v; |
| tab[i] = a; |
| if (k == 0) |
| break; |
| } |
| return k; |
| } |
| |
/* tab_r = (tab + high*B^n) >> shift. Return the bits shifted out, i.e.
   the remainder r (0 <= r < 2^shift). 1 <= shift <= LIMB_BITS - 1 */
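/* e.g. assuming LIMB_BITS == 64: tab = {0xAB, 0xCD} (the value
   0xCD * 2^64 + 0xAB), shift = 8, high = 0 gives
   tab_r = {0xCD << 56, 0} and returns 0xAB. */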
| static limb_t mp_shr(limb_t *tab_r, const limb_t *tab, mp_size_t n, |
| int shift, limb_t high) |
| { |
| mp_size_t i; |
| limb_t l, a; |
| |
| assert(shift >= 1 && shift < LIMB_BITS); |
| l = high; |
| for(i = n - 1; i >= 0; i--) { |
| a = tab[i]; |
| tab_r[i] = (a >> shift) | (l << (LIMB_BITS - shift)); |
| l = a; |
| } |
| return l & (((limb_t)1 << shift) - 1); |
| } |
| |
| /* tabr[] = taba[] * b + l. Return the high carry */ |
| static limb_t mp_mul1(limb_t *tabr, const limb_t *taba, limb_t n, |
| limb_t b, limb_t l) |
| { |
| limb_t i; |
| dlimb_t t; |
| |
| for(i = 0; i < n; i++) { |
| t = (dlimb_t)taba[i] * (dlimb_t)b + l; |
| tabr[i] = t; |
| l = t >> LIMB_BITS; |
| } |
| return l; |
| } |
| |
| /* tabr[] += taba[] * b, return the high word. */ |
| static limb_t mp_add_mul1(limb_t *tabr, const limb_t *taba, limb_t n, |
| limb_t b) |
| { |
| limb_t i, l; |
| dlimb_t t; |
| |
| l = 0; |
| for(i = 0; i < n; i++) { |
| t = (dlimb_t)taba[i] * (dlimb_t)b + l + tabr[i]; |
| tabr[i] = t; |
| l = t >> LIMB_BITS; |
| } |
| return l; |
| } |
| |
| /* size of the result : op1_size + op2_size. */ |
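/* schoolbook multiplication, O(op1_size * op2_size): the first row of
   partial products initializes the result, the following rows are
   accumulated with mp_add_mul1() */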
| static void mp_mul_basecase(limb_t *result, |
| const limb_t *op1, limb_t op1_size, |
| const limb_t *op2, limb_t op2_size) |
| { |
| limb_t i, r; |
| |
| result[op1_size] = mp_mul1(result, op1, op1_size, op2[0], 0); |
| for(i=1;i<op2_size;i++) { |
| r = mp_add_mul1(result + i, op1, op1_size, op2[i]); |
| result[i + op1_size] = r; |
| } |
| } |
| |
| /* return 0 if OK, -1 if memory error */ |
| /* XXX: change API so that result can be allocated */ |
| int mp_mul(bf_context_t *s, limb_t *result, |
| const limb_t *op1, limb_t op1_size, |
| const limb_t *op2, limb_t op2_size) |
| { |
| #ifdef USE_FFT_MUL |
| if (unlikely(bf_min(op1_size, op2_size) >= FFT_MUL_THRESHOLD)) { |
| bf_t r_s, *r = &r_s; |
| r->tab = result; |
| /* XXX: optimize memory usage in API */ |
| if (fft_mul(s, r, (limb_t *)op1, op1_size, |
| (limb_t *)op2, op2_size, FFT_MUL_R_NORESIZE)) |
| return -1; |
| } else |
| #endif |
| { |
| mp_mul_basecase(result, op1, op1_size, op2, op2_size); |
| } |
| return 0; |
| } |
| |
/* tabr[] -= taba[] * b. Return the value to subtract from the high
   word. */
| static limb_t mp_sub_mul1(limb_t *tabr, const limb_t *taba, limb_t n, |
| limb_t b) |
| { |
| limb_t i, l; |
| dlimb_t t; |
| |
| l = 0; |
| for(i = 0; i < n; i++) { |
| t = tabr[i] - (dlimb_t)taba[i] * (dlimb_t)b - l; |
| tabr[i] = t; |
| l = -(t >> LIMB_BITS); |
| } |
| return l; |
| } |
| |
| /* WARNING: d must be >= 2^(LIMB_BITS-1) */ |
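/* Returns floor((B^2 - 1) / d) - B with B = 2^LIMB_BITS: the
   precomputed reciprocal used by udiv1norm() (cf. Granlund and
   Montgomery, "Division by Invariant Integers using Multiplication") */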
| static inline limb_t udiv1norm_init(limb_t d) |
| { |
| limb_t a0, a1; |
| a1 = -d - 1; |
| a0 = -1; |
| return (((dlimb_t)a1 << LIMB_BITS) | a0) / d; |
| } |
| |
/* return the quotient and the remainder in '*pr' of
   '(a1*2^LIMB_BITS+a0) / d' with 0 <= a1 < d. */
| static inline limb_t udiv1norm(limb_t *pr, limb_t a1, limb_t a0, |
| limb_t d, limb_t d_inv) |
| { |
| limb_t n1m, n_adj, q, r, ah; |
| dlimb_t a; |
| n1m = ((slimb_t)a0 >> (LIMB_BITS - 1)); |
| n_adj = a0 + (n1m & d); |
| a = (dlimb_t)d_inv * (a1 - n1m) + n_adj; |
| q = (a >> LIMB_BITS) + a1; |
    /* compute a - q * d and update q so that the remainder is
       between 0 and d - 1 */
| a = ((dlimb_t)a1 << LIMB_BITS) | a0; |
| a = a - (dlimb_t)q * d - d; |
| ah = a >> LIMB_BITS; |
| q += 1 + ah; |
| r = (limb_t)a + (ah & d); |
| *pr = r; |
| return q; |
| } |
| |
| /* b must be >= 1 << (LIMB_BITS - 1) */ |
| static limb_t mp_div1norm(limb_t *tabr, const limb_t *taba, limb_t n, |
| limb_t b, limb_t r) |
| { |
| slimb_t i; |
| |
| if (n >= UDIV1NORM_THRESHOLD) { |
| limb_t b_inv; |
| b_inv = udiv1norm_init(b); |
| for(i = n - 1; i >= 0; i--) { |
| tabr[i] = udiv1norm(&r, r, taba[i], b, b_inv); |
| } |
| } else { |
| dlimb_t a1; |
| for(i = n - 1; i >= 0; i--) { |
| a1 = ((dlimb_t)r << LIMB_BITS) | taba[i]; |
| tabr[i] = a1 / b; |
| r = a1 % b; |
| } |
| } |
| return r; |
| } |
| |
| static int mp_divnorm_large(bf_context_t *s, |
| limb_t *tabq, limb_t *taba, limb_t na, |
| const limb_t *tabb, limb_t nb); |
| |
| /* base case division: divides taba[0..na-1] by tabb[0..nb-1]. tabb[nb |
| - 1] must be >= 1 << (LIMB_BITS - 1). na - nb must be >= 0. 'taba' |
| is modified and contains the remainder (nb limbs). tabq[0..na-nb] |
| contains the quotient with tabq[na - nb] <= 1. */ |
| static int mp_divnorm(bf_context_t *s, limb_t *tabq, limb_t *taba, limb_t na, |
| const limb_t *tabb, limb_t nb) |
| { |
| limb_t r, a, c, q, v, b1, b1_inv, n, dummy_r; |
| slimb_t i, j; |
| |
| b1 = tabb[nb - 1]; |
| if (nb == 1) { |
| taba[0] = mp_div1norm(tabq, taba, na, b1, 0); |
| return 0; |
| } |
| n = na - nb; |
| if (bf_min(n, nb) >= DIVNORM_LARGE_THRESHOLD) { |
| return mp_divnorm_large(s, tabq, taba, na, tabb, nb); |
| } |
| |
| if (n >= UDIV1NORM_THRESHOLD) |
| b1_inv = udiv1norm_init(b1); |
| else |
| b1_inv = 0; |
| |
| /* first iteration: the quotient is only 0 or 1 */ |
| q = 1; |
| for(j = nb - 1; j >= 0; j--) { |
| if (taba[n + j] != tabb[j]) { |
| if (taba[n + j] < tabb[j]) |
| q = 0; |
| break; |
| } |
| } |
| tabq[n] = q; |
| if (q) { |
| mp_sub(taba + n, taba + n, tabb, nb, 0); |
| } |
| |
| for(i = n - 1; i >= 0; i--) { |
| if (unlikely(taba[i + nb] >= b1)) { |
| q = -1; |
| } else if (b1_inv) { |
| q = udiv1norm(&dummy_r, taba[i + nb], taba[i + nb - 1], b1, b1_inv); |
| } else { |
| dlimb_t al; |
| al = ((dlimb_t)taba[i + nb] << LIMB_BITS) | taba[i + nb - 1]; |
| q = al / b1; |
| r = al % b1; |
| } |
| r = mp_sub_mul1(taba + i, tabb, nb, q); |
| |
| v = taba[i + nb]; |
| a = v - r; |
| c = (a > v); |
| taba[i + nb] = a; |
| |
| if (c != 0) { |
| /* negative result */ |
| for(;;) { |
| q--; |
| c = mp_add(taba + i, taba + i, tabb, nb, 0); |
| /* propagate carry and test if positive result */ |
| if (c != 0) { |
| if (++taba[i + nb] == 0) { |
| break; |
| } |
| } |
| } |
| } |
| tabq[i] = q; |
| } |
| return 0; |
| } |
| |
/* compute r=B^(2*n)/a such that a*r < B^(2*n) < a*r + 2 with n >= 1. 'a'
| has n limbs with a[n-1] >= B/2 and 'r' has n+1 limbs with r[n] = 1. |
| |
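   This is Newton's iteration for the reciprocal: the recursive call
   computes the high half of r, which is then refined by one correction
   step costing two multiplications.
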
| See Modern Computer Arithmetic by Richard P. Brent and Paul |
| Zimmermann, algorithm 3.5 */ |
| int mp_recip(bf_context_t *s, limb_t *tabr, const limb_t *taba, limb_t n) |
| { |
| mp_size_t l, h, k, i; |
| limb_t *tabxh, *tabt, c, *tabu; |
| |
| if (n <= 2) { |
| /* return ceil(B^(2*n)/a) - 1 */ |
| /* XXX: could avoid allocation */ |
| tabu = bf_malloc(s, sizeof(limb_t) * (2 * n + 1)); |
| tabt = bf_malloc(s, sizeof(limb_t) * (n + 2)); |
| if (!tabt || !tabu) |
| goto fail; |
| for(i = 0; i < 2 * n; i++) |
| tabu[i] = 0; |
| tabu[2 * n] = 1; |
| if (mp_divnorm(s, tabt, tabu, 2 * n + 1, taba, n)) |
| goto fail; |
| for(i = 0; i < n + 1; i++) |
| tabr[i] = tabt[i]; |
| if (mp_scan_nz(tabu, n) == 0) { |
| /* only happens for a=B^n/2 */ |
| mp_sub_ui(tabr, 1, n + 1); |
| } |
| } else { |
| l = (n - 1) / 2; |
| h = n - l; |
| /* n=2p -> l=p-1, h = p + 1, k = p + 3 |
| n=2p+1-> l=p, h = p + 1; k = p + 2 |
| */ |
| tabt = bf_malloc(s, sizeof(limb_t) * (n + h + 1)); |
| tabu = bf_malloc(s, sizeof(limb_t) * (n + 2 * h - l + 2)); |
| if (!tabt || !tabu) |
| goto fail; |
| tabxh = tabr + l; |
| if (mp_recip(s, tabxh, taba + l, h)) |
| goto fail; |
| if (mp_mul(s, tabt, taba, n, tabxh, h + 1)) /* n + h + 1 limbs */ |
| goto fail; |
| while (tabt[n + h] != 0) { |
| mp_sub_ui(tabxh, 1, h + 1); |
| c = mp_sub(tabt, tabt, taba, n, 0); |
| mp_sub_ui(tabt + n, c, h + 1); |
| } |
| /* T = B^(n+h) - T */ |
| mp_neg(tabt, tabt, n + h + 1, 0); |
| tabt[n + h]++; |
| if (mp_mul(s, tabu, tabt + l, n + h + 1 - l, tabxh, h + 1)) |
| goto fail; |
| /* n + 2*h - l + 2 limbs */ |
| k = 2 * h - l; |
| for(i = 0; i < l; i++) |
| tabr[i] = tabu[i + k]; |
| mp_add(tabr + l, tabr + l, tabu + 2 * h, h, 0); |
| } |
| bf_free(s, tabt); |
| bf_free(s, tabu); |
| return 0; |
| fail: |
| bf_free(s, tabt); |
| bf_free(s, tabu); |
| return -1; |
| } |
| |
| /* return -1, 0 or 1 */ |
| static int mp_cmp(const limb_t *taba, const limb_t *tabb, mp_size_t n) |
| { |
| mp_size_t i; |
| for(i = n - 1; i >= 0; i--) { |
| if (taba[i] != tabb[i]) { |
| if (taba[i] < tabb[i]) |
| return -1; |
| else |
| return 1; |
| } |
| } |
| return 0; |
| } |
| |
| //#define DEBUG_DIVNORM_LARGE |
| //#define DEBUG_DIVNORM_LARGE2 |
| |
| /* subquadratic divnorm */ |
| static int mp_divnorm_large(bf_context_t *s, |
| limb_t *tabq, limb_t *taba, limb_t na, |
| const limb_t *tabb, limb_t nb) |
| { |
| limb_t *tabb_inv, nq, *tabt, i, n; |
| nq = na - nb; |
| #ifdef DEBUG_DIVNORM_LARGE |
| printf("na=%d nb=%d nq=%d\n", (int)na, (int)nb, (int)nq); |
| mp_print_str("a", taba, na); |
| mp_print_str("b", tabb, nb); |
| #endif |
| assert(nq >= 1); |
| n = nq; |
| if (nq < nb) |
| n++; |
| tabb_inv = bf_malloc(s, sizeof(limb_t) * (n + 1)); |
| tabt = bf_malloc(s, sizeof(limb_t) * 2 * (n + 1)); |
| if (!tabb_inv || !tabt) |
| goto fail; |
| |
| if (n >= nb) { |
| for(i = 0; i < n - nb; i++) |
| tabt[i] = 0; |
| for(i = 0; i < nb; i++) |
| tabt[i + n - nb] = tabb[i]; |
| } else { |
        /* truncate B: need to increment it so that the approximate
           inverse is smaller than the exact inverse */
| for(i = 0; i < n; i++) |
| tabt[i] = tabb[i + nb - n]; |
| if (mp_add_ui(tabt, 1, n)) { |
| /* tabt = B^n : tabb_inv = B^n */ |
| memset(tabb_inv, 0, n * sizeof(limb_t)); |
| tabb_inv[n] = 1; |
| goto recip_done; |
| } |
| } |
| if (mp_recip(s, tabb_inv, tabt, n)) |
| goto fail; |
| recip_done: |
| /* Q=A*B^-1 */ |
| if (mp_mul(s, tabt, tabb_inv, n + 1, taba + na - (n + 1), n + 1)) |
| goto fail; |
| |
| for(i = 0; i < nq + 1; i++) |
| tabq[i] = tabt[i + 2 * (n + 1) - (nq + 1)]; |
| #ifdef DEBUG_DIVNORM_LARGE |
| mp_print_str("q", tabq, nq + 1); |
| #endif |
| |
| bf_free(s, tabt); |
| bf_free(s, tabb_inv); |
| tabb_inv = NULL; |
| |
| /* R=A-B*Q */ |
| tabt = bf_malloc(s, sizeof(limb_t) * (na + 1)); |
| if (!tabt) |
| goto fail; |
| if (mp_mul(s, tabt, tabq, nq + 1, tabb, nb)) |
| goto fail; |
| /* we add one more limb for the result */ |
| mp_sub(taba, taba, tabt, nb + 1, 0); |
| bf_free(s, tabt); |
    /* the approximate quotient is smaller than the exact one, hence we
       may have to increment it */
| #ifdef DEBUG_DIVNORM_LARGE2 |
| int cnt = 0; |
| static int cnt_max; |
| #endif |
| for(;;) { |
| if (taba[nb] == 0 && mp_cmp(taba, tabb, nb) < 0) |
| break; |
| taba[nb] -= mp_sub(taba, taba, tabb, nb, 0); |
| mp_add_ui(tabq, 1, nq + 1); |
| #ifdef DEBUG_DIVNORM_LARGE2 |
| cnt++; |
| #endif |
| } |
| #ifdef DEBUG_DIVNORM_LARGE2 |
| if (cnt > cnt_max) { |
| cnt_max = cnt; |
| printf("\ncnt=%d nq=%d nb=%d\n", cnt_max, (int)nq, (int)nb); |
| } |
| #endif |
| return 0; |
| fail: |
| bf_free(s, tabb_inv); |
| bf_free(s, tabt); |
| return -1; |
| } |
| |
| int bf_mul(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, |
| bf_flags_t flags) |
| { |
| int ret, r_sign; |
| |
| if (a->len < b->len) { |
| const bf_t *tmp = a; |
| a = b; |
| b = tmp; |
| } |
| r_sign = a->sign ^ b->sign; |
| /* here b->len <= a->len */ |
| if (b->len == 0) { |
| if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) { |
| bf_set_nan(r); |
| ret = 0; |
| } else if (a->expn == BF_EXP_INF || b->expn == BF_EXP_INF) { |
| if ((a->expn == BF_EXP_INF && b->expn == BF_EXP_ZERO) || |
| (a->expn == BF_EXP_ZERO && b->expn == BF_EXP_INF)) { |
| bf_set_nan(r); |
| ret = BF_ST_INVALID_OP; |
| } else { |
| bf_set_inf(r, r_sign); |
| ret = 0; |
| } |
| } else { |
| bf_set_zero(r, r_sign); |
| ret = 0; |
| } |
| } else { |
| bf_t tmp, *r1 = NULL; |
| limb_t a_len, b_len, precl; |
| limb_t *a_tab, *b_tab; |
| |
| a_len = a->len; |
| b_len = b->len; |
| |
| if ((flags & BF_RND_MASK) == BF_RNDF) { |
| /* faithful rounding does not require using the full inputs */ |
| precl = (prec + 2 + LIMB_BITS - 1) / LIMB_BITS; |
| a_len = bf_min(a_len, precl); |
| b_len = bf_min(b_len, precl); |
| } |
| a_tab = a->tab + a->len - a_len; |
| b_tab = b->tab + b->len - b_len; |
| |
| #ifdef USE_FFT_MUL |
| if (b_len >= FFT_MUL_THRESHOLD) { |
| int mul_flags = 0; |
| if (r == a) |
| mul_flags |= FFT_MUL_R_OVERLAP_A; |
| if (r == b) |
| mul_flags |= FFT_MUL_R_OVERLAP_B; |
| if (fft_mul(r->ctx, r, a_tab, a_len, b_tab, b_len, mul_flags)) |
| goto fail; |
| } else |
| #endif |
| { |
| if (r == a || r == b) { |
| bf_init(r->ctx, &tmp); |
| r1 = r; |
| r = &tmp; |
| } |
| if (bf_resize(r, a_len + b_len)) { |
| fail: |
| bf_set_nan(r); |
| ret = BF_ST_MEM_ERROR; |
| goto done; |
| } |
| mp_mul_basecase(r->tab, a_tab, a_len, b_tab, b_len); |
| } |
| r->sign = r_sign; |
| r->expn = a->expn + b->expn; |
| ret = bf_normalize_and_round(r, prec, flags); |
| done: |
| if (r == &tmp) |
| bf_move(r1, &tmp); |
| } |
| return ret; |
| } |
| |
| /* multiply 'r' by 2^e */ |
| int bf_mul_2exp(bf_t *r, slimb_t e, limb_t prec, bf_flags_t flags) |
| { |
| slimb_t e_max; |
| if (r->len == 0) |
| return 0; |
| e_max = ((limb_t)1 << BF_EXT_EXP_BITS_MAX) - 1; |
| e = bf_max(e, -e_max); |
| e = bf_min(e, e_max); |
| r->expn += e; |
| return __bf_round(r, prec, flags, r->len, 0); |
| } |
| |
/* Return e such that a=m*2^e with m an odd integer. Return 0 if a is
   zero, infinite or NaN. */
| slimb_t bf_get_exp_min(const bf_t *a) |
| { |
| slimb_t i; |
| limb_t v; |
| int k; |
| |
| for(i = 0; i < a->len; i++) { |
| v = a->tab[i]; |
| if (v != 0) { |
| k = ctz(v); |
| return a->expn - (a->len - i) * LIMB_BITS + k; |
| } |
| } |
| return 0; |
| } |
| |
| /* a and b must be finite numbers with a >= 0 and b > 0. 'q' is the |
| integer defined as floor(a/b) and r = a - q * b. */ |
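/* The quotient needs at most a->expn - b->expn + 1 bits (a < 2^a->expn
   and b >= 2^(b->expn - 1)), hence the precision requested from
   bf_div(); BF_RNDZ truncation followed by bf_rint() yields floor. */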
| static void bf_tdivremu(bf_t *q, bf_t *r, |
| const bf_t *a, const bf_t *b) |
| { |
| if (bf_cmpu(a, b) < 0) { |
| bf_set_ui(q, 0); |
| bf_set(r, a); |
| } else { |
| bf_div(q, a, b, bf_max(a->expn - b->expn + 1, 2), BF_RNDZ); |
| bf_rint(q, BF_RNDZ); |
| bf_mul(r, q, b, BF_PREC_INF, BF_RNDZ); |
| bf_sub(r, a, r, BF_PREC_INF, BF_RNDZ); |
| } |
| } |
| |
| static int __bf_div(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, |
| bf_flags_t flags) |
| { |
| bf_context_t *s = r->ctx; |
| int ret, r_sign; |
| limb_t n, nb, precl; |
| |
| r_sign = a->sign ^ b->sign; |
| if (a->expn >= BF_EXP_INF || b->expn >= BF_EXP_INF) { |
| if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) { |
| bf_set_nan(r); |
| return 0; |
| } else if (a->expn == BF_EXP_INF && b->expn == BF_EXP_INF) { |
| bf_set_nan(r); |
| return BF_ST_INVALID_OP; |
| } else if (a->expn == BF_EXP_INF) { |
| bf_set_inf(r, r_sign); |
| return 0; |
| } else { |
| bf_set_zero(r, r_sign); |
| return 0; |
| } |
| } else if (a->expn == BF_EXP_ZERO) { |
| if (b->expn == BF_EXP_ZERO) { |
| bf_set_nan(r); |
| return BF_ST_INVALID_OP; |
| } else { |
| bf_set_zero(r, r_sign); |
| return 0; |
| } |
| } else if (b->expn == BF_EXP_ZERO) { |
| bf_set_inf(r, r_sign); |
| return BF_ST_DIVIDE_ZERO; |
| } |
| |
| /* number of limbs of the quotient (2 extra bits for rounding) */ |
| precl = (prec + 2 + LIMB_BITS - 1) / LIMB_BITS; |
| nb = b->len; |
| n = bf_max(a->len, precl); |
| |
| { |
| limb_t *taba, na; |
| slimb_t d; |
| |
| na = n + nb; |
| taba = bf_malloc(s, (na + 1) * sizeof(limb_t)); |
| if (!taba) |
| goto fail; |
| d = na - a->len; |
| memset(taba, 0, d * sizeof(limb_t)); |
| memcpy(taba + d, a->tab, a->len * sizeof(limb_t)); |
| if (bf_resize(r, n + 1)) |
| goto fail1; |
| if (mp_divnorm(s, r->tab, taba, na, b->tab, nb)) { |
| fail1: |
| bf_free(s, taba); |
| goto fail; |
| } |
| /* see if non zero remainder */ |
| if (mp_scan_nz(taba, nb)) |
| r->tab[0] |= 1; |
| bf_free(r->ctx, taba); |
| r->expn = a->expn - b->expn + LIMB_BITS; |
| r->sign = r_sign; |
| ret = bf_normalize_and_round(r, prec, flags); |
| } |
| return ret; |
| fail: |
| bf_set_nan(r); |
| return BF_ST_MEM_ERROR; |
| } |
| |
| /* division and remainder. |
| |
   rnd_mode is the rounding mode for the quotient. The additional
   rounding mode BF_DIVREM_EUCLIDIAN is supported.
| |
| 'q' is an integer. 'r' is rounded with prec and flags (prec can be |
| BF_PREC_INF). |
| */ |
| int bf_divrem(bf_t *q, bf_t *r, const bf_t *a, const bf_t *b, |
| limb_t prec, bf_flags_t flags, int rnd_mode) |
| { |
| bf_t a1_s, *a1 = &a1_s; |
| bf_t b1_s, *b1 = &b1_s; |
| int q_sign, ret; |
| BOOL is_ceil, is_rndn; |
| |
| assert(q != a && q != b); |
| assert(r != a && r != b); |
| assert(q != r); |
| |
| if (a->len == 0 || b->len == 0) { |
| bf_set_zero(q, 0); |
| if (a->expn == BF_EXP_NAN || b->expn == BF_EXP_NAN) { |
| bf_set_nan(r); |
| return 0; |
| } else if (a->expn == BF_EXP_INF || b->expn == BF_EXP_ZERO) { |
| bf_set_nan(r); |
| return BF_ST_INVALID_OP; |
| } else { |
| bf_set(r, a); |
| return bf_round(r, prec, flags); |
| } |
| } |
| |
| q_sign = a->sign ^ b->sign; |
| is_rndn = (rnd_mode == BF_RNDN || rnd_mode == BF_RNDNA); |
| switch(rnd_mode) { |
| default: |
| case BF_RNDZ: |
| case BF_RNDN: |
| case BF_RNDNA: |
| is_ceil = FALSE; |
| break; |
| case BF_RNDD: |
| is_ceil = q_sign; |
| break; |
| case BF_RNDU: |
| is_ceil = q_sign ^ 1; |
| break; |
| case BF_RNDA: |
| is_ceil = TRUE; |
| break; |
| case BF_DIVREM_EUCLIDIAN: |
| is_ceil = a->sign; |
| break; |
| } |
| |
| a1->expn = a->expn; |
| a1->tab = a->tab; |
| a1->len = a->len; |
| a1->sign = 0; |
| |
| b1->expn = b->expn; |
| b1->tab = b->tab; |
| b1->len = b->len; |
| b1->sign = 0; |
| |
| /* XXX: could improve to avoid having a large 'q' */ |
| bf_tdivremu(q, r, a1, b1); |
| if (bf_is_nan(q) || bf_is_nan(r)) |
| goto fail; |
| |
| if (r->len != 0) { |
| if (is_rndn) { |
| int res; |
| b1->expn--; |
| res = bf_cmpu(r, b1); |
| b1->expn++; |
| if (res > 0 || |
| (res == 0 && |
| (rnd_mode == BF_RNDNA || |
| get_bit(q->tab, q->len, q->len * LIMB_BITS - q->expn)))) { |
| goto do_sub_r; |
| } |
| } else if (is_ceil) { |
| do_sub_r: |
| ret = bf_add_si(q, q, 1, BF_PREC_INF, BF_RNDZ); |
| ret |= bf_sub(r, r, b1, BF_PREC_INF, BF_RNDZ); |
| if (ret & BF_ST_MEM_ERROR) |
| goto fail; |
| } |
| } |
| |
| r->sign ^= a->sign; |
| q->sign = q_sign; |
| return bf_round(r, prec, flags); |
| fail: |
| bf_set_nan(q); |
| bf_set_nan(r); |
| return BF_ST_MEM_ERROR; |
| } |
| |
| int bf_rem(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, |
| bf_flags_t flags, int rnd_mode) |
| { |
| bf_t q_s, *q = &q_s; |
| int ret; |
| |
| bf_init(r->ctx, q); |
| ret = bf_divrem(q, r, a, b, prec, flags, rnd_mode); |
| bf_delete(q); |
| return ret; |
| } |
| |
| static inline int bf_get_limb(slimb_t *pres, const bf_t *a, int flags) |
| { |
| #if LIMB_BITS == 32 |
| return bf_get_int32(pres, a, flags); |
| #else |
| return bf_get_int64(pres, a, flags); |
| #endif |
| } |
| |
| int bf_remquo(slimb_t *pq, bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, |
| bf_flags_t flags, int rnd_mode) |
| { |
| bf_t q_s, *q = &q_s; |
| int ret; |
| |
| bf_init(r->ctx, q); |
| ret = bf_divrem(q, r, a, b, prec, flags, rnd_mode); |
| bf_get_limb(pq, q, BF_GET_INT_MOD); |
| bf_delete(q); |
| return ret; |
| } |
| |
| static __maybe_unused inline limb_t mul_mod(limb_t a, limb_t b, limb_t m) |
| { |
| dlimb_t t; |
| t = (dlimb_t)a * (dlimb_t)b; |
| return t % m; |
| } |
| |
| #if defined(USE_MUL_CHECK) |
| static limb_t mp_mod1(const limb_t *tab, limb_t n, limb_t m, limb_t r) |
| { |
| slimb_t i; |
| dlimb_t t; |
| |
| for(i = n - 1; i >= 0; i--) { |
| t = ((dlimb_t)r << LIMB_BITS) | tab[i]; |
| r = t % m; |
| } |
| return r; |
| } |
| #endif |
| |
| static const uint16_t sqrt_table[192] = { |
| 128,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,144,145,146,147,148,149,150,150,151,152,153,154,155,155,156,157,158,159,160,160,161,162,163,163,164,165,166,167,167,168,169,170,170,171,172,173,173,174,175,176,176,177,178,178,179,180,181,181,182,183,183,184,185,185,186,187,187,188,189,189,190,191,192,192,193,193,194,195,195,196,197,197,198,199,199,200,201,201,202,203,203,204,204,205,206,206,207,208,208,209,209,210,211,211,212,212,213,214,214,215,215,216,217,217,218,218,219,219,220,221,221,222,222,223,224,224,225,225,226,226,227,227,228,229,229,230,230,231,231,232,232,233,234,234,235,235,236,236,237,237,238,238,239,240,240,241,241,242,242,243,243,244,244,245,245,246,246,247,247,248,248,249,249,250,250,251,251,252,252,253,253,254,254,255, |
| }; |
| |
| /* a >= 2^(LIMB_BITS - 2). Return (s, r) with s=floor(sqrt(a)) and |
| r=a-s^2. 0 <= r <= 2 * s */ |
| static limb_t mp_sqrtrem1(limb_t *pr, limb_t a) |
| { |
| limb_t s1, r1, s, r, q, u, num; |
| |
| /* use a table for the 16 -> 8 bit sqrt */ |
| s1 = sqrt_table[(a >> (LIMB_BITS - 8)) - 64]; |
| r1 = (a >> (LIMB_BITS - 16)) - s1 * s1; |
| if (r1 > 2 * s1) { |
| r1 -= 2 * s1 + 1; |
| s1++; |
| } |
| |
| /* one iteration to get a 32 -> 16 bit sqrt */ |
| num = (r1 << 8) | ((a >> (LIMB_BITS - 32 + 8)) & 0xff); |
| q = num / (2 * s1); /* q <= 2^8 */ |
| u = num % (2 * s1); |
| s = (s1 << 8) + q; |
| r = (u << 8) | ((a >> (LIMB_BITS - 32)) & 0xff); |
| r -= q * q; |
| if ((slimb_t)r < 0) { |
| s--; |
| r += 2 * s + 1; |
| } |
| |
| #if LIMB_BITS == 64 |
| s1 = s; |
| r1 = r; |
| /* one more iteration for 64 -> 32 bit sqrt */ |
| num = (r1 << 16) | ((a >> (LIMB_BITS - 64 + 16)) & 0xffff); |
| q = num / (2 * s1); /* q <= 2^16 */ |
| u = num % (2 * s1); |
| s = (s1 << 16) + q; |
| r = (u << 16) | ((a >> (LIMB_BITS - 64)) & 0xffff); |
| r -= q * q; |
| if ((slimb_t)r < 0) { |
| s--; |
| r += 2 * s + 1; |
| } |
| #endif |
| *pr = r; |
| return s; |
| } |
| |
| /* return floor(sqrt(a)) */ |
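/* e.g. bf_isqrt(10) = 3 */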
| limb_t bf_isqrt(limb_t a) |
| { |
| limb_t s, r; |
| int k; |
| |
| if (a == 0) |
| return 0; |
| k = clz(a) & ~1; |
| s = mp_sqrtrem1(&r, a << k); |
| s >>= (k >> 1); |
| return s; |
| } |
| |
| static limb_t mp_sqrtrem2(limb_t *tabs, limb_t *taba) |
| { |
| limb_t s1, r1, s, q, u, a0, a1; |
| dlimb_t r, num; |
| int l; |
| |
| a0 = taba[0]; |
| a1 = taba[1]; |
| s1 = mp_sqrtrem1(&r1, a1); |
| l = LIMB_BITS / 2; |
| num = ((dlimb_t)r1 << l) | (a0 >> l); |
| q = num / (2 * s1); |
| u = num % (2 * s1); |
| s = (s1 << l) + q; |
| r = ((dlimb_t)u << l) | (a0 & (((limb_t)1 << l) - 1)); |
| if (unlikely((q >> l) != 0)) |
| r -= (dlimb_t)1 << LIMB_BITS; /* special case when q=2^l */ |
| else |
| r -= q * q; |
| if ((slimb_t)(r >> LIMB_BITS) < 0) { |
| s--; |
| r += 2 * (dlimb_t)s + 1; |
| } |
| tabs[0] = s; |
| taba[0] = r; |
| return r >> LIMB_BITS; |
| } |
| |
| //#define DEBUG_SQRTREM |
| |
/* tmp_buf must contain (n / 2 + 1) limbs. *prh receives the highest
   limb of the remainder. */
| static int mp_sqrtrem_rec(bf_context_t *s, limb_t *tabs, limb_t *taba, limb_t n, |
| limb_t *tmp_buf, limb_t *prh) |
| { |
| limb_t l, h, rh, ql, qh, c, i; |
| |
| if (n == 1) { |
| *prh = mp_sqrtrem2(tabs, taba); |
| return 0; |
| } |
| #ifdef DEBUG_SQRTREM |
| mp_print_str("a", taba, 2 * n); |
| #endif |
| l = n / 2; |
| h = n - l; |
| if (mp_sqrtrem_rec(s, tabs + l, taba + 2 * l, h, tmp_buf, &qh)) |
| return -1; |
| #ifdef DEBUG_SQRTREM |
| mp_print_str("s1", tabs + l, h); |
| mp_print_str_h("r1", taba + 2 * l, h, qh); |
| mp_print_str_h("r2", taba + l, n, qh); |
| #endif |
| |
| /* the remainder is in taba + 2 * l. Its high bit is in qh */ |
| if (qh) { |
| mp_sub(taba + 2 * l, taba + 2 * l, tabs + l, h, 0); |
| } |
| /* instead of dividing by 2*s, divide by s (which is normalized) |
| and update q and r */ |
| if (mp_divnorm(s, tmp_buf, taba + l, n, tabs + l, h)) |
| return -1; |
| qh += tmp_buf[l]; |
| for(i = 0; i < l; i++) |
| tabs[i] = tmp_buf[i]; |
| ql = mp_shr(tabs, tabs, l, 1, qh & 1); |
| qh = qh >> 1; /* 0 or 1 */ |
| if (ql) |
| rh = mp_add(taba + l, taba + l, tabs + l, h, 0); |
| else |
| rh = 0; |
| #ifdef DEBUG_SQRTREM |
| mp_print_str_h("q", tabs, l, qh); |
| mp_print_str_h("u", taba + l, h, rh); |
| #endif |
| |
| mp_add_ui(tabs + l, qh, h); |
| #ifdef DEBUG_SQRTREM |
    mp_print_str("s2", tabs, n);
| #endif |
| |
| /* q = qh, tabs[l - 1 ... 0], r = taba[n - 1 ... l] */ |
| /* subtract q^2. if qh = 1 then q = B^l, so we can take shortcuts */ |
| if (qh) { |
| c = qh; |
| } else { |
| if (mp_mul(s, taba + n, tabs, l, tabs, l)) |
| return -1; |
| c = mp_sub(taba, taba, taba + n, 2 * l, 0); |
| } |
| rh -= mp_sub_ui(taba + 2 * l, c, n - 2 * l); |
| if ((slimb_t)rh < 0) { |
| mp_sub_ui(tabs, 1, n); |
| rh += mp_add_mul1(taba, tabs, n, 2); |
| rh += mp_add_ui(taba, 1, n); |
| } |
| *prh = rh; |
| return 0; |
| } |
| |
/* 'taba' has 2*n limbs with n >= 1 and taba[2*n-1] >= 2 ^ (LIMB_BITS
   - 2). Return (s, r) with s=floor(sqrt(a)) and r=a-s^2. 0 <= r <= 2
   * s. tabs has n limbs. r is returned in the lower n limbs of taba
   and its high limb is stored in taba[n]. */
/* Algorithm from the article "Karatsuba Square Root" by Paul Zimmermann,
   inspired by its GMP implementation */
| int mp_sqrtrem(bf_context_t *s, limb_t *tabs, limb_t *taba, limb_t n) |
| { |
| limb_t tmp_buf1[8]; |
| limb_t *tmp_buf; |
| mp_size_t n2; |
| int ret; |
| n2 = n / 2 + 1; |
| if (n2 <= countof(tmp_buf1)) { |
| tmp_buf = tmp_buf1; |
| } else { |
| tmp_buf = bf_malloc(s, sizeof(limb_t) * n2); |
| if (!tmp_buf) |
| return -1; |
| } |
| ret = mp_sqrtrem_rec(s, tabs, taba, n, tmp_buf, taba + n); |
| if (tmp_buf != tmp_buf1) |
| bf_free(s, tmp_buf); |
| return ret; |
| } |
| |
| /* Integer square root with remainder. 'a' must be an integer. r = |
| floor(sqrt(a)) and rem = a - r^2. BF_ST_INEXACT is set if the result |
| is inexact. 'rem' can be NULL if the remainder is not needed. */ |
| int bf_sqrtrem(bf_t *r, bf_t *rem1, const bf_t *a) |
| { |
| int ret; |
| |
| if (a->len == 0) { |
| if (a->expn == BF_EXP_NAN) { |
| bf_set_nan(r); |
| } else if (a->expn == BF_EXP_INF && a->sign) { |
| goto invalid_op; |
| } else { |
| bf_set(r, a); |
| } |
| if (rem1) |
| bf_set_ui(rem1, 0); |
| ret = 0; |
| } else if (a->sign) { |
| invalid_op: |
| bf_set_nan(r); |
| if (rem1) |
| bf_set_ui(rem1, 0); |
| ret = BF_ST_INVALID_OP; |
| } else { |
| bf_t rem_s, *rem; |
| |
| bf_sqrt(r, a, (a->expn + 1) / 2, BF_RNDZ); |
| bf_rint(r, BF_RNDZ); |
| /* see if the result is exact by computing the remainder */ |
| if (rem1) { |
| rem = rem1; |
| } else { |
| rem = &rem_s; |
| bf_init(r->ctx, rem); |
| } |
| /* XXX: could avoid recomputing the remainder */ |
| bf_mul(rem, r, r, BF_PREC_INF, BF_RNDZ); |
| bf_neg(rem); |
| bf_add(rem, rem, a, BF_PREC_INF, BF_RNDZ); |
| if (bf_is_nan(rem)) { |
| ret = BF_ST_MEM_ERROR; |
| goto done; |
| } |
| if (rem->len != 0) { |
| ret = BF_ST_INEXACT; |
| } else { |
| ret = 0; |
| } |
| done: |
| if (!rem1) |
| bf_delete(rem); |
| } |
| return ret; |
| } |
| |
| int bf_sqrt(bf_t *r, const bf_t *a, limb_t prec, bf_flags_t flags) |
| { |
| bf_context_t *s = a->ctx; |
| int ret; |
| |
| assert(r != a); |
| |
| if (a->len == 0) { |
| if (a->expn == BF_EXP_NAN) { |
| bf_set_nan(r); |
| } else if (a->expn == BF_EXP_INF && a->sign) { |
| goto invalid_op; |
| } else { |
| bf_set(r, a); |
| } |
| ret = 0; |
| } else if (a->sign) { |
| invalid_op: |
| bf_set_nan(r); |
| ret = BF_ST_INVALID_OP; |
| } else { |
| limb_t *a1; |
| slimb_t n, n1; |
| limb_t res; |
| |
| /* convert the mantissa to an integer with at least 2 * |
| prec + 4 bits */ |
| n = (2 * (prec + 2) + 2 * LIMB_BITS - 1) / (2 * LIMB_BITS); |
| if (bf_resize(r, n)) |
| goto fail; |
| a1 = bf_malloc(s, sizeof(limb_t) * 2 * n); |
| if (!a1) |
| goto fail; |
| n1 = bf_min(2 * n, a->len); |
| memset(a1, 0, (2 * n - n1) * sizeof(limb_t)); |
| memcpy(a1 + 2 * n - n1, a->tab + a->len - n1, n1 * sizeof(limb_t)); |
| if (a->expn & 1) { |
| res = mp_shr(a1, a1, 2 * n, 1, 0); |
| } else { |
| res = 0; |
| } |
| if (mp_sqrtrem(s, r->tab, a1, n)) { |
| bf_free(s, a1); |
| goto fail; |
| } |
| if (!res) { |
| res = mp_scan_nz(a1, n + 1); |
| } |
| bf_free(s, a1); |
| if (!res) { |
| res = mp_scan_nz(a->tab, a->len - n1); |
| } |
| if (res != 0) |
| r->tab[0] |= 1; |
| r->sign = 0; |
| r->expn = (a->expn + 1) >> 1; |
| ret = bf_round(r, prec, flags); |
| } |
| return ret; |
| fail: |
| bf_set_nan(r); |
| return BF_ST_MEM_ERROR; |
| } |
| |
| static no_inline int bf_op2(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, |
| bf_flags_t flags, bf_op2_func_t *func) |
| { |
| bf_t tmp; |
| int ret; |
| |
| if (r == a || r == b) { |
| bf_init(r->ctx, &tmp); |
| ret = func(&tmp, a, b, prec, flags); |
| bf_move(r, &tmp); |
| } else { |
| ret = func(r, a, b, prec, flags); |
| } |
| return ret; |
| } |
| |
| int bf_add(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, |
| bf_flags_t flags) |
| { |
| return bf_op2(r, a, b, prec, flags, __bf_add); |
| } |
| |
| int bf_sub(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, |
| bf_flags_t flags) |
| { |
| return bf_op2(r, a, b, prec, flags, __bf_sub); |
| } |
| |
| int bf_div(bf_t *r, const bf_t *a, const bf_t *b, limb_t prec, |
| bf_flags_t flags) |
| { |
| return bf_op2(r, a, b, prec, flags, __bf_div); |
| } |
| |
| int bf_mul_ui(bf_t *r, const bf_t *a, uint64_t b1, limb_t prec, |
| bf_flags_t flags) |
| { |
| bf_t b; |
| int ret; |
| bf_init(r->ctx, &b); |
| ret = bf_set_ui(&b, b1); |
| ret |= bf_mul(r, a, &b, prec, flags); |
| bf_delete(&b); |
| return ret; |
| } |
| |
| int bf_mul_si(bf_t *r, const bf_t *a, int64_t b1, limb_t prec, |
| bf_flags_t flags) |
| { |
| bf_t b; |
| int ret; |
| bf_init(r->ctx, &b); |
| ret = bf_set_si(&b, b1); |
| ret |= bf_mul(r, a, &b, prec, flags); |
| bf_delete(&b); |
| return ret; |
| } |
| |
| int bf_add_si(bf_t *r, const bf_t *a, int64_t b1, limb_t prec, |
| bf_flags_t flags) |
| { |
| bf_t b; |
| int ret; |
| |
| bf_init(r->ctx, &b); |
| ret = bf_set_si(&b, b1); |
| ret |= bf_add(r, a, &b, prec, flags); |
| bf_delete(&b); |
| return ret; |
| } |
| |
| static int bf_pow_ui(bf_t *r, const bf_t *a, limb_t b, limb_t prec, |
| bf_flags_t flags) |
| { |
| int ret, n_bits, i; |
| |
| assert(r != a); |
| if (b == 0) |
| return bf_set_ui(r, 1); |
| ret = bf_set(r, a); |
| n_bits = LIMB_BITS - clz(b); |
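    /* square-and-multiply, scanning from the second-highest bit of b
       down; e.g. for b = 13 = 0b1101 the loop computes a^2, a^3, a^6,
       a^12 and finally a^13 */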
| for(i = n_bits - 2; i >= 0; i--) { |
| ret |= bf_mul(r, r, r, prec, flags); |
| if ((b >> i) & 1) |
| ret |= bf_mul(r, r, a, prec, flags); |
| } |
| return ret; |
| } |
| |
| static int bf_pow_ui_ui(bf_t *r, limb_t a1, limb_t b, |
| limb_t prec, bf_flags_t flags) |
| { |
| bf_t a; |
| int ret; |
| |
| if (a1 == 10 && b <= LIMB_DIGITS) { |
| /* use precomputed powers. We do not round at this point |
| because we expect the caller to do it */ |
| ret = bf_set_ui(r, mp_pow_dec[b]); |
| } else { |
| bf_init(r->ctx, &a); |
| ret = bf_set_ui(&a, a1); |
| ret |= bf_pow_ui(r, &a, b, prec, flags); |
| bf_delete(&a); |
| } |
| return ret; |
| } |
| |
| /* convert to integer (infinite precision) */ |
| int bf_rint(bf_t *r, int rnd_mode) |
| { |
| return bf_round(r, 0, rnd_mode | BF_FLAG_RADPNT_PREC); |
| } |
| |
| /* logical operations */ |
| #define BF_LOGIC_OR 0 |
| #define BF_LOGIC_XOR 1 |
| #define BF_LOGIC_AND 2 |
| |
| static inline limb_t bf_logic_op1(limb_t a, limb_t b, int op) |
| { |
| switch(op) { |
| case BF_LOGIC_OR: |
| return a | b; |
| case BF_LOGIC_XOR: |
| return a ^ b; |
| default: |
| case BF_LOGIC_AND: |
| return a & b; |
| } |
| } |
| |
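| /* bitwise operation on integers emulating two's complement: the bit |
| pattern of a negative x is the one's complement of |x + 1|, so a |
| negative operand is replaced by x + 1 and its limbs are inverted |
| with an all-ones XOR mask; a negative result is converted back by |
| decrementing it at the end */ |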
| static int bf_logic_op(bf_t *r, const bf_t *a1, const bf_t *b1, int op) |
| { |
| bf_t b1_s, a1_s, *a, *b; |
| limb_t a_sign, b_sign, r_sign; |
| slimb_t l, i, a_bit_offset, b_bit_offset; |
| limb_t v1, v2, v1_mask, v2_mask, r_mask; |
| int ret; |
| |
| assert(r != a1 && r != b1); |
| |
| if (a1->expn <= 0) |
| a_sign = 0; /* minus zero is treated as positive */ |
| else |
| a_sign = a1->sign; |
| |
| if (b1->expn <= 0) |
| b_sign = 0; /* minus zero is treated as positive */ |
| else |
| b_sign = b1->sign; |
| |
| if (a_sign) { |
| a = &a1_s; |
| bf_init(r->ctx, a); |
| if (bf_add_si(a, a1, 1, BF_PREC_INF, BF_RNDZ)) { |
| b = NULL; |
| goto fail; |
| } |
| } else { |
| a = (bf_t *)a1; |
| } |
| |
| if (b_sign) { |
| b = &b1_s; |
| bf_init(r->ctx, b); |
| if (bf_add_si(b, b1, 1, BF_PREC_INF, BF_RNDZ)) |
| goto fail; |
| } else { |
| b = (bf_t *)b1; |
| } |
| |
| r_sign = bf_logic_op1(a_sign, b_sign, op); |
| if (op == BF_LOGIC_AND && r_sign == 0) { |
| /* for AND with a non-negative result, the bits beyond the end of |
| a non-negative operand are all zero and need not be computed */ |
| if (a_sign == 0 && b_sign == 0) |
| l = bf_min(a->expn, b->expn); |
| else if (a_sign == 0) |
| l = a->expn; |
| else |
| l = b->expn; |
| } else { |
| l = bf_max(a->expn, b->expn); |
| } |
| /* Note: a or b can be zero */ |
| l = (bf_max(l, 1) + LIMB_BITS - 1) / LIMB_BITS; |
| if (bf_resize(r, l)) |
| goto fail; |
| a_bit_offset = a->len * LIMB_BITS - a->expn; |
| b_bit_offset = b->len * LIMB_BITS - b->expn; |
| v1_mask = -a_sign; |
| v2_mask = -b_sign; |
| r_mask = -r_sign; |
| for(i = 0; i < l; i++) { |
| v1 = get_bits(a->tab, a->len, a_bit_offset + i * LIMB_BITS) ^ v1_mask; |
| v2 = get_bits(b->tab, b->len, b_bit_offset + i * LIMB_BITS) ^ v2_mask; |
| r->tab[i] = bf_logic_op1(v1, v2, op) ^ r_mask; |
| } |
| r->expn = l * LIMB_BITS; |
| r->sign = r_sign; |
| bf_normalize_and_round(r, BF_PREC_INF, BF_RNDZ); /* cannot fail */ |
| if (r_sign) { |
| if (bf_add_si(r, r, -1, BF_PREC_INF, BF_RNDZ)) |
| goto fail; |
| } |
| ret = 0; |
| done: |
| if (a == &a1_s) |
| bf_delete(a); |
| if (b == &b1_s) |
| bf_delete(b); |
| return ret; |
| fail: |
| bf_set_nan(r); |
| ret = BF_ST_MEM_ERROR; |
| goto done; |
| } |
| |
| /* 'a' and 'b' must be integers. Return 0 or BF_ST_MEM_ERROR. */ |
| int bf_logic_or(bf_t *r, const bf_t *a, const bf_t *b) |
| { |
| return bf_logic_op(r, a, b, BF_LOGIC_OR); |
| } |
| |
| /* 'a' and 'b' must be integers. Return 0 or BF_ST_MEM_ERROR. */ |
| int bf_logic_xor(bf_t *r, const bf_t *a, const bf_t *b) |
| { |
| return bf_logic_op(r, a, b, BF_LOGIC_XOR); |
| } |
| |
| /* 'a' and 'b' must be integers. Return 0 or BF_ST_MEM_ERROR. */ |
| int bf_logic_and(bf_t *r, const bf_t *a, const bf_t *b) |
| { |
| return bf_logic_op(r, a, b, BF_LOGIC_AND); |
| } |
| |
| /* conversion between fixed size types */ |
| |
| typedef union { |
| double d; |
| uint64_t u; |
| } Float64Union; |
| |
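| /* convert 'a' to an IEEE 754 binary64 value: round a copy to 53 bits |
| with an 11 bit exponent range (subnormals included), then pack the |
| sign, the biased exponent and the 52 bit fraction */ |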
| int bf_get_float64(const bf_t *a, double *pres, bf_rnd_t rnd_mode) |
| { |
| Float64Union u; |
| int e, ret; |
| uint64_t m; |
| |
| ret = 0; |
| if (a->expn == BF_EXP_NAN) { |
| u.u = 0x7ff8000000000000; /* quiet nan */ |
| } else { |
| bf_t b_s, *b = &b_s; |
| |
| bf_init(a->ctx, b); |
| bf_set(b, a); |
| if (bf_is_finite(b)) { |
| ret = bf_round(b, 53, rnd_mode | BF_FLAG_SUBNORMAL | bf_set_exp_bits(11)); |
| } |
| if (b->expn == BF_EXP_INF) { |
| e = (1 << 11) - 1; |
| m = 0; |
| } else if (b->expn == BF_EXP_ZERO) { |
| e = 0; |
| m = 0; |
| } else { |
| e = b->expn + 1023 - 1; |
| #if LIMB_BITS == 32 |
| if (b->len == 2) { |
| m = ((uint64_t)b->tab[1] << 32) | b->tab[0]; |
| } else { |
| m = ((uint64_t)b->tab[0] << 32); |
| } |
| #else |
| m = b->tab[0]; |
| #endif |
| if (e <= 0) { |
| /* subnormal */ |
| m = m >> (12 - e); |
| e = 0; |
| } else { |
| m = (m << 1) >> 12; |
| } |
| } |
| u.u = m | ((uint64_t)e << 52) | ((uint64_t)b->sign << 63); |
| bf_delete(b); |
| } |
| *pres = u.d; |
| return ret; |
| } |
| |
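| /* set 'a' from the IEEE 754 binary64 value 'd'; the conversion is |
| exact. Subnormal inputs are normalized by shifting out the leading |
| zeros of the fraction and adjusting the exponent accordingly */ |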
| int bf_set_float64(bf_t *a, double d) |
| { |
| Float64Union u; |
| uint64_t m; |
| int shift, e, sgn; |
| |
| u.d = d; |
| sgn = u.u >> 63; |
| e = (u.u >> 52) & ((1 << 11) - 1); |
| m = u.u & (((uint64_t)1 << 52) - 1); |
| if (e == ((1 << 11) - 1)) { |
| if (m != 0) { |
| bf_set_nan(a); |
| } else { |
| bf_set_inf(a, sgn); |
| } |
| } else if (e == 0) { |
| if (m == 0) { |
| bf_set_zero(a, sgn); |
| } else { |
| /* subnormal number */ |
| m <<= 12; |
| shift = clz64(m); |
| m <<= shift; |
| e = -shift; |
| goto norm; |
| } |
| } else { |
| m = (m << 11) | ((uint64_t)1 << 63); |
| norm: |
| a->expn = e - 1023 + 1; |
| #if LIMB_BITS == 32 |
| if (bf_resize(a, 2)) |
| goto fail; |
| a->tab[0] = m; |
| a->tab[1] = m >> 32; |
| #else |
| if (bf_resize(a, 1)) |
| goto fail; |
| a->tab[0] = m; |
| #endif |
| a->sign = sgn; |
| } |
| return 0; |
| fail: |
| bf_set_nan(a); |
| return BF_ST_MEM_ERROR; |
| } |
| |
| /* The rounding mode is always BF_RNDZ (truncation). Return |
| BF_ST_INVALID_OP if there is an overflow and 0 otherwise. With |
| BF_GET_INT_MOD, finite out of range values are instead reduced |
| modulo 2^32 and no overflow is signaled. */ |
| int bf_get_int32(int *pres, const bf_t *a, int flags) |
| { |
| uint32_t v; |
| int ret; |
| if (a->expn >= BF_EXP_INF) { |
| ret = BF_ST_INVALID_OP; |
| if (flags & BF_GET_INT_MOD) { |
| v = 0; |
| } else if (a->expn == BF_EXP_INF) { |
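| /* saturate: INT32_MAX for +inf, INT32_MAX + 1 (= INT32_MIN as an |
| int) for -inf */ |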
| v = (uint32_t)INT32_MAX + a->sign; |
| } else { |
| v = INT32_MAX; |
| } |
| } else if (a->expn <= 0) { |
| v = 0; |
| ret = 0; |
| } else if (a->expn <= 31) { |
| v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn); |
| if (a->sign) |
| v = -v; |
| ret = 0; |
| } else if (!(flags & BF_GET_INT_MOD)) { |
| ret = BF_ST_INVALID_OP; |
| if (a->sign) { |
| v = (uint32_t)INT32_MAX + 1; |
| if (a->expn == 32 && |
| (a->tab[a->len - 1] >> (LIMB_BITS - 32)) == v) { |
| ret = 0; |
| } |
| } else { |
| v = INT32_MAX; |
| } |
| } else { |
| v = get_bits(a->tab, a->len, a->len * LIMB_BITS - a->expn); |
| if (a->sign) |
| v = -v; |
| ret = 0; |
| } |
| *pres = v; |
| return ret; |
| } |
| |
| /* The rounding mode is always BF_RNDZ (truncation). Return |
| BF_ST_INVALID_OP if there is an overflow and 0 otherwise. With |
| BF_GET_INT_MOD, finite out of range values are instead reduced |
| modulo 2^64 and no overflow is signaled. */ |
| int bf_get_int64(int64_t *pres, const bf_t *a, int flags) |
| { |
| uint64_t v; |
| int ret; |
| if (a->expn >= BF_EXP_INF) { |
| ret = BF_ST_INVALID_OP; |
| if (flags & BF_GET_INT_MOD) { |
| v = 0; |
| } else if (a->expn == BF_EXP_INF) { |
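| /* saturate: INT64_MAX for +inf, INT64_MAX + 1 (= INT64_MIN as an |
| int64_t) for -inf */ |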
| v = (uint64_t)INT64_MAX + a->sign; |
| } else { |
| v = INT64_MAX; |
| } |
| } else if (a->expn <= 0) { |
| v = 0; |
| ret = 0; |
| } else if (a->expn <= 63) { |
| #if LIMB_BITS == 32 |
| if (a->expn <= 32) |
| v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn); |
| else |
| v = (((uint64_t)a->tab[a->len - 1] << 32) | |
| get_limbz(a, a->len - 2)) >> (64 - a->expn); |
| #else |
| v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn); |
| #endif |
| if (a->sign) |
| v = -v; |
| ret = 0; |
| } else if (!(flags & BF_GET_INT_MOD)) { |
| ret = BF_ST_INVALID_OP; |
| if (a->sign) { |
| uint64_t v1; |
| v = (uint64_t)INT64_MAX + 1; |
| if (a->expn == 64) { |
| v1 = a->tab[a->len - 1]; |
| #if LIMB_BITS == 32 |
| v1 = (v1 << 32) | get_limbz(a, a->len - 2); |
| #endif |
| if (v1 == v) |
| ret = 0; |
| } |
| } else { |
| v = INT64_MAX; |
| } |
| } else { |
| slimb_t bit_pos = a->len * LIMB_BITS - a->expn; |
| v = get_bits(a->tab, a->len, bit_pos); |
| #if LIMB_BITS == 32 |
| v |= (uint64_t)get_bits(a->tab, a->len, bit_pos + 32) << 32; |
| #endif |
| if (a->sign) |
| v = -v; |
| ret = 0; |
| } |
| *pres = v; |
| return ret; |
| } |
| |
| /* The rounding mode is always BF_RNDZ (truncation). Return |
| BF_ST_INVALID_OP if the value does not fit in a uint64_t (negative, |
| NaN or >= 2^64) and 0 otherwise. */ |
| int bf_get_uint64(uint64_t *pres, const bf_t *a) |
| { |
| uint64_t v; |
| int ret; |
| if (a->expn == BF_EXP_NAN) { |
| goto overflow; |
| } else if (a->expn <= 0) { |
| v = 0; |
| ret = 0; |
| } else if (a->sign) { |
| v = 0; |
| ret = BF_ST_INVALID_OP; |
| } else if (a->expn <= 64) { |
| #if LIMB_BITS == 32 |
| if (a->expn <= 32) |
| v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn); |
| else |
| v = (((uint64_t)a->tab[a->len - 1] << 32) | |
| get_limbz(a, a->len - 2)) >> (64 - a->expn); |
| #else |
| v = a->tab[a->len - 1] >> (LIMB_BITS - a->expn); |
| #endif |
| ret = 0; |
| } else { |
| overflow: |
| v = UINT64_MAX; |
| ret = BF_ST_INVALID_OP; |
| } |
| *pres = v; |
| return ret; |
| } |
| |
| /* base conversion from an arbitrary radix */ |
| |
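| /* number of base-'r' digits that fit in one limb, i.e. |
| floor(LIMB_BITS / log2(r)), indexed by r - 2 for r = 2..36 */ |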
| static const uint8_t digits_per_limb_table[BF_RADIX_MAX - 1] = { |
| #if LIMB_BITS == 32 |
| 32,20,16,13,12,11,10,10, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, |
| #else |
| 64,40,32,27,24,22,21,20,19,18,17,17,16,16,16,15,15,15,14,14,14,14,13,13,13,13,13,13,13,12,12,12,12,12,12, |
| #endif |
| }; |
| |
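| /* return the largest power of 'radix' representable in one limb, |
| i.e. radix^digits_per_limb_table[radix - 2] */ |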
| static limb_t get_limb_radix(int radix) |
| { |
| int i, k; |
| limb_t radixl; |
| |
| k = digits_per_limb_table[radix - 2]; |
| radixl = radix; |
| for(i = 1; i < k; i++) |
| radixl *= radix; |
| return radixl; |
| } |
| |
| /* Convert the 'n' digits of 'tab' (one base-'radix' digit per limb) |
| to the integer 'r' by divide and conquer: the digit string is split |
| in two halves which are converted recursively and recombined using |
| the powers of the radix cached in 'pow_tab'. Return != 0 if error. */ |
| static int bf_integer_from_radix_rec(bf_t *r, const limb_t *tab, |
| limb_t n, int level, limb_t n0, |
| limb_t radix, bf_t *pow_tab) |
| { |
| int ret; |
| if (n == 1) { |
| ret = bf_set_ui(r, tab[0]); |
| } else { |
| bf_t T_s, *T = &T_s, *B; |
| limb_t n1, n2; |
| |
| n2 = (((n0 * 2) >> (level + 1)) + 1) / 2; |
| n1 = n - n2; |
| // printf("level=%d n0=%ld n1=%ld n2=%ld\n", level, n0, n1, n2); |
| B = &pow_tab[level]; |
|