blob: eaf6744463f3a729f2ef7df72468fed0469aacdc [file] [log] [blame]
/*************************************************************************/
/* */
/* Language Technologies Institute */
/* Carnegie Mellon University */
/* Copyright (c) 1999 */
/* All Rights Reserved. */
/* */
/* Permission is hereby granted, free of charge, to use and distribute */
/* this software and its documentation without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of this work, and to */
/* permit persons to whom this work is furnished to do so, subject to */
/* the following conditions: */
/* 1. The code must retain the above copyright notice, this list of */
/* conditions and the following disclaimer. */
/* 2. Any modifications must be clearly marked as such. */
/* 3. Original authors' names are not deleted. */
/* 4. The authors' names are not used to endorse or promote products */
/* derived from this software without specific prior written */
/* permission. */
/* */
/* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
/* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
/* THIS SOFTWARE. */
/* */
/*************************************************************************/
/* Author: Alan W Black (awb@cs.cmu.edu) */
/* Date: December 1999 */
/*************************************************************************/
/* */
/* Typed values */
/* */
/*************************************************************************/
#include "cst_math.h"
#include "cst_file.h"
#include "cst_val.h"
#include "cst_string.h"
#include "cst_tokenstream.h"
static cst_val *new_val()
{
return cst_alloc(struct cst_val_struct,1);
}
cst_val *int_val(int i)
{
cst_val *v = new_val();
CST_VAL_TYPE(v) = CST_VAL_TYPE_INT;
CST_VAL_INT(v) = i;
return v;
}
cst_val *float_val(float f)
{
cst_val *v = new_val();
CST_VAL_TYPE(v) = CST_VAL_TYPE_FLOAT;
CST_VAL_FLOAT(v) = f;
return v;
}
cst_val *string_val(const char *s)
{
cst_val *v = new_val();
CST_VAL_TYPE(v) = CST_VAL_TYPE_STRING;
/* would be nice to note if this is a deletable string or not */
CST_VAL_STRING_LVAL(v) = cst_strdup(s);
return v;
}
cst_val *cons_val(const cst_val *a, const cst_val *b)
{
cst_val *v = new_val();
CST_VAL_CAR(v)=((!a || cst_val_consp(a)) ?
(cst_val *)(void *)a:val_inc_refcount(a));
CST_VAL_CDR(v)=((!b || cst_val_consp(b)) ?
(cst_val *)(void *)b:val_inc_refcount(b));
return v;
}
cst_val *val_new_typed(int type,void *vv)
{
cst_val *v = new_val();
CST_VAL_TYPE(v) = type;
CST_VAL_VOID(v) = vv;
return v;
}
void delete_val_list(cst_val *v)
{
if (v)
{
if (cst_val_consp(v))
{
delete_val_list(CST_VAL_CDR(v));
cst_free(v);
}
else
delete_val(v);
}
}
void delete_val(cst_val *v)
{
if (v)
{
if (cst_val_consp(v))
{
delete_val(CST_VAL_CAR(v));
delete_val(CST_VAL_CDR(v));
cst_free(v);
}
else if (val_dec_refcount(v) == 0)
{
if (CST_VAL_TYPE(v) == CST_VAL_TYPE_STRING)
cst_free(CST_VAL_VOID(v));
else if (CST_VAL_TYPE(v) >= CST_VAL_TYPE_FIRST_FREE)
{
if (cst_val_defs[CST_VAL_TYPE(v)/2].delete_function)
(cst_val_defs[CST_VAL_TYPE(v)/2].delete_function)
(CST_VAL_VOID(v));
}
cst_free(v);
}
}
}
/* Accessor functions */
int val_int(const cst_val *v)
{
if (v && (CST_VAL_TYPE(v) == CST_VAL_TYPE_INT))
return CST_VAL_INT(v);
else if (v && (CST_VAL_TYPE(v) == CST_VAL_TYPE_FLOAT))
return (int)CST_VAL_FLOAT(v);
else if (v && (CST_VAL_TYPE(v) == CST_VAL_TYPE_STRING))
return atoi(CST_VAL_STRING(v));
else
{
cst_errmsg("VAL: tried to access int in %d typed val\n",
(v ? CST_VAL_TYPE(v) : -1));
cst_error();
}
return 0;
}
float val_float(const cst_val *v)
{
if (v && (CST_VAL_TYPE(v) == CST_VAL_TYPE_INT))
return (float)CST_VAL_INT(v);
else if (v && (CST_VAL_TYPE(v) == CST_VAL_TYPE_FLOAT))
return CST_VAL_FLOAT(v);
else if (v && (CST_VAL_TYPE(v) == CST_VAL_TYPE_STRING))
return cst_atof(CST_VAL_STRING(v));
else
{
cst_errmsg("VAL: tried to access float in %d typed val\n",
(v ? CST_VAL_TYPE(v) : -1));
cst_error();
}
return 0;
}
const char *val_string(const cst_val *v)
{
if (v && (CST_VAL_TYPE(v) == CST_VAL_TYPE_STRING))
return CST_VAL_STRING(v);
else
{
cst_errmsg("VAL: tried to access string in %d typed val\n",
(v ? CST_VAL_TYPE(v) : -1));
cst_error();
}
return 0;
}
const cst_val *val_car(const cst_val *v)
{
if (v && cst_val_consp(v))
return CST_VAL_CAR(v);
else
{
cst_errmsg("VAL: tried to access car in %d typed val\n",
(v ? CST_VAL_TYPE(v) : -1));
cst_error();
}
return 0;
}
const cst_val *val_cdr(const cst_val *v)
{
if (v && cst_val_consp(v))
return CST_VAL_CDR(v);
else
{
cst_errmsg("VAL: tried to access cdr in %d typed val\n",
(v ? CST_VAL_TYPE(v) : -1));
cst_error();
}
return 0;
}
void *val_generic(const cst_val *v, int type, const char *stype)
{ /* a generic access function that checks the expected type */
if (v && CST_VAL_TYPE(v) == type)
return CST_VAL_VOID(v);
else
{
cst_errmsg("VAL: tried to access %s in %d type val\n",
stype,
(v ? CST_VAL_TYPE(v) : -1));
cst_error();
}
return NULL;
}
void *val_void(const cst_val *v)
{
/* The scary, do anything function, this shouldn't be called by mortals */
if ((v == NULL) ||
(CST_VAL_TYPE(v) == CST_VAL_TYPE_CONS) ||
(CST_VAL_TYPE(v) == CST_VAL_TYPE_INT) ||
(CST_VAL_TYPE(v) == CST_VAL_TYPE_FLOAT))
{
cst_errmsg("VAL: tried to access void in %d typed val\n",
(v ? CST_VAL_TYPE(v) : -1));
cst_error();
return NULL;
}
else
return CST_VAL_VOID(v);
}
int cst_val_consp(const cst_val *v)
{
/* To keep a val cell down to 8 bytes we identify non-cons cells */
/* with non-zero values in the least significant bit of the first */
/* address in the cell (this is a standard technique used on Lisp */
/* machines) */
#if 0
void *t;
int t1;
/* Hmm this still isn't right (it can be) but this isn't it */
t = CST_VAL_CAR(v);
t1 = *(int *)&t;
if ((t1&0x1) == 0)
return TRUE;
else
return FALSE;
#endif
const cst_val_atom *t;
t = (const cst_val_atom *)v;
if (t->type % 2 == 0)
return TRUE;
else
return FALSE;
}
const cst_val *set_cdr(cst_val *v1, const cst_val *v2)
{
/* destructive set cdr, be careful you have a pointer to current cdr */
if (!cst_val_consp(v1))
{
cst_errmsg("VAL: tried to set cdr of non-consp cell\n");
cst_error();
return NULL;
}
else
{
if (CST_VAL_CDR(v1))
{
val_dec_refcount(CST_VAL_CDR(v1));
val_inc_refcount(v1);
}
CST_VAL_CDR(v1) = (cst_val *)v2;
}
return v1;
}
const cst_val *set_car(cst_val *v1, const cst_val *v2)
{
/* destructive set car, be careful you have a pointer to current car */
if (!cst_val_consp(v1))
{
cst_errmsg("VAL: tried to set car of non-consp cell\n");
cst_error();
return NULL;
}
else
{
val_dec_refcount(CST_VAL_CAR(v1));
val_inc_refcount(v1);
CST_VAL_CAR(v1) = (cst_val *)v2;
}
return v1;
}
void val_print(cst_file fd,const cst_val *v)
{
const cst_val *p;
if (v == NULL)
cst_fprintf(fd,"[null]");
else if (CST_VAL_TYPE(v) == CST_VAL_TYPE_INT)
cst_fprintf(fd,"%d",val_int(v));
else if (CST_VAL_TYPE(v) == CST_VAL_TYPE_FLOAT)
cst_fprintf(fd,"%f",val_float(v));
else if (CST_VAL_TYPE(v) == CST_VAL_TYPE_STRING)
cst_fprintf(fd,"%s",val_string(v));
else if (cst_val_consp(v))
{
cst_fprintf(fd,"(");
for (p=v; p; )
{
val_print(fd,val_car(p));
p=val_cdr(p);
if (p)
cst_fprintf(fd," ");
if (p && !cst_val_consp(p)) /* dotted pairs for non-list */
{
cst_fprintf(fd,". ");
val_print(fd,p);
break;
}
}
cst_fprintf(fd,")");
}
else
cst_fprintf(fd,"[Val %s 0x%p]",
cst_val_defs[CST_VAL_TYPE(v)/2].name,CST_VAL_VOID(v));
}
cst_val *val_reverse(cst_val *l)
{ /* destructively reverse the list */
cst_val *n,*np,*nl;
for (nl=0,n=l; n; nl=n,n=np)
{
np=CST_VAL_CDR(n);
CST_VAL_CDR(n) = nl;
}
return nl;
}
cst_val *val_append(cst_val *l1, cst_val *l2)
{
/* Destructively add l2 to the end of l1 return l1 */
cst_val *t;
if (l1 == 0)
return l2;
else
{
for (t=l1; val_cdr(t); t=CST_VAL_CDR(t));
CST_VAL_CDR(t) = l2;
return l1;
}
}
int val_length(const cst_val *l)
{
const cst_val *n;
int i;
for (i=0,n=l; n; n=val_cdr(n))
i++;
return i;
}
int val_equal(const cst_val *v1, const cst_val *v2)
{
if (v1 == v2)
return TRUE; /* its eq so its equal */
else if (v1 == 0)
return FALSE;
else if (CST_VAL_TYPE(v1) == CST_VAL_TYPE(v2))
{
if (cst_val_consp(v1))
return ((val_equal(val_car(v1),val_car(v2))) &&
(val_equal(val_cdr(v1),val_cdr(v2))));
else if (CST_VAL_TYPE(v1) == CST_VAL_TYPE_INT)
return (val_int(v1) == val_int(v2));
else if (CST_VAL_TYPE(v1) == CST_VAL_TYPE_FLOAT)
return (val_float(v1) == val_float(v2));
else if (CST_VAL_TYPE(v1) == CST_VAL_TYPE_STRING)
return (cst_streq(CST_VAL_STRING(v1),CST_VAL_STRING(v2)));
else
return CST_VAL_VOID(v1) == CST_VAL_VOID(v2);
}
else
return FALSE;
}
int val_less(const cst_val *v1, const cst_val *v2)
{
return val_float(v1) < val_float(v2);
}
int val_greater(const cst_val *v1,const cst_val *v2)
{
return val_float(v1) > val_float(v2);
}
int val_member(const cst_val *v1,const cst_val *l)
{
const cst_val *i;
for (i=l; i; i=val_cdr(i))
{
if (val_equal(val_car(i),v1))
return TRUE;
}
return FALSE;
}
int val_member_string(const char *v1,const cst_val *l)
{
const cst_val *i;
for (i=l; i; i=val_cdr(i))
{
if (cst_streq(v1,val_string(val_car(i))))
return TRUE;
}
return FALSE;
}
cst_val *val_inc_refcount(const cst_val *b)
{
cst_val *wb;
/* Well I was lying, they're not really const, but this is the place */
/* where breaking const is reasonable */
wb = (cst_val *)(void *)b;
if (CST_VAL_REFCOUNT(wb) == -1)
/* or is a cons cell in the text segment, how do I do that ? */
return wb;
else if (!cst_val_consp(wb)) /* we don't ref count cons cells */
CST_VAL_REFCOUNT(wb) += 1;
return wb;
}
int val_dec_refcount(const cst_val *b)
{
cst_val *wb;
wb = (cst_val *)(void *)b;
if (CST_VAL_REFCOUNT(wb) == -1)
/* or is a cons cell in the text segment, how do I do that ? */
return -1;
else if (cst_val_consp(wb)) /* we don't ref count cons cells */
return 0;
else if (CST_VAL_REFCOUNT(wb) == 0)
{
/* Otherwise, trying to free a val outside an
item/relation/etc has rather the opposite effect from what
you might have intended... */
return 0;
}
else
{
CST_VAL_REFCOUNT(wb) -= 1;
return CST_VAL_REFCOUNT(wb);
}
}
int utf8_sequence_length(char c0)
{
// Get the expected length of UTF8 sequence given its most
// significant byte
return (( 0xE5000000 >> (( c0 >> 3 ) & 0x1E )) & 3 ) + 1;
}
cst_val *cst_utf8_explode(const cst_string *utf8string)
{
// Return a list of utf8 characters as strings
cst_val *chars=NULL;
const unsigned char *str = (const unsigned char*)utf8string;
char utf8char[5];
char c0;
int charlength;
while ((c0 = *str))
{
charlength = utf8_sequence_length(c0);
snprintf(utf8char, charlength + 1, "%s", str);
chars = cons_val(string_val(utf8char),chars);
str += charlength;
}
return val_reverse(chars);
}
static int utf8_ord(const char *utf8_seq) {
unsigned int len;
int ord;
unsigned char c0, c1, c2, c3; // Potential bytes in the UTF8 symbol
c0 = utf8_seq[0];
len = utf8_sequence_length(c0);
// Make sure the string sequence we received matches with the
// expected length, and that the expected length is nonzero.
if ( (len == 0) ||
(len != strlen(utf8_seq))) {
return -1;
}
if (len == 1) {
// ASCII sequence.
return c0;
}
c1 = utf8_seq[1];
if (len == 2) {
ord = ((c0 & 0x1F) << 6) | (c1 & 0x3F);
if (ord < 0x80)
return -1;
return ord;
}
c2 = utf8_seq[2];
if (len == 3) {
if ((c2 & 0xC0) != 0x80)
return -1;
ord = ((c0 & 0x0F) << 12) | ((c1 & 0x3F) << 6) | (c2 & 0x3F);
if (ord < 0x800 ||
(ord >= 0xD800 && ord <= 0xDFFF))
return -1;
return ord;
}
c3 = utf8_seq[3];
if (len == 4) {
if ((c3 & 0xC0) != 0x80)
return -1;
ord =
((c0 & 0x7) << 18) | ((c1 & 0x3F) << 12) |
((c2 & 0x3F) << 6) | (c3 & 0x3F);
if (ord < 0x10000 || ord > 0x10FFFF)
return -1;
return ord;
}
return -1;
}
cst_val *cst_utf8_ord(const cst_val *utf8_char) {
const char *ch=(const char *)val_string(utf8_char);
return int_val(utf8_ord(ch));
}
int cst_utf8_ord_string(const char *utf8_char)
{
return utf8_ord(utf8_char);
}
static int utf8_chr(int ord, char* utf8char) {
unsigned int utf8len;
int i = 0;
if (ord < 0x80) {
utf8len = 1;
} else if (ord < 0x800) {
utf8len = 2;
} else if (ord <= 0xFFFF) {
utf8len = 3;
} else if (ord <= 0x200000) {
utf8len = 4;
} else {
// Replace invalid character with FFFD
utf8len = 2;
ord = 0xFFFD;
}
i = utf8len; // Index into utf8char
utf8char[i--] = 0;
switch (utf8len) {
// These fallthrough deliberately
case 6:
utf8char[i--] = (ord | 0x80) & 0xBF;
ord >>= 6;
case 5:
utf8char[i--] = (ord | 0x80) & 0xBF;
ord >>= 6;
case 4:
utf8char[i--] = (ord | 0x80) & 0xBF;
ord >>= 6;
case 3:
utf8char[i--] = (ord | 0x80) & 0xBF;
ord >>= 6;
case 2:
utf8char[i--] = (ord | 0x80) & 0xBF;
ord >>= 6;
case 1:
switch (utf8len) {
case 0:
case 1:
utf8char[i--] = ord;
break;
case 2:
utf8char[i--] = ord | 0xC0;
break;
case 3:
utf8char[i--] = ord | 0xE0;
break;
case 4:
utf8char[i--] = ord | 0xF0;
}
}
return utf8len;
}
cst_val *cst_utf8_chr(const cst_val *ord) {
char ch[5];
int utf8len;
utf8len = utf8_chr(val_int(ord),ch);
if (utf8len == 0) {
return 0;
}
return string_val(ch);
}
int val_stringp(const cst_val *v)
{
if (cst_val_consp(v))
return FALSE;
else if (CST_VAL_TYPE(v) == CST_VAL_TYPE_STRING)
return TRUE;
else
return FALSE;
}
const cst_val *val_assoc_string(const char *v1,const cst_val *al)
{
const cst_val *i;
for (i=al; i; i=val_cdr(i))
{
if (cst_streq(v1,val_string(val_car(val_car(i)))))
return val_car(i);
}
return NULL;
}
cst_string *cst_implode(const cst_val *sl)
{
const cst_val *v;
int l=0;
char *s;
for (v=sl; v; v=val_cdr(v))
{
if (val_stringp(val_car(v)))
l += cst_strlen(val_string(val_car(v)));
}
s = cst_alloc(cst_string,l+1);
for (v=sl; v; v=val_cdr(v))
{
if (val_stringp(val_car(v)))
cst_sprintf(s,"%s%s",s,val_string(val_car(v)));
}
return s;
}
cst_val *val_readlist_string(const char *str)
{ /* not fully general but a good start */
cst_tokenstream *ts;
cst_val *v = NULL;
const char *p;
ts = ts_open_string(str,
cst_ts_default_whitespacesymbols,
"",
"",
"");
while (!ts_eof(ts))
{
p = ts_get(ts);
v = cons_val(string_val(p),v);
}
ts_close(ts);
return val_reverse(v);
}