| /*************************************************************************/ |
| /* */ |
| /* Language Technologies Institute */ |
| /* Carnegie Mellon University */ |
| /* Copyright (c) 1999 */ |
| /* All Rights Reserved. */ |
| /* */ |
| /* Permission is hereby granted, free of charge, to use and distribute */ |
| /* this software and its documentation without restriction, including */ |
| /* without limitation the rights to use, copy, modify, merge, publish, */ |
| /* distribute, sublicense, and/or sell copies of this work, and to */ |
| /* permit persons to whom this work is furnished to do so, subject to */ |
| /* the following conditions: */ |
| /* 1. The code must retain the above copyright notice, this list of */ |
| /* conditions and the following disclaimer. */ |
| /* 2. Any modifications must be clearly marked as such. */ |
| /* 3. Original authors' names are not deleted. */ |
| /* 4. The authors' names are not used to endorse or promote products */ |
| /* derived from this software without specific prior written */ |
| /* permission. */ |
| /* */ |
| /* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */ |
| /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */ |
| /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */ |
| /* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */ |
| /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */ |
| /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */ |
| /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */ |
| /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */ |
| /* THIS SOFTWARE. */ |
| /* */ |
| /*************************************************************************/ |
| /* Author: Alan W Black (awb@cs.cmu.edu) */ |
| /* Date: July 1999 */ |
| /*************************************************************************/ |
| /* */ |
| /* Tokenizer for strings and files */ |
| /* */ |
| /*************************************************************************/ |
| #include "cst_tokenstream.h" |
| |
| const cst_string * const cst_ts_default_whitespacesymbols = " \t\n\r"; |
| const cst_string * const cst_ts_default_singlecharsymbols = "(){}[]"; |
| const cst_string * const cst_ts_default_prepunctuationsymbols = "\"'`({["; |
| const cst_string * const cst_ts_default_postpunctuationsymbols = "\"'`.,:;!?(){}[]"; |
| |
| #define TS_BUFFER_SIZE 256 |
| |
| static cst_string ts_getc(cst_tokenstream *ts); |
| static cst_string internal_ts_getc(cst_tokenstream *ts); |
| |
| static void set_charclass_table(cst_tokenstream *ts) |
| { |
| int i; |
| memset(ts->charclass,0,256); /* zero everything */ |
| |
| for (i=0; ts->p_whitespacesymbols[i]; i++) |
| ts->charclass[(unsigned char)ts->p_whitespacesymbols[i]] |= TS_CHARCLASS_WHITESPACE; |
| for (i=0; ts->p_singlecharsymbols[i]; i++) |
| ts->charclass[(unsigned char)ts->p_singlecharsymbols[i]] |= TS_CHARCLASS_SINGLECHAR; |
| for (i=0; ts->p_prepunctuationsymbols[i]; i++) |
| ts->charclass[(unsigned char)ts->p_prepunctuationsymbols[i]] |= TS_CHARCLASS_PREPUNCT; |
| for (i=0; ts->p_postpunctuationsymbols[i]; i++) |
| ts->charclass[(unsigned char)ts->p_postpunctuationsymbols[i]]|=TS_CHARCLASS_POSTPUNCT; |
| return; |
| } |
| |
| void set_charclasses(cst_tokenstream *ts, |
| const cst_string *whitespace, |
| const cst_string *singlecharsymbols, |
| const cst_string *prepunctuation, |
| const cst_string *postpunctuation) |
| { |
| ts->p_whitespacesymbols = |
| (whitespace ? whitespace : cst_ts_default_whitespacesymbols); |
| ts->p_singlecharsymbols = |
| (singlecharsymbols ? singlecharsymbols : cst_ts_default_singlecharsymbols); |
| ts->p_prepunctuationsymbols = |
| (prepunctuation ? prepunctuation : cst_ts_default_prepunctuationsymbols); |
| ts->p_postpunctuationsymbols = |
| (postpunctuation ? postpunctuation : cst_ts_default_postpunctuationsymbols); |
| |
| set_charclass_table(ts); |
| return; |
| } |
| |
| static void extend_buffer(cst_string **buffer,int *buffer_max) |
| { |
| int new_max; |
| cst_string *new_buffer; |
| |
| new_max = (*buffer_max)+(*buffer_max)/5; |
| new_buffer = cst_alloc(cst_string,new_max); |
| memmove(new_buffer,*buffer,*buffer_max); |
| cst_free(*buffer); |
| *buffer = new_buffer; |
| *buffer_max = new_max; |
| } |
| |
| static cst_tokenstream *new_tokenstream(const cst_string *whitespace, |
| const cst_string *singlechars, |
| const cst_string *prepunct, |
| const cst_string *postpunct) |
| { /* Constructor function */ |
| cst_tokenstream *ts = cst_alloc(cst_tokenstream,1); |
| ts->fd = NULL; |
| ts->file_pos = 0; |
| ts->line_number = 0; |
| ts->eof_flag = 0; |
| ts->string_buffer = NULL; |
| ts->token_pos = 0; |
| ts->whitespace = cst_alloc(cst_string,TS_BUFFER_SIZE); |
| ts->ws_max = TS_BUFFER_SIZE; |
| if (prepunct && prepunct[0]) |
| { |
| ts->prepunctuation = cst_alloc(cst_string,TS_BUFFER_SIZE); |
| ts->prep_max = TS_BUFFER_SIZE; |
| } |
| ts->token = cst_alloc(cst_string,TS_BUFFER_SIZE); |
| ts->token_max = TS_BUFFER_SIZE; |
| if (postpunct && postpunct[0]) |
| { |
| ts->postpunctuation = cst_alloc(cst_string,TS_BUFFER_SIZE); |
| ts->postp_max = TS_BUFFER_SIZE; |
| } |
| |
| set_charclasses(ts,whitespace,singlechars,prepunct,postpunct); |
| ts->current_char = 0; |
| |
| return ts; |
| } |
| |
| void delete_tokenstream(cst_tokenstream *ts) |
| { |
| cst_free(ts->whitespace); |
| cst_free(ts->token); |
| if (ts->tags) delete_features(ts->tags); |
| if (ts->prepunctuation) cst_free(ts->prepunctuation); |
| if (ts->postpunctuation) cst_free(ts->postpunctuation); |
| cst_free(ts); |
| } |
| |
| cst_tokenstream *ts_open(const char *filename, |
| const cst_string *whitespace, |
| const cst_string *singlechars, |
| const cst_string *prepunct, |
| const cst_string *postpunct) |
| { |
| cst_tokenstream *ts = new_tokenstream(whitespace, |
| singlechars, |
| prepunct, |
| postpunct); |
| |
| #ifndef UNDER_CE |
| if (cst_streq("-",filename)) |
| ts->fd = stdin; |
| else |
| #endif |
| ts->fd = cst_fopen(filename,CST_OPEN_READ|CST_OPEN_BINARY); |
| ts_getc(ts); |
| |
| if (ts->fd == NULL) |
| { |
| delete_tokenstream(ts); |
| return NULL; |
| } |
| else |
| return ts; |
| } |
| |
| cst_tokenstream *ts_open_string(const cst_string *string, |
| const cst_string *whitespace, |
| const cst_string *singlechars, |
| const cst_string *prepunct, |
| const cst_string *postpunct) |
| { |
| cst_tokenstream *ts = new_tokenstream(whitespace, |
| singlechars, |
| prepunct, |
| postpunct); |
| |
| ts->string_buffer = cst_strdup(string); |
| ts_getc(ts); |
| |
| return ts; |
| } |
| |
| cst_tokenstream *ts_open_generic(const char *filename, |
| const cst_string *whitespacesymbols, |
| const cst_string *singlecharsymbols, |
| const cst_string *prepunctsymbols, |
| const cst_string *postpunctsymbols, |
| void *streamtype_data, |
| int (*open)(cst_tokenstream *ts, |
| const char *filename), |
| void (*close)(cst_tokenstream *ts), |
| int (*eof)(cst_tokenstream *ts), |
| int (*seek)(cst_tokenstream *ts, int pos), |
| int (*tell)(cst_tokenstream *ts), |
| int (*size)(cst_tokenstream *ts), |
| int (*getc)(cst_tokenstream *ts)) |
| { /* Its a generic token stream where user has specified the low level */ |
| /* file/stream access functions */ |
| cst_tokenstream *ts = new_tokenstream(whitespacesymbols, |
| singlecharsymbols, |
| prepunctsymbols, |
| postpunctsymbols); |
| |
| ts->streamtype_data = streamtype_data; |
| ts->open = open; |
| ts->close = close; |
| ts->eof = eof; |
| ts->seek = seek; |
| ts->tell = tell; |
| ts->size = size; |
| ts->getc = getc; |
| |
| if ((ts->open)(ts,filename) != 0) |
| { |
| (ts->getc)(ts); |
| return ts; |
| } |
| else |
| { |
| delete_tokenstream(ts); |
| return NULL; |
| } |
| } |
| |
| void ts_close(cst_tokenstream *ts) |
| { |
| if (ts->fd != NULL) |
| { |
| #ifndef UNDER_CE |
| if (ts->fd != stdin) |
| #endif |
| cst_fclose(ts->fd); |
| ts->fd = NULL; /* just in case close gets called twice */ |
| } |
| if (ts->string_buffer != NULL) |
| { |
| cst_free(ts->string_buffer); |
| ts->string_buffer = NULL; |
| } |
| if (ts->open) |
| (ts->close)(ts); |
| delete_tokenstream(ts); |
| } |
| |
| static void get_token_sub_part(cst_tokenstream *ts, |
| int charclass, |
| cst_string **buffer, |
| int *buffer_max) |
| { |
| int p; |
| |
| for (p=0; ((!ts_eof(ts)) && |
| (ts_charclass(ts->current_char,charclass,ts)) && |
| (!ts_charclass(ts->current_char, |
| TS_CHARCLASS_SINGLECHAR,ts))); p++) |
| { |
| if (p+1 >= *buffer_max) extend_buffer(buffer,buffer_max); |
| (*buffer)[p] = ts->current_char; |
| ts_getc(ts); |
| } |
| (*buffer)[p] = '\0'; |
| } |
| |
| int ts_utf8_sequence_length(char c0) |
| { |
| // Get the expected length of UTF8 sequence given its most |
| // significant byte |
| return (( 0xE5000000 >> (( c0 >> 3 ) & 0x1E )) & 3 ) + 1; |
| } |
| |
| /* Can't afford dynamically generate this char class so have separated func */ |
| /* so do the core token part -- this goes while not givenlass (while the */ |
| /* above function oes while is givenclass */ |
| static void get_token_sub_part_2(cst_tokenstream *ts, |
| int endclass1, |
| cst_string **buffer, |
| int *buffer_max) |
| { |
| int p; |
| |
| for (p=0; ((!ts_eof(ts)) && |
| (!ts_charclass(ts->current_char,endclass1,ts)) && |
| (!ts_charclass(ts->current_char, |
| TS_CHARCLASS_SINGLECHAR,ts))); |
| p++) |
| { |
| if (p+1 >= *buffer_max) extend_buffer(buffer,buffer_max); |
| (*buffer)[p] = ts->current_char; |
| /* If someone sets tags we end the token */ |
| /* This can't happen in standard tokenstreams, but can in user */ |
| /* defined ones */ |
| if (ts->tags) break; |
| |
| /* In the special utf8 char by char mode we end at end of a utf8 char */ |
| if ((ts->utf8_explode_mode) && |
| (p == ts_utf8_sequence_length((*buffer)[0]))) |
| break; |
| |
| ts_getc(ts); |
| } |
| (*buffer)[p] = '\0'; |
| } |
| |
| static void get_token_postpunctuation(cst_tokenstream *ts) |
| { |
| int p,t; |
| |
| t = cst_strlen(ts->token); |
| for (p=t; |
| (p > 0) && |
| ((ts->token[p] == '\0') || |
| (ts_charclass(ts->token[p],TS_CHARCLASS_POSTPUNCT,ts))); |
| p--); |
| |
| if (t != p) |
| { |
| if (t-p >= ts->postp_max) |
| extend_buffer(&ts->postpunctuation,&ts->postp_max); |
| /* Copy postpunctuation from token */ |
| memmove(ts->postpunctuation,&ts->token[p+1],(t-p)); |
| /* truncate token at postpunctuation */ |
| ts->token[p+1] = '\0'; |
| } |
| } |
| |
| int ts_eof(cst_tokenstream *ts) |
| { |
| if (ts->eof_flag) |
| return TRUE; |
| else |
| return FALSE; |
| } |
| |
| int ts_set_stream_pos(cst_tokenstream *ts, int pos) |
| { |
| /* Note this doesn't preserve line_pos */ |
| int new_pos, l; |
| |
| if (ts->fd) |
| { |
| new_pos = (int)cst_fseek(ts->fd,(long)pos,CST_SEEK_ABSOLUTE); |
| if (new_pos == pos) |
| ts->eof_flag = FALSE; |
| } |
| else if (ts->string_buffer) |
| { |
| l = cst_strlen(ts->string_buffer); |
| if (pos > l) |
| new_pos = l; |
| else if (pos < 0) |
| new_pos = 0; |
| else |
| new_pos = pos; |
| ts->eof_flag = FALSE; |
| } |
| else if (ts->open) |
| { |
| new_pos = (ts->seek)(ts,pos); |
| if (new_pos == pos) |
| ts->eof_flag = FALSE; |
| } |
| else |
| new_pos = pos; /* not sure it can get here */ |
| ts->file_pos = new_pos; |
| ts->current_char = ' '; /* To be safe (but this is wrong) */ |
| |
| return ts->file_pos; |
| } |
| |
| int ts_get_stream_pos(cst_tokenstream *ts) |
| { |
| if (ts->open) |
| return (ts->tell)(ts); |
| else |
| return ts->file_pos; |
| } |
| |
| int ts_get_stream_size(cst_tokenstream *ts) |
| { |
| int current_pos, end_pos; |
| if (ts->fd) |
| { |
| current_pos = ts->file_pos; |
| end_pos = (int)cst_fseek(ts->fd,(long)0,CST_SEEK_ENDREL); |
| cst_fseek(ts->fd,(long)current_pos,CST_SEEK_ABSOLUTE); |
| return end_pos; |
| } else if (ts->string_buffer) |
| return cst_strlen(ts->string_buffer); |
| else if (ts->open) |
| return (ts->size)(ts); |
| else |
| return 0; |
| } |
| |
| cst_string private_ts_getc(cst_tokenstream *ts) |
| { |
| return internal_ts_getc(ts); |
| } |
| |
| static cst_string ts_getc(cst_tokenstream *ts) |
| { |
| if (ts->open) |
| ts->current_char = (ts->getc)(ts); |
| else |
| ts->current_char = internal_ts_getc(ts); |
| return ts->current_char; |
| } |
| |
| static cst_string internal_ts_getc(cst_tokenstream *ts) |
| { |
| if (ts->fd) |
| { |
| ts->current_char = cst_fgetc(ts->fd); |
| if (ts->current_char == -1) |
| ts->eof_flag = TRUE; |
| } |
| else if (ts->string_buffer) |
| { |
| if (ts->string_buffer[ts->file_pos] == '\0') |
| { |
| ts->eof_flag = TRUE; |
| ts->current_char = '\0'; |
| } |
| else |
| ts->current_char = ts->string_buffer[ts->file_pos]; |
| } |
| |
| if (!ts_eof(ts)) |
| ts->file_pos++; |
| if (ts->current_char == '\n') |
| ts->line_number++; |
| return ts->current_char; |
| } |
| |
| const cst_string *ts_get_quoted_token(cst_tokenstream *ts, |
| char quote, |
| char escape) |
| { |
| /* for reading the next quoted token that starts with quote and |
| ends with quote, quote may appear only if preceded by escape */ |
| int p; |
| |
| /* Hmm can't change quotes within a ts */ |
| ts->charclass[(unsigned int)quote] |= TS_CHARCLASS_QUOTE; |
| ts->charclass[(unsigned int)escape] |= TS_CHARCLASS_QUOTE; |
| |
| /* skipping whitespace */ |
| get_token_sub_part(ts,TS_CHARCLASS_WHITESPACE, |
| &ts->whitespace, |
| &ts->ws_max); |
| ts->token_pos = ts->file_pos - 1; |
| |
| if (ts->current_char == quote) |
| { /* go until quote */ |
| ts_getc(ts); |
| for (p=0; ((!ts_eof(ts)) && |
| (ts->current_char != quote)); |
| p++) |
| { |
| if (p >= ts->token_max) |
| extend_buffer(&ts->token,&ts->token_max); |
| ts->token[p] = ts->current_char; |
| ts_getc(ts); |
| if (ts->current_char == escape) |
| { |
| ts_get(ts); |
| if (p >= ts->token_max) |
| extend_buffer(&ts->token,&ts->token_max); |
| ts->token[p] = ts->current_char; |
| ts_get(ts); |
| } |
| } |
| ts->token[p] = '\0'; |
| ts_getc(ts); |
| } |
| else /* its not quoted, like to be careful dont you */ |
| { /* treat is as standard token */ |
| /* Get prepunctuation */ |
| extend_buffer(&ts->prepunctuation,&ts->prep_max); |
| get_token_sub_part(ts,TS_CHARCLASS_PREPUNCT, |
| &ts->prepunctuation, |
| &ts->prep_max); |
| /* Get the symbol itself */ |
| if (ts_charclass(ts->current_char,TS_CHARCLASS_SINGLECHAR,ts)) |
| { |
| if (2 >= ts->token_max) extend_buffer(&ts->token,&ts->token_max); |
| ts->token[0] = ts->current_char; |
| ts->token[1] = '\0'; |
| ts_getc(ts); |
| } |
| else |
| get_token_sub_part_2(ts, |
| TS_CHARCLASS_WHITESPACE, /* end class1 */ |
| &ts->token, |
| &ts->token_max); |
| /* This'll have token *plus* post punctuation in ts->token */ |
| /* Get postpunctuation */ |
| get_token_postpunctuation(ts); |
| } |
| |
| return ts->token; |
| } |
| |
| const cst_string *ts_get(cst_tokenstream *ts) |
| { |
| /* Get next token */ |
| |
| if (ts->tags) |
| { /* Someone didn't delete them before -- so we delete them now */ |
| delete_features(ts->tags); |
| ts->tags = NULL; |
| } |
| |
| /* Skip whitespace */ |
| get_token_sub_part(ts, |
| TS_CHARCLASS_WHITESPACE, |
| &ts->whitespace, |
| &ts->ws_max); |
| |
| /* quoted strings currently ignored */ |
| ts->token_pos = ts->file_pos - 1; |
| |
| /* Get prepunctuation */ |
| if (!ts_eof(ts) && |
| ts_charclass(ts->current_char,TS_CHARCLASS_PREPUNCT,ts)) |
| get_token_sub_part(ts, |
| TS_CHARCLASS_PREPUNCT, |
| &ts->prepunctuation, |
| &ts->prep_max); |
| else if (ts->prepunctuation) |
| ts->prepunctuation[0] = '\0'; |
| /* Get the symbol itself */ |
| if (!ts_eof(ts) && |
| ts_charclass(ts->current_char,TS_CHARCLASS_SINGLECHAR,ts)) |
| { |
| if (2 >= ts->token_max) extend_buffer(&ts->token,&ts->token_max); |
| ts->token[0] = ts->current_char; |
| ts->token[1] = '\0'; |
| ts_getc(ts); |
| } |
| else |
| get_token_sub_part_2(ts, |
| TS_CHARCLASS_WHITESPACE, /* end class1 */ |
| &ts->token, |
| &ts->token_max); |
| /* This'll have token *plus* post punctuation in ts->token */ |
| /* Get postpunctuation */ |
| if (ts->postpunctuation) |
| ts->postpunctuation[0] = '\0'; |
| if (ts->p_postpunctuationsymbols[0]) |
| get_token_postpunctuation(ts); |
| |
| return ts->token; |
| } |
| |
| int ts_read(void *buff, int size, int num, cst_tokenstream *ts) |
| { |
| /* people should complain about the speed here */ |
| /* people will complain about EOF as end of file */ |
| int i,j,p; |
| cst_string *cbuff; |
| |
| cbuff = (cst_string *)buff; |
| |
| for (p=i=0; i < num; i++) |
| for (j=0; j < size; j++,p++) |
| cbuff[p] = ts_getc(ts); |
| |
| return i; |
| } |