blob: 2c643a59ea8bbd847538416b0a50453dc6bcbd60 [file] [log] [blame]
/*************************************************************************/
/* */
/* Language Technologies Institute */
/* Carnegie Mellon University */
/* Copyright (c) 1999 */
/* All Rights Reserved. */
/* */
/* Permission is hereby granted, free of charge, to use and distribute */
/* this software and its documentation without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of this work, and to */
/* permit persons to whom this work is furnished to do so, subject to */
/* the following conditions: */
/* 1. The code must retain the above copyright notice, this list of */
/* conditions and the following disclaimer. */
/* 2. Any modifications must be clearly marked as such. */
/* 3. Original authors' names are not deleted. */
/* 4. The authors' names are not used to endorse or promote products */
/* derived from this software without specific prior written */
/* permission. */
/* */
/* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
/* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
/* THIS SOFTWARE. */
/* */
/*************************************************************************/
/* Author: Alan W Black (awb@cs.cmu.edu) */
/* Date: July 1999 */
/*************************************************************************/
/* */
/* Tokenizer for strings and files */
/* */
/*************************************************************************/
#ifndef _CST_TOKENSTREAM_H__
#define _CST_TOKENSTREAM_H__
#include "cst_alloc.h"
#include "cst_string.h"
#include "cst_file.h"
#include "cst_features.h"
/* Tokenizer state shared by the file, string and externally driven */
/* stream readers declared in this header. */
/* NOTE(review): field roles marked "presumably" are inferred from names */
/* and types only -- confirm against the implementation. */
typedef struct cst_tokenstream_struct {
/* Underlying file handle; presumably unused when reading from a string */
cst_file fd;
/* Presumably the current read offset, line count and end-of-input flag */
int file_pos;
int line_number;
int eof_flag;
/* Presumably the source text when the stream is opened on a string */
cst_string *string_buffer;
/* Presumably the most recently read character and the stream position */
/* of the current token */
int current_char;
int token_pos;
/* Buffers for the parts of the current token; each *_max presumably */
/* records the allocated size of the buffer declared directly after it */
int ws_max;
cst_string *whitespace;
int prep_max;
cst_string *prepunctuation;
int token_max;
cst_string *token;
int postp_max;
cst_string *postpunctuation;
cst_features *tags; /* e.g. xml tags */
/* If set, token boundaries are found at every utf8 character */
int utf8_explode_mode;
/* Opaque pointer for externally specified stream types; presumably the */
/* value handed to ts_open_generic() */
void *streamtype_data;
/* Symbol sets for each character class. These should only be set */
/* through set_charclasses(), because the charclass table below needs */
/* to be updated whenever they are reset */
const cst_string *p_whitespacesymbols;
const cst_string *p_singlecharsymbols;
const cst_string *p_prepunctuationsymbols;
const cst_string *p_postpunctuationsymbols;
/* Per-character class table: indexed by unsigned char value and tested */
/* against the TS_CHARCLASS_* bit masks by the ts_charclass() macro */
cst_string charclass[256];
/* To allow externally specified reading functions e.g. epub/xml. */
/* The signatures mirror the callback parameters of ts_open_generic() */
int (*open)(struct cst_tokenstream_struct *ts, const char *filename);
void (*close)(struct cst_tokenstream_struct *ts);
int (*eof)(struct cst_tokenstream_struct *ts);
int (*seek)(struct cst_tokenstream_struct *ts, int pos);
int (*tell)(struct cst_tokenstream_struct *ts);
int (*size)(struct cst_tokenstream_struct *ts);
int (*getc)(struct cst_tokenstream_struct *ts);
} cst_tokenstream;
/* Character class bit flags held in the charclass[] table of a */
/* token stream. Each class is a distinct bit, so several classes can */
/* be OR-ed together into one mask. */
#define TS_CHARCLASS_NONE 0
#define TS_CHARCLASS_WHITESPACE 2
#define TS_CHARCLASS_SINGLECHAR 4
#define TS_CHARCLASS_PREPUNCT 8
#define TS_CHARCLASS_POSTPUNCT 16
#define TS_CHARCLASS_QUOTE 32
/* Non-zero when character C is in any class of the bit mask CLASS for */
/* token stream TS (a pointer to a struct with a charclass[] member). */
/* C is converted to unsigned char so that negative char values cannot */
/* index before the start of the table. */
/* Every argument is parenthesized so that expression arguments such as */
/* `c + 1` or `A | B` are evaluated as a unit: without the parentheses */
/* `x & A | B` parses as `(x & A) | B`, which is always non-zero. */
#define ts_charclass(C,CLASS,TS) ((TS)->charclass[(unsigned char)(C)] & (CLASS))
/* Default symbol sets, one per character class; defined outside this */
/* header. Both the pointers and the strings they point to are const. */
/* NOTE(review): presumably intended as the symbol-set arguments of the */
/* ts_open*() functions and set_charclasses() -- confirm with callers. */
extern const cst_string * const cst_ts_default_whitespacesymbols;
extern const cst_string * const cst_ts_default_prepunctuationsymbols;
extern const cst_string * const cst_ts_default_postpunctuationsymbols;
extern const cst_string * const cst_ts_default_singlecharsymbols;
/* Public functions for tokenstream manipulation */

/* Create a token stream for `filename`, with one symbol set per */
/* character class. */
/* NOTE(review): presumably reads from the named file; the failure */
/* return value and whether the symbol strings are copied or borrowed */
/* are not visible in this header -- confirm in the implementation. */
cst_tokenstream *ts_open(const char *filename,
const cst_string *whitespacesymbols,
const cst_string *singlecharsymbols,
const cst_string *prepunctsymbols,
const cst_string *postpunctsymbols);
/* Create a token stream over `string`, with one symbol set per */
/* character class. */
/* NOTE(review): whether `string` is copied or must outlive the stream */
/* is not visible in this header -- confirm in the implementation. */
cst_tokenstream *ts_open_string(const cst_string *string,
const cst_string *whitespacesymbols,
const cst_string *singlecharsymbols,
const cst_string *prepunctsymbols,
const cst_string *postpunctsymbols);
/* Create a token stream whose reading is done by caller-supplied */
/* functions. Takes the same filename and symbol-set arguments as */
/* ts_open(), plus an opaque `streamtype_data` pointer and one callback */
/* per stream operation (open, close, eof, seek, tell, size, getc). */
/* NOTE(review): presumably the callbacks and `streamtype_data` are */
/* stored in the matching cst_tokenstream fields; the expected callback */
/* return conventions are not visible in this header -- confirm. */
cst_tokenstream *ts_open_generic(const char *filename,
const cst_string *whitespacesymbols,
const cst_string *singlecharsymbols,
const cst_string *prepunctsymbols,
const cst_string *postpunctsymbols,
void *streamtype_data,
int (*open)(cst_tokenstream *ts,
const char *filename),
void (*close)(cst_tokenstream *ts),
int (*eof)(cst_tokenstream *ts),
int (*seek)(cst_tokenstream *ts, int pos),
int (*tell)(cst_tokenstream *ts),
int (*size)(cst_tokenstream *ts),
int (*getc)(cst_tokenstream *ts));
/* Close the token stream `ts`. */
/* NOTE(review): presumably also frees `ts` and its buffers -- confirm. */
void ts_close(cst_tokenstream *ts);
/* Report whether `ts` has reached end of input. */
/* NOTE(review): presumably non-zero at end of input -- confirm. */
int ts_eof(cst_tokenstream *ts);
/* Read a token from `ts`. The result is const and must not be modified */
/* by the caller. */
/* NOTE(review): presumably points into the stream's own token buffer */
/* and is only valid until the next read -- confirm. */
const cst_string *ts_get(cst_tokenstream *ts);
/* Read a token delimited by the `quote` character, with `escape` as */
/* the escape character. */
/* NOTE(review): exact quoting rules are not visible in this header. */
const cst_string *ts_get_quoted_token(cst_tokenstream *ts,
char quote,
char escape);
/* Externally specified ts interfaces may need this. */
/* NOTE(review): presumably reads one character from `ts`; the return */
/* type is cst_string rather than int, so how end of input is signalled */
/* cannot be told from this header -- confirm in the implementation. */
cst_string private_ts_getc(cst_tokenstream *ts);
/* Set the symbol set for each character class on `ts`. This is the */
/* only supported way to change the stream's symbol sets, because the */
/* stream's charclass table has to be updated whenever they change. */
/* NOTE(review): whether the strings are copied or borrowed, and how a */
/* NULL argument is treated, are not visible in this header -- confirm. */
void set_charclasses(cst_tokenstream *ts,
const cst_string *whitespace,
const cst_string *singlecharsymbols,
const cst_string *prepunctuation,
const cst_string *postpunctuation);
/* Raw read from `ts` into `buff`. */
/* NOTE(review): the (buff, size, num, stream) parameter order resembles */
/* fread(), so presumably reads `num` items of `size` bytes and returns */
/* the number of items read -- confirm in the implementation. */
int ts_read(void *buff, int size, int num, cst_tokenstream *ts);
/* Stream position and size accessors. */
/* NOTE(review): presumably these wrap the seek/tell/size operations of */
/* the stream; units and error return values are not visible here. */
int ts_set_stream_pos(cst_tokenstream *ts,int pos);
int ts_get_stream_pos(cst_tokenstream *ts);
int ts_get_stream_size(cst_tokenstream *ts);
#endif