/* include/cst_tokenstream.h - third_party/flite - Git at Google */

 /*************************************************************************/
 /*                                                                       */
 /*                  Language Technologies Institute                      */
 /*                     Carnegie Mellon University                        */
 /*                        Copyright (c) 1999                             */
 /*                        All Rights Reserved.                           */
 /*                                                                       */
 /*  Permission is hereby granted, free of charge, to use and distribute  */
 /*  this software and its documentation without restriction, including   */
 /*  without limitation the rights to use, copy, modify, merge, publish,  */
 /*  distribute, sublicense, and/or sell copies of this work, and to      */
 /*  permit persons to whom this work is furnished to do so, subject to   */
 /*  the following conditions:                                            */
 /*   1. The code must retain the above copyright notice, this list of    */
 /*      conditions and the following disclaimer.                         */
 /*   2. Any modifications must be clearly marked as such.                */
 /*   3. Original authors' names are not deleted.                         */
 /*   4. The authors' names are not used to endorse or promote products   */
 /*      derived from this software without specific prior written        */
 /*      permission.                                                      */
 /*                                                                       */
 /*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
 /*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
 /*  THIS SOFTWARE.                                                       */
 /*                                                                       */
 /*************************************************************************/
 /*             Author:  Alan W Black (awb@cs.cmu.edu)                    */
 /*               Date:  July 1999                                        */
 /*************************************************************************/
 /*                                                                       */
 /*  Tokenizer for strings and files                                      */
 /*                                                                       */
 /*************************************************************************/
 #ifndef _CST_TOKENSTREAM_H__
 #define _CST_TOKENSTREAM_H__

 #include "cst_alloc.h"
 #include "cst_string.h"
 #include "cst_file.h"
 #include "cst_features.h"

 /* Tokenizer state used by the ts_* functions declared in this header.   */
 /* It groups the input source, the text buffers for the parts of a       */
 /* token, the character-class configuration and the reader callbacks.    */
 /* Remarks marked "presumably" are inferred from member names only;      */
 /* confirm them against the implementation.                              */
 typedef struct  cst_tokenstream_struct {
     /* Input source and read state (presumably fd serves file input and  */
     /* string_buffer serves string input).                               */
     cst_file fd;
     int file_pos;
     int line_number;
     int eof_flag;
     cst_string *string_buffer;

     /* int, matching the return type of the getc callback below */
     int current_char;

     /* Text buffers for the parts of a token.  Each buffer is paired     */
     /* with an int *_max member, presumably its allocated capacity.      */
     int token_pos;
     int ws_max;
     cst_string *whitespace;
     int prep_max;
     cst_string *prepunctuation;
     int token_max;
     cst_string *token;
     int postp_max;
     cst_string *postpunctuation;

     cst_features *tags;  /* e.g xml tags */
     /* if set will find token boundaries at every utf8 character */
     int utf8_explode_mode;

     /* Opaque pointer; ts_open_generic() takes a parameter of the same   */
     /* name and type.                                                    */
     void *streamtype_data;

     /* Should only be set through set_charclasses as charclass table needs */
     /* to be updated when you reset these                                  */
     const cst_string *p_whitespacesymbols;
     const cst_string *p_singlecharsymbols;
     const cst_string *p_prepunctuationsymbols;
     const cst_string *p_postpunctuationsymbols;

     /* Per-character TS_CHARCLASS_* flags.  Read through ts_charclass(), */
     /* which indexes this table with an unsigned char value.             */
     cst_string charclass[256];

     /* To allow externally specified reading functions e.g. epub/xml */
     /* ts_open_generic() accepts callbacks with these same names and     */
     /* signatures.                                                       */
     int (*open)(struct cst_tokenstream_struct *ts, const char *filename);
     void (*close)(struct cst_tokenstream_struct *ts);
     int (*eof)(struct cst_tokenstream_struct *ts);
     int (*seek)(struct cst_tokenstream_struct *ts, int pos);
     int (*tell)(struct cst_tokenstream_struct *ts);
     int (*size)(struct cst_tokenstream_struct *ts);
     int (*getc)(struct cst_tokenstream_struct *ts);
 } cst_tokenstream;

 /* Character-class flags, tested against cst_tokenstream.charclass via   */
 /* ts_charclass().  Every non-zero value is a distinct power of two, so  */
 /* the flags can be combined and masked with bitwise operators.          */
 /* NOTE(review): the value 1 is not assigned here -- confirm whether it  */
 /* is reserved or simply unused.                                         */
 #define TS_CHARCLASS_NONE        0
 #define TS_CHARCLASS_WHITESPACE  2
 #define TS_CHARCLASS_SINGLECHAR  4
 #define TS_CHARCLASS_PREPUNCT    8
 #define TS_CHARCLASS_POSTPUNCT  16
 #define TS_CHARCLASS_QUOTE      32

 /* Test whether character C carries the class bit(s) CLASS in the        */
 /* charclass table of TS (a pointer to a cst_tokenstream).               */
 /* Evaluates to the matching bits: non-zero if any requested bit is set, */
 /* 0 otherwise.  C is converted to unsigned char before indexing, so a   */
 /* negative char value cannot index before the start of the table.       */
 /* Every argument is parenthesized so that expressions such as `A | B`   */
 /* or `c + 1` expand with the grouping the caller wrote.                 */
 #define ts_charclass(C,CLASS,TS) ((TS)->charclass[(unsigned char)(C)] & (CLASS))

 /* Default character-class symbol sets, defined elsewhere.  Both the     */
 /* pointers and the strings they point to are const.  Their type matches */
 /* the symbol-set parameters of ts_open(), ts_open_string(),             */
 /* ts_open_generic() and set_charclasses().                              */
 extern const cst_string * const cst_ts_default_whitespacesymbols;
 extern const cst_string * const cst_ts_default_prepunctuationsymbols;
 extern const cst_string * const cst_ts_default_postpunctuationsymbols;
 extern const cst_string * const cst_ts_default_singlecharsymbols;

 /* Public functions for tokenstream manipulation */
 /* Create a token stream for the file named `filename`.                  */
 /* The four symbol-set arguments have the same names' roles and types as */
 /* the arguments of set_charclasses().                                   */
 /* Returns a cst_tokenstream pointer.  Presumably NULL on failure and    */
 /* released with ts_close() -- confirm against the implementation.       */
 cst_tokenstream *ts_open(const char *filename,
 			 const cst_string *whitespacesymbols,
 			 const cst_string *singlecharsymbols,
 			 const cst_string *prepunctsymbols,
 			 const cst_string *postpunctsymbols);
 /* Create a token stream over the in-memory text `string`; the symbol-   */
 /* set arguments are the same as for ts_open().                          */
 /* NOTE(review): whether `string` is copied or must outlive the stream   */
 /* is not visible from this header -- confirm against the implementation.*/
 cst_tokenstream *ts_open_string(const cst_string *string,
 				const cst_string *whitespacesymbols,
 				const cst_string *singlecharsymbols,
 				const cst_string *prepunctsymbols,
 				const cst_string *postpunctsymbols);
 /* Create a token stream whose input is read through caller-supplied     */
 /* callbacks.  `streamtype_data` and the seven callbacks have the same   */
 /* names and signatures as the corresponding members of cst_tokenstream; */
 /* presumably they are stored there and `open` is invoked with           */
 /* `filename` -- confirm against the implementation.                     */
 /* The symbol-set arguments are the same as for ts_open().               */
 cst_tokenstream *ts_open_generic(const char *filename,
                                  const cst_string *whitespacesymbols,
                                  const cst_string *singlecharsymbols,
                                  const cst_string *prepunctsymbols,
                                  const cst_string *postpunctsymbols,
                                  void *streamtype_data,
                                  int (*open)(cst_tokenstream *ts,
                                              const char *filename),
                                  void (*close)(cst_tokenstream *ts),
                                  int (*eof)(cst_tokenstream *ts),
                                  int (*seek)(cst_tokenstream *ts, int pos),
                                  int (*tell)(cst_tokenstream *ts),
                                  int (*size)(cst_tokenstream *ts),
                                  int (*getc)(cst_tokenstream *ts));
 /* Close the token stream `ts`.  Presumably also frees it, so `ts` must  */
 /* not be used afterwards -- confirm against the implementation.         */
 void ts_close(cst_tokenstream *ts);

 /* End-of-input test for `ts`; presumably non-zero once the input is     */
 /* exhausted -- confirm against the implementation.                      */
 int ts_eof(cst_tokenstream *ts);
 /* Fetch a token from `ts`.  The result is a pointer to const text;      */
 /* presumably it points into storage owned by the stream and is only     */
 /* valid until the next read -- confirm against the implementation.      */
 const cst_string *ts_get(cst_tokenstream *ts);

 /* Fetch a quoted token from `ts`.  `quote` is the delimiter character   */
 /* and `escape` the escape character; presumably `escape` allows a       */
 /* literal `quote` inside the token -- confirm against the               */
 /* implementation.  The result is a pointer to const text.               */
 const cst_string *ts_get_quoted_token(cst_tokenstream *ts,
 				char quote,
 				char escape);
 /* Externally specified ts interfaces may need this */
 /* NOTE(review): the return type is cst_string, whereas the getc         */
 /* callback in cst_tokenstream returns int -- confirm how end of input   */
 /* is reported to callers of this function.                              */
 cst_string private_ts_getc(cst_tokenstream *ts);


 /* Set the four character-class symbol sets of `ts`.  Per the note in    */
 /* cst_tokenstream, the p_*symbols members should only be changed        */
 /* through this function because the charclass table has to be updated   */
 /* whenever they are reset.                                              */
 void set_charclasses(cst_tokenstream *ts,
 		     const cst_string *whitespace,
 		     const cst_string *singlecharsymbols,
 		     const cst_string *prepunctuation,
 		     const cst_string *postpunctuation);

 /* Read raw data from `ts` into `buff`.  The parameter order resembles   */
 /* fread(ptr, size, nmemb, stream); presumably `num` items of `size`     */
 /* bytes are requested and the count actually read is returned --        */
 /* confirm against the implementation.                                   */
 int ts_read(void *buff, int size, int num, cst_tokenstream *ts);

 /* Stream position and size accessors.  Their signatures parallel the    */
 /* seek, tell and size callbacks of cst_tokenstream; presumably each     */
 /* forwards to the matching callback -- confirm against the              */
 /* implementation.  Positions and sizes are plain int.                   */
 int ts_set_stream_pos(cst_tokenstream *ts,int pos);
 int ts_get_stream_pos(cst_tokenstream *ts);
 int ts_get_stream_size(cst_tokenstream *ts);

 #endif
	/*************************************************************************/
	/* */
	/* Language Technologies Institute */
	/* Carnegie Mellon University */
	/* Copyright (c) 1999 */
	/* All Rights Reserved. */
	/* */
	/* Permission is hereby granted, free of charge, to use and distribute */
	/* this software and its documentation without restriction, including */
	/* without limitation the rights to use, copy, modify, merge, publish, */
	/* distribute, sublicense, and/or sell copies of this work, and to */
	/* permit persons to whom this work is furnished to do so, subject to */
	/* the following conditions: */
	/* 1. The code must retain the above copyright notice, this list of */
	/* conditions and the following disclaimer. */
	/* 2. Any modifications must be clearly marked as such. */
	/* 3. Original authors' names are not deleted. */
	/* 4. The authors' names are not used to endorse or promote products */
	/* derived from this software without specific prior written */
	/* permission. */
	/* */
	/* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
	/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
	/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
	/* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
	/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
	/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
	/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
	/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
	/* THIS SOFTWARE. */
	/* */
	/*************************************************************************/
	/* Author: Alan W Black (awb@cs.cmu.edu) */
	/* Date: July 1999 */
	/*************************************************************************/
	/* */
	/* Tokenizer for strings and files */
	/* */
	/*************************************************************************/
	#ifndef _CST_TOKENSTREAM_H__
	#define _CST_TOKENSTREAM_H__

	#include "cst_alloc.h"
	#include "cst_string.h"
	#include "cst_file.h"
	#include "cst_features.h"

	/* Tokenizer state used by the ts_* functions declared in this header. */
	/* It groups the input source, the text buffers for the parts of a */
	/* token, the character-class configuration and the reader callbacks. */
	/* Remarks marked "presumably" are inferred from member names only; */
	/* confirm them against the implementation. */
	typedef struct cst_tokenstream_struct {
	    /* Input source and read state (presumably fd serves file input */
	    /* and string_buffer serves string input). */
	    cst_file fd;
	    int file_pos;
	    int line_number;
	    int eof_flag;
	    cst_string *string_buffer;

	    /* int, matching the return type of the getc callback below */
	    int current_char;

	    /* Text buffers for the parts of a token.  Each buffer is paired */
	    /* with an int *_max member, presumably its allocated capacity. */
	    int token_pos;
	    int ws_max;
	    cst_string *whitespace;
	    int prep_max;
	    cst_string *prepunctuation;
	    int token_max;
	    cst_string *token;
	    int postp_max;
	    cst_string *postpunctuation;

	    cst_features *tags;  /* e.g xml tags */
	    /* if set will find token boundaries at every utf8 character */
	    int utf8_explode_mode;

	    /* Opaque pointer; ts_open_generic() takes a parameter of the */
	    /* same name. */
	    void *streamtype_data;

	    /* Should only be set through set_charclasses as charclass table needs */
	    /* to be updated when you reset these */
	    const cst_string *p_whitespacesymbols;
	    const cst_string *p_singlecharsymbols;
	    const cst_string *p_prepunctuationsymbols;
	    const cst_string *p_postpunctuationsymbols;

	    /* Per-character TS_CHARCLASS_* flags.  Read through ts_charclass(), */
	    /* which indexes this table with an unsigned char value. */
	    cst_string charclass[256];

	    /* To allow externally specified reading functions e.g. epub/xml */
	    /* These must be pointers to functions taking a pointer to the */
	    /* stream: a struct cannot hold function-typed members, and the */
	    /* struct type is still incomplete at this point. */
	    int (*open)(struct cst_tokenstream_struct *ts, const char *filename);
	    void (*close)(struct cst_tokenstream_struct *ts);
	    int (*eof)(struct cst_tokenstream_struct *ts);
	    int (*seek)(struct cst_tokenstream_struct *ts, int pos);
	    int (*tell)(struct cst_tokenstream_struct *ts);
	    int (*size)(struct cst_tokenstream_struct *ts);
	    int (*getc)(struct cst_tokenstream_struct *ts);
	} cst_tokenstream;

	/* Character-class flags, tested against the charclass table of a */
	/* token stream via ts_charclass().  Every non-zero value is a */
	/* distinct power of two, so the flags can be combined and masked */
	/* with bitwise operators. */
	/* NOTE(review): the value 1 is not assigned here -- confirm whether */
	/* it is reserved or simply unused. */
	#define TS_CHARCLASS_NONE 0
	#define TS_CHARCLASS_WHITESPACE 2
	#define TS_CHARCLASS_SINGLECHAR 4
	#define TS_CHARCLASS_PREPUNCT 8
	#define TS_CHARCLASS_POSTPUNCT 16
	#define TS_CHARCLASS_QUOTE 32

	/* Test whether character C carries the class bit(s) CLASS in the */
	/* charclass table of TS (a pointer to a token stream). */
	/* Evaluates to the matching bits: non-zero if any requested bit is */
	/* set, 0 otherwise.  C is converted to unsigned char before indexing, */
	/* so a negative char value cannot index before the start of the table. */
	/* Every argument is parenthesized so that expressions such as `A | B` */
	/* or `c + 1` expand with the grouping the caller wrote. */
	#define ts_charclass(C,CLASS,TS) ((TS)->charclass[(unsigned char)(C)] & (CLASS))

	/* Default character-class symbol sets, defined elsewhere.  Both the */
	/* pointers and the strings they point to are const.  Their type */
	/* matches the symbol-set parameters of set_charclasses(). */
	extern const cst_string * const cst_ts_default_whitespacesymbols;
	extern const cst_string * const cst_ts_default_prepunctuationsymbols;
	extern const cst_string * const cst_ts_default_postpunctuationsymbols;
	extern const cst_string * const cst_ts_default_singlecharsymbols;

	/* Public functions for tokenstream manipulation */
	/* Create a token stream for the file named `filename`. */
	/* The four symbol-set arguments have the same types as the arguments */
	/* of set_charclasses(). */
	/* Returns a cst_tokenstream pointer.  Presumably NULL on failure and */
	/* released with ts_close() -- confirm against the implementation. */
	cst_tokenstream *ts_open(const char *filename,
	                         const cst_string *whitespacesymbols,
	                         const cst_string *singlecharsymbols,
	                         const cst_string *prepunctsymbols,
	                         const cst_string *postpunctsymbols);
	/* Create a token stream over the in-memory text `string`.  The four */
	/* symbol-set arguments have the same types as the arguments of */
	/* set_charclasses(). */
	/* NOTE(review): whether `string` is copied or must outlive the stream */
	/* is not visible from this header -- confirm against the implementation. */
	cst_tokenstream *ts_open_string(const cst_string *string,
	                                const cst_string *whitespacesymbols,
	                                const cst_string *singlecharsymbols,
	                                const cst_string *prepunctsymbols,
	                                const cst_string *postpunctsymbols);
	/* Create a token stream whose input is read through caller-supplied */
	/* callbacks.  `streamtype_data` and the seven callbacks carry the same */
	/* names as members of cst_tokenstream; presumably they are stored */
	/* there and `open` is invoked with `filename` -- confirm against the */
	/* implementation.  Each callback is a pointer to a function that */
	/* receives the stream by pointer. */
	/* The four symbol-set arguments have the same types as the arguments */
	/* of set_charclasses(). */
	cst_tokenstream *ts_open_generic(const char *filename,
	                                 const cst_string *whitespacesymbols,
	                                 const cst_string *singlecharsymbols,
	                                 const cst_string *prepunctsymbols,
	                                 const cst_string *postpunctsymbols,
	                                 void *streamtype_data,
	                                 int (*open)(cst_tokenstream *ts,
	                                             const char *filename),
	                                 void (*close)(cst_tokenstream *ts),
	                                 int (*eof)(cst_tokenstream *ts),
	                                 int (*seek)(cst_tokenstream *ts, int pos),
	                                 int (*tell)(cst_tokenstream *ts),
	                                 int (*size)(cst_tokenstream *ts),
	                                 int (*getc)(cst_tokenstream *ts));
	/* Close the token stream `ts`.  Presumably also frees it, so `ts` must */
	/* not be used afterwards -- confirm against the implementation. */
	void ts_close(cst_tokenstream *ts);

	/* End-of-input test for `ts`; presumably non-zero once the input is */
	/* exhausted -- confirm against the implementation. */
	int ts_eof(cst_tokenstream *ts);
	/* Fetch a token from `ts`.  The stream is passed by pointer and the */
	/* result is a pointer to const text; presumably it points into storage */
	/* owned by the stream and is only valid until the next read -- confirm */
	/* against the implementation. */
	const cst_string *ts_get(cst_tokenstream *ts);

	/* Fetch a quoted token from `ts`.  `quote` is the delimiter character */
	/* and `escape` the escape character; presumably `escape` allows a */
	/* literal `quote` inside the token -- confirm against the */
	/* implementation.  The stream is passed by pointer and the result is a */
	/* pointer to const text. */
	const cst_string *ts_get_quoted_token(cst_tokenstream *ts,
	                                      char quote,
	                                      char escape);
	/* Externally specified ts interfaces may need this */
	/* NOTE(review): the return type is cst_string rather than int -- */
	/* confirm how end of input is reported to callers of this function. */
	cst_string private_ts_getc(cst_tokenstream *ts);


	/* Set the four character-class symbol sets of `ts`.  Per the note in */
	/* the cst_tokenstream struct, the p_*symbols members should only be */
	/* changed through this function because the charclass table has to be */
	/* updated whenever they are reset. */
	void set_charclasses(cst_tokenstream *ts,
	const cst_string *whitespace,
	const cst_string *singlecharsymbols,
	const cst_string *prepunctuation,
	const cst_string *postpunctuation);

	/* Read raw data from `ts` into the caller's buffer `buff`.  `buff` is */
	/* a void pointer (a parameter of type void is not valid C) and the */
	/* stream is passed by pointer.  The parameter order resembles */
	/* fread(ptr, size, nmemb, stream); presumably `num` items of `size` */
	/* bytes are requested and the count actually read is returned -- */
	/* confirm against the implementation. */
	int ts_read(void *buff, int size, int num, cst_tokenstream *ts);

	/* Stream position and size accessors.  Their names parallel the seek, */
	/* tell and size callbacks of the token stream; presumably each */
	/* forwards to the matching callback -- confirm against the */
	/* implementation.  Positions and sizes are plain int. */
	int ts_set_stream_pos(cst_tokenstream *ts,int pos);
	int ts_get_stream_pos(cst_tokenstream *ts);
	int ts_get_stream_size(cst_tokenstream *ts);

	#endif