blob: c384d5e87d37732c01a7e06367d206fe52745067 [file] [log] [blame]
/*************************************************************************/
/* */
/* Language Technologies Institute */
/* Carnegie Mellon University */
/* Copyright (c) 2001 */
/* All Rights Reserved. */
/* */
/* Permission is hereby granted, free of charge, to use and distribute */
/* this software and its documentation without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of this work, and to */
/* permit persons to whom this work is furnished to do so, subject to */
/* the following conditions: */
/* 1. The code must retain the above copyright notice, this list of */
/* conditions and the following disclaimer. */
/* 2. Any modifications must be clearly marked as such. */
/* 3. Original authors' names are not deleted. */
/* 4. The authors' names are not used to endorse or promote products */
/* derived from this software without specific prior written */
/* permission. */
/* */
/* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
/* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
/* THIS SOFTWARE. */
/* */
/*************************************************************************/
/* Author: Alan W Black (awb@cs.cmu.edu) */
/* Date: January 2001 */
/*************************************************************************/
/* Feature functions used by various cart trees etc */
/* These have been create as needed, and as some of the trees are */
/* from University of Edinburgh's Festival system their names and */
/* semantics follow them */
/*************************************************************************/
#include "cst_hrg.h"
#include "cst_phoneset.h"
#include "cst_regex.h"
#include "cst_ffeatures.h"
#include "us_ffeatures.h"
static const cst_val *gpos(const cst_item *word);
DEF_STATIC_CONST_VAL_STRING(val_string_numeric,"numeric");
DEF_STATIC_CONST_VAL_STRING(val_string_number,"number");
DEF_STATIC_CONST_VAL_STRING(val_string_month,"month");
DEF_STATIC_CONST_VAL_STRING(val_string_day,"day");
DEF_STATIC_CONST_VAL_STRING(val_string_other,"_other_");
DEF_STATIC_CONST_VAL_STRING(val_string_a,"a");
DEF_STATIC_CONST_VAL_STRING(val_string_flight,"flight");
DEF_STATIC_CONST_VAL_STRING(val_string_to,"to");
DEF_STATIC_CONST_VAL_STRING(val_string_content,"content");
static const cst_val *gpos(const cst_item *word)
{
/* Guess at part of speech (function/content) */
const char *w;
int s,t;
w = item_feat_string(word,"name");
for (s=0; us_gpos[s]; s++)
{
for (t=1; us_gpos[s][t]; t++)
if (cst_streq(w,val_string(us_gpos[s][t])))
return us_gpos[s][0];
}
return (cst_val *)&val_string_content;
}
static const cst_val *num_digits(const cst_item *token)
{
const char *name = item_feat_string(token,"name");
return val_int_n(cst_strlen(name));
}
static const cst_val *month_range(const cst_item *token)
{
int v = item_feat_int(token,"name");
if ((v > 0) && ( v < 32))
return VAL_STRING_1;
else
return VAL_STRING_0;
}
static const cst_val *token_pos_guess(const cst_item *token)
{
const char *name = item_feat_string(token,"name");
char *dc = cst_downcase(name);
const cst_val *r;
if (cst_regex_match(cst_rx_digits,dc))
r = &val_string_numeric;
else if ((cst_regex_match(cst_rx_double,dc)) ||
(cst_regex_match(cst_rx_double,dc)))
r = &val_string_number;
else if (cst_streq(dc,"jan") ||
cst_streq(dc,"january") ||
cst_streq(dc,"feb") ||
cst_streq(dc,"february") ||
cst_streq(dc,"mar") ||
cst_streq(dc,"march") ||
cst_streq(dc,"apr") ||
cst_streq(dc,"april") ||
cst_streq(dc,"may") ||
cst_streq(dc,"jun") ||
cst_streq(dc,"june") ||
cst_streq(dc,"jul") ||
cst_streq(dc,"july") ||
cst_streq(dc,"aug") ||
cst_streq(dc,"august") ||
cst_streq(dc,"sep") ||
cst_streq(dc,"sept") ||
cst_streq(dc,"september") ||
cst_streq(dc,"oct") ||
cst_streq(dc,"october") ||
cst_streq(dc,"nov") ||
cst_streq(dc,"november") ||
cst_streq(dc,"dec") ||
cst_streq(dc,"december"))
r = &val_string_month;
else if (cst_streq(dc,"sun") ||
cst_streq(dc,"sunday") ||
cst_streq(dc,"mon") ||
cst_streq(dc,"monday") ||
cst_streq(dc,"tue") ||
cst_streq(dc,"tues") ||
cst_streq(dc,"tuesday") ||
cst_streq(dc,"wed") ||
cst_streq(dc,"wednesday") ||
cst_streq(dc,"thu") ||
cst_streq(dc,"thurs") ||
cst_streq(dc,"thursday") ||
cst_streq(dc,"fri") ||
cst_streq(dc,"friday") ||
cst_streq(dc,"sat") ||
cst_streq(dc,"saturday"))
r = &val_string_day;
/* ignoring the "token_most_common" condition, does get used */
else if (cst_streq(dc,"a"))
r = &val_string_a;
else if (cst_streq(dc,"flight"))
r = &val_string_flight;
else if (cst_streq(dc,"to"))
r = &val_string_to;
else
r = &val_string_other;
cst_free(dc);
return r;
}
const cst_val *content_words_in(const cst_item *p)
{
const cst_item *s;
int i=0;
p=item_as(p,"Word");
s=item_as(path_to_item(p,"R:SylStructure.R:Phrase.parent.daughter1"),"Word");
for (;s && !item_equal(p,s);s=item_next(s))
{
if (!strcmp(ffeature_string(s,"gpos"),"content"))
{i++;}
}
// if(!strcmp(ffeature_string(p,"gpos"), "content")){i++;}
return val_string_n(i);
}
const cst_val *content_words_out(const cst_item *p)
{
const cst_item *s;
int i=0;
p=item_as(p,"Word");
s=item_as(path_to_item(p,"R:SylStructure.R:Phrase.parent.daughtern"),"Word");
#if 1 /* fix by uratec */
for (;s && !item_equal(p,s);s=item_prev(s))
{
if (!strcmp(ffeature_string(s,"gpos"),"content"))
{i++;}
}
#else
for (;s && !item_equal(p,s);p=item_next(p))
{
if (!strcmp(ffeature_string(p,"gpos"),"content"))
{i++;}
}
if(!strcmp(ffeature_string(s,"gpos"), "content")){i++;}
#endif
return val_string_n(i);
}
const cst_val *cg_content_words_in_phrase(const cst_item *p)
{
return float_val(ffeature_float(p,"R:SylStructure.parent.parent.R:Word.content_words_in") + ffeature_float(p,"R:SylStructure.parent.parent.R:Word.content_words_out")) ;//- (strcmp(ffeature_string(p,"R:SylStructure.parent.parent.R:Word.gpos"),"content")==0?1:0));
}
void us_ff_register(cst_features *ffunctions)
{
/* The language independent ones */
basic_ff_register(ffunctions);
ff_register(ffunctions, "gpos",gpos);
ff_register(ffunctions, "num_digits",num_digits);
ff_register(ffunctions, "month_range",month_range);
ff_register(ffunctions, "token_pos_guess",token_pos_guess);
ff_register(ffunctions, "content_words_in",content_words_in);
ff_register(ffunctions, "content_words_out",content_words_out);
ff_register(ffunctions, "lisp_cg_content_words_in_phrase",cg_content_words_in_phrase);
}