| /* |
| Copyright 2002-2003 John Plevyak, All Rights Reserved |
| */ |
| |
| #include "d.h" |
| |
| static char *action_types[] = { "ACCEPT", "SHIFT", "REDUCE" }; |
| |
| static void print_state(State *s); |
| |
| Production * |
| new_production(Grammar *g, char *name) { |
| Production *p; |
| if ((p = lookup_production(g, name, strlen(name)))) |
| return p; |
| p = MALLOC(sizeof(Production)); |
| memset(p, 0, sizeof(Production)); |
| vec_add(&g->productions, p); |
| p->name = name; |
| p->name_len = strlen(name); |
| return p; |
| } |
| |
| static Elem * |
| new_elem() { |
| Elem *e = MALLOC(sizeof(Elem)); |
| memset(e, 0, sizeof(Elem)); |
| return e; |
| } |
| |
| Rule * |
| new_rule(Grammar *g, Production *p) { |
| Rule *r = MALLOC(sizeof(Rule)); |
| memset(r, 0, sizeof(Rule)); |
| r->prod = p; |
| r->end = new_elem(); |
| r->end->kind = ELEM_END; |
| r->end->rule = r; |
| r->action_index = g->action_index; |
| return r; |
| } |
| |
| static Term * |
| new_term() { |
| Term *term = MALLOC(sizeof(Term)); |
| memset(term, 0, sizeof(Term)); |
| return term; |
| } |
| |
| static Elem * |
| new_elem_term(Term *t, Rule *r) { |
| Elem *e = new_elem(); |
| e->kind = ELEM_TERM; |
| e->e.term = t; |
| e->rule = r; |
| vec_add(&r->elems, e); |
| return e; |
| } |
| |
| Elem * |
| new_elem_nterm(Production *p, Rule *r) { |
| Elem *e = new_elem(); |
| e->kind = ELEM_NTERM; |
| e->e.nterm = p; |
| e->rule = r; |
| return e; |
| } |
| |
| static Elem * |
| new_term_string(Grammar *g, char *s, char *e, Rule *r) { |
| Term *t = new_term(); |
| Elem *elem; |
| |
| t->string = MALLOC(e - s + 1); |
| memcpy(t->string, s, e - s); |
| t->string[e - s] = 0; |
| t->string_len = e - s; |
| vec_add(&g->terminals, t); |
| elem = new_elem_term(t, r); |
| return elem; |
| } |
| |
| #define ESC(_c) *ss++ = '\\'; *ss++ = _c; break; |
| char * |
| escape_string_for_regex(char *s) { |
| char *ss = (char*)MALLOC((strlen(s) + 1) * 2), *sss = ss; |
| for (; *s; s++) { |
| switch (*s) { |
| case '(': |
| case ')': |
| case '[': |
| case ']': |
| case '-': |
| case '^': |
| case '*': |
| case '?': |
| case '+': |
| *ss++ = '\\'; |
| /* fall through */ |
| default: *ss++ = *s; break; |
| } |
| } |
| *ss = 0; |
| return sss; |
| } |
| |
| static void |
| unescape_term_string(Term *t) { |
| char *s, *start = 0, *ss; |
| int length, base = 0; |
| |
| for (ss = s = t->string; *s; s++) { |
| if (*s == '\\') { |
| switch (s[1]) { |
| case 'b': *ss = '\b'; s++; break; |
| case 'f': *ss = '\f'; s++; break; |
| case 'n': *ss = '\n'; s++; break; |
| case 'r': *ss = '\r'; s++; break; |
| case 't': *ss = '\t'; s++; break; |
| case 'v': *ss = '\v'; s++; break; |
| case 'a': *ss = '\a'; s++; break; |
| case 'c': *ss = 0; return; |
| case '\"': |
| if (t->kind == TERM_REGEX) |
| { *ss = '\"'; s++; break; } |
| else |
| goto Ldefault; |
| case '\'': |
| if (t->kind == TERM_STRING) |
| { *ss = '\''; s++; break; } |
| else |
| goto Ldefault; |
| case 'x': |
| length = 0; |
| if (isxdigit(s[2])) { |
| base = 16; |
| start = s + 2; |
| length++; |
| if (isxdigit(s[3])) |
| length++; |
| } |
| s += length + 1; |
| goto Lncont; |
| case 'd': |
| length = 0; |
| if (isdigit(s[2])) { |
| base = 10; |
| start = s + 2; |
| length++; |
| if (isdigit(s[3])) { |
| length++; |
| if (isdigit(s[4]) && ((s[2] < '2') || ((s[2] == '2') && ((s[3] < '5') || |
| ((s[3] == '5') && (s[4] < '6')))))) |
| length++; |
| } |
| } |
| s += length + 1; |
| goto Lncont; |
| case '0': case '1': case '2': case '3': |
| case '4': case '5': case '6': case '7': |
| length = 1; |
| base = 8; |
| start = s + 1; |
| if (isdigit(s[2]) && (s[2] != '8') && (s[2] != '9')) { |
| length++; |
| if (isdigit(s[3]) && (s[3] != '8') && (s[3] != '9')) { |
| length++; |
| } |
| } |
| s += length; |
| /* fall through */ |
| Lncont: |
| if (length > 0) { |
| char saved_c = start[length]; |
| start[length] = '\0'; |
| *ss = (char) strtol(start, NULL, base); |
| start[length] = saved_c; |
| if (*s > 0) |
| break; |
| d_fail("encountered an escaped NULL while processing '%s'", t->string); |
| } else |
| goto next; |
| Ldefault: |
| default: |
| *ss++ = *s; |
| *ss = s[1]; |
| s++; |
| break; |
| } |
| } else |
| *ss = *s; |
| ss++; |
| next:; |
| } |
| *ss = 0; |
| t->string_len = strlen(t->string); |
| if (!t->string_len) |
| d_fail("empty string after unescape '%s'", t->string); |
| } |
| |
| Elem * |
| new_string(Grammar *g, char *s, char *e, Rule *r) { |
| Elem *x = new_term_string(g, s + 1, e - 1, r); |
| x->e.term->kind = (*s == '"') ? TERM_REGEX : TERM_STRING; |
| unescape_term_string(x->e.term); |
| return x; |
| } |
| |
| Elem * |
| new_ident(char *s, char *e, Rule *r) { |
| Elem *x = new_elem(); |
| x->kind = ELEM_UNRESOLVED; |
| x->e.unresolved.string = dup_str(s, e); |
| x->e.unresolved.len = strlen(x->e.unresolved.string); |
| x->rule = r; |
| if (r) |
| vec_add(&r->elems, x); |
| return x; |
| } |
| |
| void |
| new_token(Grammar *g, char *s, char *e) { |
| Term *t = new_term(); |
| t->string = MALLOC(e - s + 1); |
| memcpy(t->string, s, e - s); |
| t->string[e - s] = 0; |
| t->string_len = e - s; |
| vec_add(&g->terminals, t); |
| t->kind = TERM_TOKEN; |
| } |
| |
| Elem * |
| new_code(Grammar *g, char *s, char *e, Rule *r) { |
| Elem *x = new_term_string(g, s, e, r); |
| x->e.term->kind = TERM_CODE; |
| return x; |
| } |
| |
| Elem * |
| dup_elem(Elem *e, Rule *r) { |
| Elem *ee = MALLOC(sizeof(Elem)); |
| memcpy(ee, e, sizeof(Elem)); |
| ee->rule = r; |
| return ee; |
| } |
| |
| void |
| add_global_code(Grammar *g, char *start, char *end, int line) { |
| if (!g->code) g->code = MALLOC(sizeof(Code) * 4); |
| else if (!((g->ncode + 1) & 4)) |
| g->code = REALLOC(g->code, sizeof(Code) * (g->ncode + 4)); |
| g->code[g->ncode].code = dup_str(start, end); |
| g->code[g->ncode].line = line; |
| g->ncode++; |
| } |
| |
| void |
| new_declaration(Grammar *g, Elem *e, uint kind) { |
| Declaration *d = MALLOC(sizeof(*d)); |
| d->elem = e; |
| d->kind = kind; |
| d->index = g->declarations.n; |
| vec_add(&g->declarations, d); |
| } |
| |
| void |
| add_declaration(Grammar *g, char *start, char *end, uint kind, uint line) { |
| if (start == end) { |
| switch (kind) { |
| case DECLARE_SET_OP_PRIORITY: g->set_op_priority_from_rule = 1; return; |
| case DECLARE_STATES_FOR_ALL_NTERMS: g->states_for_all_nterms = 1; return; |
| case DECLARE_LONGEST_MATCH: g->longest_match = 1; return; |
| case DECLARE_ALL_MATCHES: g->longest_match = 0; return; |
| case DECLARE_TOKENIZE: g->tokenizer = 1; return; |
| case DECLARE_SAVE_PARSE_TREE: g->save_parse_tree = 1; return; |
| default: d_fail("declare expects argument, line %d", line); |
| } |
| } |
| switch (kind) { |
| case DECLARE_WHITESPACE: g->default_white_space = dup_str(start, end); return; |
| case DECLARE_SET_OP_PRIORITY: |
| d_fail("declare does not expect argument, line %d", line); |
| default: |
| new_declaration(g, new_ident(start, end, NULL), kind); |
| break; |
| } |
| } |
| |
| D_Pass * |
| find_pass(Grammar *g, char *start, char *end) { |
| int i, l; |
| while (*start && isspace(*start)) start++; |
| l = end - start; |
| for (i = 0; i < g->passes.n; i++) |
| if (l == g->passes.v[i]->name_len && |
| !strncmp(g->passes.v[i]->name, start, l)) |
| return g->passes.v[i]; |
| return NULL; |
| } |
| |
| void |
| add_pass(Grammar *g, char *start, char *end, uint kind, uint line) { |
| if (find_pass(g, start, end)) |
| d_fail("duplicate pass '%s' line %d", dup_str(start, end), line); |
| else { |
| D_Pass *p = MALLOC(sizeof(*p)); |
| p->name = dup_str(start, end); |
| p->name_len = end - start; |
| p->kind = kind; |
| p->index = g->pass_index++; |
| vec_add(&g->passes, p); |
| } |
| } |
| |
| void |
| add_pass_code(Grammar *g, Rule *r, char *pass_start, char *pass_end, |
| char *code_start, char *code_end, uint pass_line, uint code_line) |
| { |
| D_Pass *p = find_pass(g, pass_start, pass_end); |
| if (!p) |
| d_fail("unknown pass '%s' line %d", dup_str(pass_start, pass_end), pass_line); |
| while (r->pass_code.n <= p->index) vec_add(&r->pass_code, NULL); |
| r->pass_code.v[p->index] = MALLOC(sizeof(Code)); |
| r->pass_code.v[p->index]->code = dup_str(code_start, code_end); |
| r->pass_code.v[p->index]->line = code_line; |
| } |
| |
| |
| Production * |
| new_internal_production(Grammar *g, Production *p) { |
| char *n = p ? p->name : " _synthetic"; |
| char *name = MALLOC(strlen(n) + 20); |
| Production *pp = NULL, *tp = NULL, *ttp; |
| int i, found = 0; |
| sprintf(name, "%s.%d", n, g->productions.n); |
| pp = new_production(g, name); |
| pp->internal = INTERNAL_HIDDEN; |
| pp->regex = p ? p->regex : 0; |
| if (p) { |
| for (i = 0; i < g->productions.n; i++) { |
| if (found) { |
| ttp = g->productions.v[i]; |
| g->productions.v[i] = tp; |
| tp = ttp; |
| } else if (p == g->productions.v[i]) { |
| found = 1; |
| tp = g->productions.v[i+1]; |
| g->productions.v[i+1] = pp; |
| i++; |
| } |
| } |
| } |
| return pp; |
| } |
| |
| void |
| conditional_EBNF(Grammar *g) { |
| Production *pp; |
| Rule *rr; |
| |
| pp = new_internal_production(g, g->p); |
| pp->internal = INTERNAL_CONDITIONAL; |
| rr = new_rule(g, pp); |
| vec_add(&rr->elems, last_elem(g->r)); |
| last_elem(g->r)->rule = rr; |
| rr->elems.v[rr->elems.n - 1]->rule = rr; |
| vec_add(&pp->rules, rr); |
| vec_add(&pp->rules, new_rule(g, pp)); |
| last_elem(g->r) = new_elem_nterm(pp, g->r); |
| } |
| |
| void |
| star_EBNF(Grammar *g) { |
| Production *pp; |
| Rule *rr; |
| |
| pp = new_internal_production(g, g->p); |
| pp->internal = INTERNAL_STAR; |
| rr = new_rule(g, pp); |
| if (!g->right_recursive_BNF) { |
| vec_add(&rr->elems, new_elem_nterm(pp, rr)); |
| vec_add(&rr->elems, last_elem(g->r)); |
| last_elem(g->r) = new_elem_nterm(pp, g->r); |
| last_elem(rr)->rule = rr; |
| } else { |
| vec_add(&rr->elems, last_elem(g->r)); |
| last_elem(g->r) = new_elem_nterm(pp, g->r); |
| last_elem(rr)->rule = rr; |
| vec_add(&rr->elems, new_elem_nterm(pp, rr)); |
| } |
| vec_add(&pp->rules, rr); |
| vec_add(&pp->rules, new_rule(g, pp)); |
| } |
| |
| void |
| plus_EBNF(Grammar *g) { |
| Production *pp; |
| Rule *rr; |
| Elem *elem; |
| |
| pp = new_internal_production(g, g->p); |
| pp->internal = INTERNAL_PLUS; |
| rr = new_rule(g, pp); |
| if (!g->right_recursive_BNF) { |
| elem = last_elem(g->r); |
| vec_add(&rr->elems, new_elem_nterm(pp, rr)); |
| vec_add(&rr->elems, dup_elem(elem, rr)); |
| last_elem(g->r) = new_elem_nterm(pp, g->r); |
| } else { |
| elem = last_elem(g->r); |
| vec_add(&rr->elems, dup_elem(elem, rr)); |
| last_elem(g->r) = new_elem_nterm(pp, g->r); |
| vec_add(&rr->elems, new_elem_nterm(pp, rr)); |
| } |
| vec_add(&pp->rules, rr); |
| rr = new_rule(g, pp); |
| vec_add(&rr->elems, elem); |
| elem->rule = rr; |
| vec_add(&pp->rules, rr); |
| } |
| |
| void |
| initialize_productions(Grammar *g) { |
| Production *pp, *ppp; |
| Rule *rrr; |
| |
| ppp = new_production(g, strdup("0 Start")); |
| ppp->internal = INTERNAL_HIDDEN; |
| rrr = new_rule(g, ppp); |
| vec_add(&rrr->elems, new_elem_nterm(NULL, rrr)); |
| vec_add(&ppp->rules, rrr); |
| pp = new_production(g, strdup("1 Start")); |
| pp->internal = INTERNAL_HIDDEN; |
| rrr->elems.v[0]->e.nterm = pp; |
| } |
| |
| void |
| finish_productions(Grammar *g) { |
| Production *pp = g->productions.v[1]; |
| Rule *rr = new_rule(g, g->productions.v[1]); |
| vec_add(&rr->elems, new_elem_nterm(NULL, rr)); |
| vec_add(&pp->rules, rr); |
| rr->elems.v[0]->e.nterm = g->productions.v[2]; |
| } |
| |
| Production * |
| lookup_production(Grammar *g, char *name, int l) { |
| int i; |
| |
| for (i = 0; i < g->productions.n; i++) { |
| Production *pp = g->productions.v[i]; |
| if (pp->name_len != l || strncmp(pp->name, name, l)) |
| continue; |
| return pp; |
| } |
| return NULL; |
| } |
| |
| static Term * |
| lookup_token(Grammar *g, char *name, int l) { |
| int i; |
| |
| for (i = 0; i < g->terminals.n; i++) { |
| Term *t = g->terminals.v[i]; |
| if (t->kind != TERM_TOKEN || t->string_len != l || |
| strncmp(t->string, name, l)) |
| continue; |
| return t; |
| } |
| return NULL; |
| } |
| |
| static Term * |
| unique_term(Grammar *g, Term *t) { |
| int i; |
| for (i = 0; i < g->terminals.n; i++) |
| if (t->kind == g->terminals.v[i]->kind && |
| t->string_len == g->terminals.v[i]->string_len && |
| t->term_priority == g->terminals.v[i]->term_priority && |
| (!g->set_op_priority_from_rule || |
| (t->op_assoc == g->terminals.v[i]->op_assoc && |
| t->op_priority == g->terminals.v[i]->op_priority)) && |
| !strncmp(t->string, g->terminals.v[i]->string, t->string_len)) |
| return g->terminals.v[i]; |
| return t; |
| } |
| |
| static void |
| compute_nullable(Grammar *g) { |
| int i, j, k, changed = 1; |
| Elem *e; |
| |
| /* ensure that the trivial case is the first cause */ |
| for (i = 0; i < g->productions.n; i++) { |
| for (j = 0; j < g->productions.v[i]->rules.n; j++) |
| if (!g->productions.v[i]->rules.v[j]->elems.n) { |
| g->productions.v[i]->nullable = g->productions.v[i]->rules.v[j]; |
| break; |
| } |
| } |
| /* transitive closure */ |
| while (changed) { |
| changed = 0; |
| for (i = 0; i < g->productions.n; i++) { |
| if (!g->productions.v[i]->nullable) |
| for (j = 0; j < g->productions.v[i]->rules.n; j++) { |
| for (k = 0; k < g->productions.v[i]->rules.v[j]->elems.n; k++) { |
| e = g->productions.v[i]->rules.v[j]->elems.v[k]; |
| if (e->kind != ELEM_NTERM || !e->e.nterm->nullable) |
| goto Lnot_nullable; |
| } |
| changed = 1; |
| g->productions.v[i]->nullable = g->productions.v[i]->rules.v[j]; |
| break; |
| } |
| Lnot_nullable:; |
| } |
| } |
| } |
| |
| /* |
| verify and cleanup the grammar datastructures |
| - resolve non-terminals |
| - set element indexes |
| */ |
| static void |
| resolve_grammar(Grammar *g) { |
| int i, j, k, l; |
| Production *p, *pp; |
| Rule *r; |
| Elem *e; |
| Term *last_term, *t; |
| |
| g->rule_index = 0; |
| for (i = 0; i < g->productions.n; i++) { |
| p = g->productions.v[i]; |
| if (p != lookup_production(g, p->name, p->name_len)) |
| d_fail("duplicate production '%s'", p->name); |
| p->index = i; |
| for (j = 0; j < p->rules.n; j++) { |
| r = p->rules.v[j]; |
| r->index = g->rule_index++; |
| last_term = NULL; |
| for (k = 0; k < r->elems.n; k++) { |
| e = r->elems.v[k]; |
| e->index = k; |
| if (e->kind == ELEM_UNRESOLVED) { |
| l = e->e.unresolved.len; |
| if ((pp = lookup_production(g, e->e.unresolved.string, l))) { |
| e->kind = ELEM_NTERM; |
| e->e.nterm = pp; |
| } else if ((t = lookup_token(g, e->e.unresolved.string, l))) { |
| e->kind = ELEM_TERM; |
| e->e.term = t; |
| } else { |
| char str[256]; |
| strncpy(str, e->e.unresolved.string, l); |
| str[l < 255 ? l : 255] = 0; |
| d_fail("unresolved identifier: '%s'", str); |
| } |
| } |
| if (e->kind == ELEM_TERM) |
| last_term = e->e.term; |
| } |
| r->end->index = r->elems.n; |
| if (g->set_op_priority_from_rule) { |
| if (last_term && r->rule_assoc) { |
| last_term->op_assoc = r->rule_assoc; |
| last_term->op_priority = r->rule_priority; |
| } |
| } |
| } |
| } |
| for (i = 0; i < g->terminals.n; i++) |
| g->terminals.v[i]->index = i; |
| compute_nullable(g); |
| } |
| |
| static void |
| merge_identical_terminals(Grammar *g) { |
| int i, j, k; |
| Production *p; |
| Rule *r; |
| Elem *e; |
| |
| for (i = 0; i < g->productions.n; i++) { |
| p = g->productions.v[i]; |
| for (j = 0; j < p->rules.n; j++) { |
| r = p->rules.v[j]; |
| for (k = 0; k < r->elems.n; k++) { |
| e = r->elems.v[k]; |
| if (e->kind == ELEM_TERM) |
| e->e.term = unique_term(g, e->e.term); |
| } |
| } |
| } |
| } |
| |
| void |
| print_term(Term *t) { |
| char *s = t->string ? escape_string(t->string) : NULL; |
| if (t->kind == TERM_STRING) { |
| if (!t->string || !*t->string) |
| printf("<EOF> "); |
| else |
| printf("string(\"%s\") ", s); |
| } else if (t->kind == TERM_REGEX) |
| printf("regex(\"%s\") ", s); |
| else if (t->kind == TERM_CODE) |
| printf("code(\"%s\") ", s); |
| else if (t->kind == TERM_TOKEN) |
| printf("token(\"%s\") ", s); |
| else |
| d_fail("unknown token kind"); |
| if (s) |
| FREE(s); |
| } |
| |
| void |
| print_elem(Elem *ee) { |
| if (ee->kind == ELEM_TERM) |
| print_term(ee->e.term); |
| else if (ee->kind == ELEM_UNRESOLVED) |
| printf("%s ", ee->e.unresolved.string); |
| else |
| printf("%s ", ee->e.nterm->name); |
| } |
| |
| struct EnumStr { |
| uint e; |
| char *s; |
| } assoc_strings[] = { |
| { ASSOC_NONE, "$none" }, |
| { ASSOC_NARY_LEFT, "$left" }, |
| { ASSOC_NARY_RIGHT, "$right" }, |
| { ASSOC_UNARY_LEFT, "$unary_left" }, |
| { ASSOC_UNARY_RIGHT, "$unary_right" }, |
| { ASSOC_BINARY_LEFT, "$binary_left" }, |
| { ASSOC_BINARY_RIGHT, "$binary_right" }, |
| { ASSOC_NO, "$noassoc" } |
| }; |
| |
| static char * |
| assoc_str(uint e) { |
| int i; |
| |
| for (i = 0; i < sizeof(assoc_strings) / sizeof(assoc_strings[0]); i++) |
| if (e == assoc_strings[i].e) |
| return assoc_strings[i].s; |
| return assoc_strings[0].s; |
| } |
| |
| void |
| print_rule(Rule *r) { |
| int k; |
| |
| printf("%s: ", r->prod->name); |
| for (k = 0; k < r->elems.n; k++) |
| print_elem(r->elems.v[k]); |
| if (r->speculative_code.code) |
| printf("SPECULATIVE_CODE\n%s\nEND CODE\n", r->speculative_code.code); |
| if (r->final_code.code) |
| printf("FINAL_CODE\n%s\nEND CODE\n", r->final_code.code); |
| } |
| |
| void |
| print_grammar(Grammar *g) { |
| uint i, j, k; |
| Production *pp; |
| Rule *rr; |
| |
| if (!g->productions.n) |
| return; |
| printf("PRODUCTIONS\n\n"); |
| for (i = 0; i < g->productions.n; i++) { |
| pp = g->productions.v[i]; |
| printf("%s (%d)\n", pp->name, i); |
| for (j = 0; j < pp->rules.n; j++) { |
| rr = pp->rules.v[j]; |
| if (!j) |
| printf("\t: "); |
| else |
| printf("\t| "); |
| for (k = 0; k < rr->elems.n; k++) |
| print_elem(rr->elems.v[k]); |
| if (rr->op_priority) |
| printf("op %d ", rr->op_priority); |
| if (rr->op_assoc) |
| printf("%s ", assoc_str(rr->op_assoc)); |
| if (rr->rule_priority) |
| printf("rule %d ", rr->rule_priority); |
| if (rr->rule_assoc) |
| printf("%s ", assoc_str(rr->rule_assoc)); |
| if (rr->speculative_code.code) |
| printf("%s ", rr->speculative_code.code); |
| if (rr->final_code.code) |
| printf("%s ", rr->final_code.code); |
| printf("\n"); |
| } |
| printf("\t;\n"); |
| printf("\n"); |
| } |
| printf("TERMINALS\n\n"); |
| for (i = 0; i < g->terminals.n; i++) { |
| printf("\t"); |
| print_term(g->terminals.v[i]); |
| printf("(%d)\n", i + g->productions.n); |
| } |
| printf("\n"); |
| } |
| |
| static void |
| print_item(Item *i) { |
| int j, end = 1; |
| |
| printf("\t%s: ", i->rule->prod->name); |
| for (j = 0; j < i->rule->elems.n; j++) { |
| Elem *e = i->rule->elems.v[j]; |
| if (i == e) { |
| printf(". "); |
| end = 0; |
| } |
| print_elem(e); |
| } |
| if (end) |
| printf(". "); |
| printf("\n"); |
| } |
| |
| static void |
| print_conflict(char *kind, int *conflict) { |
| if (!*conflict) { |
| printf(" CONFLICT (before precedence and associativity)\n"); |
| *conflict = 1; |
| } |
| printf("\t%s conflict ", kind); |
| printf("\n"); |
| } |
| |
| static void |
| print_state(State *s) { |
| int j, conflict = 0; |
| |
| printf("STATE %d (%d ITEMS)%s\n", s->index, s->items.n, |
| s->accept ? " ACCEPT" : ""); |
| for (j = 0; j < s->items.n; j++) |
| print_item(s->items.v[j]); |
| if (s->gotos.n) |
| printf(" GOTO\n"); |
| for (j = 0; j < s->gotos.n; j++) { |
| printf("\t"); |
| print_elem(s->gotos.v[j]->elem); |
| printf(" : %d\n", s->gotos.v[j]->state->index); |
| } |
| printf(" ACTION\n"); |
| for (j = 0; j < s->reduce_actions.n; j++) { |
| Action *a = s->reduce_actions.v[j]; |
| printf("\t%s\t", action_types[a->kind]); |
| print_rule(a->rule); |
| printf("\n"); |
| } |
| for (j = 0; j < s->shift_actions.n; j++) { |
| Action *a = s->shift_actions.v[j]; |
| printf("\t%s\t", action_types[a->kind]); |
| if (a->kind == ACTION_SHIFT) { |
| print_term(a->term); |
| printf("%d", a->state->index); |
| } |
| printf("\n"); |
| } |
| if (s->reduce_actions.n > 1) |
| print_conflict("reduce/reduce", &conflict); |
| if (s->reduce_actions.n && s->shift_actions.n) |
| print_conflict("shift/reduce", &conflict); |
| printf("\n"); |
| } |
| |
| void |
| print_states(Grammar *g) { |
| int i; |
| |
| for (i = 0; i < g->states.n; i++) |
| print_state(g->states.v[i]); |
| } |
| |
| int |
| state_for_declaration(Grammar *g, int iproduction) { |
| int i; |
| for (i = 0; i < g->declarations.n; i++) |
| if (g->declarations.v[i]->kind == DECLARE_STATE_FOR && |
| g->declarations.v[i]->elem->e.nterm->index == iproduction) |
| return 1; |
| return 0; |
| } |
| |
| static void |
| make_elems_for_productions(Grammar *g) { |
| int i, j, k, l; |
| Rule *rr; |
| Production *pp, *ppp; |
| |
| pp = g->productions.v[0]; |
| for (i = 0; i < g->productions.n; i++) |
| if (!g->productions.v[i]->internal) { |
| if (g->states_for_all_nterms || |
| state_for_declaration(g, i)) { |
| /* try to find an existing elem */ |
| for (j = 0; j < g->productions.n; j++) |
| for (k = 0; k < g->productions.v[j]->rules.n; k++) { |
| rr = g->productions.v[j]->rules.v[k]; |
| for (l = 0; l < rr->elems.n; l++) |
| if (rr->elems.v[l]->e.term_or_nterm == g->productions.v[i]) { |
| g->productions.v[i]->elem = rr->elems.v[l]; |
| break; |
| } |
| } |
| if (j >= g->productions.n) { /* not found */ |
| g->productions.v[i]->elem = |
| new_elem_nterm(g->productions.v[i], new_rule(g, pp)); |
| g->productions.v[i]->elem->rule->index = g->rule_index++; /* fake */ |
| } |
| } |
| } |
| if (!g->states_for_all_nterms && |
| g->states_for_whitespace && |
| (ppp = lookup_production(g, "whitespace", sizeof("whitespace")-1))) |
| { |
| ppp->elem = new_elem_nterm(ppp, new_rule(g, pp)); |
| ppp->elem->rule->index = g->rule_index++; /* fake */ |
| } |
| } |
| |
| static void |
| convert_regex_production_one(Grammar *g, Production *p) { |
| int j, k, l; |
| Production *pp; |
| Rule *r, *rr; |
| Elem *e; |
| Term *t; |
| int circular = 0; |
| char *buf = 0, *b, *s; |
| int buf_len = 0; |
| |
| if (p->regex_term) /* already done */ |
| return; |
| if (p->in_regex) |
| d_fail("circular regex production '%s'", p->name); |
| p->in_regex = 1; |
| for (j = 0; j < p->rules.n; j++) { |
| r = p->rules.v[j]; |
| if (r->final_code.code || (r->speculative_code.code && p->rules.n > 1)) |
| d_fail("final and/or multi-rule code not permitted in regex productions '%s'", p->name); |
| for (k = 0; k < r->elems.n; k++) { |
| e = r->elems.v[k]; |
| if (e->kind == ELEM_NTERM) { |
| if (!e->e.nterm->regex) |
| d_fail("regex production '%s' cannot invoke non-regex production '%s'", |
| p->name, e->e.nterm->name); |
| pp = e->e.nterm; |
| for (l = 0; l < pp->rules.n; l++) |
| if (pp->rules.v[l]->speculative_code.code || pp->rules.v[l]->final_code.code) |
| d_fail("code not permitted in rule %d of regex productions '%s'", l, p->name); |
| if (p != pp) { |
| convert_regex_production_one(g, pp); |
| buf_len += pp->regex_term->string_len + 5; |
| } else { |
| circular = 1; |
| buf_len += 5; |
| } |
| } else { /* e->kind == ELEM_TERM */ |
| if (e->e.term->kind == TERM_CODE || e->e.term->kind == TERM_TOKEN) |
| d_fail("regex production '%s' cannot include scanners or tokens"); |
| buf_len += e->e.term->string_len + 5; |
| } |
| } |
| } |
| b = buf = (char*)MALLOC(buf_len + 1); |
| t = new_term(); |
| t->kind = TERM_REGEX; |
| t->string = buf; |
| t->index = g->terminals.n; |
| t->regex_production = p; |
| vec_add(&g->terminals, t); |
| p->regex_term = t; |
| if (circular) { /* attempt to match to regex operators */ |
| if (p->rules.n != 2) |
| Lfail: d_fail("unable to resolve circular regex production: '%s'", p->name); |
| l = p->rules.v[0]->elems.n + p->rules.v[1]->elems.n; |
| if (l == 2 || l == 3) { |
| if (p->rules.v[0]->elems.n != 2 && p->rules.v[1]->elems.n != 2) |
| goto Lfail; |
| r = p->rules.v[0]->elems.n == 2 ? p->rules.v[0] : p->rules.v[1]; |
| rr = p->rules.v[0] == r ? p->rules.v[1] : p->rules.v[0]; |
| if (r->elems.v[0]->e.nterm != p && r->elems.v[1]->e.nterm != p) |
| goto Lfail; |
| e = r->elems.v[0]->e.nterm == p ? r->elems.v[1] : r->elems.v[1]; |
| if (rr->elems.n && e->e.term_or_nterm != rr->elems.v[0]->e.term_or_nterm) |
| goto Lfail; |
| t = e->kind == ELEM_TERM ? e->e.term : e->e.nterm->regex_term; |
| *b++ = '('; |
| if (t->kind == TERM_STRING) |
| s = escape_string_for_regex(t->string); |
| else |
| s = t->string; |
| memcpy(b, s, strlen(s)); b += strlen(s); |
| if (t->kind == TERM_STRING) |
| FREE(s); |
| *b++ = ')'; |
| if (l == 2) |
| *b++ = '*'; |
| else |
| *b++ = '+'; |
| *b = 0; |
| p->regex_term->string_len = strlen(p->regex_term->string); |
| } else |
| goto Lfail; |
| } else { /* handle the base case, p = (r | r'), r = (e e') */ |
| if (p->rules.n > 1) |
| *b++ = '('; |
| for (j = 0; j < p->rules.n; j++) { |
| r = p->rules.v[j]; |
| if (r->elems.n > 1) |
| *b++ = '('; |
| for (k = 0; k < r->elems.n; k++) { |
| e = r->elems.v[k]; |
| t = e->kind == ELEM_TERM ? e->e.term : e->e.nterm->regex_term; |
| if (t->kind == TERM_STRING) |
| s = escape_string_for_regex(t->string); |
| else |
| s = t->string; |
| memcpy(b, s, strlen(s)); b += strlen(s); |
| if (t->kind == TERM_STRING) |
| FREE(s); |
| } |
| if (r->elems.n > 1) |
| *b++ = ')'; |
| if (j != p->rules.n - 1) |
| *b++ = '|'; |
| } |
| if (p->rules.n > 1) |
| *b++ = ')'; |
| *b = 0; |
| p->regex_term->string_len = strlen(p->regex_term->string); |
| } |
| p->in_regex = 0; |
| } |
| |
| static void |
| convert_regex_productions(Grammar *g) { |
| int i, j, k; |
| Production *p; |
| Rule *r; |
| |
| for (i = 0; i < g->productions.n; i++) { |
| p = g->productions.v[i]; |
| if (!p->regex) |
| continue; |
| convert_regex_production_one(g, p); |
| } |
| for (i = 0; i < g->productions.n; i++) { |
| p = g->productions.v[i]; |
| for (j = 0; j < p->rules.n; j++) { |
| r = p->rules.v[j]; |
| for (k = 0; k < r->elems.n; k++) { |
| if (r->elems.v[k]->kind == ELEM_NTERM && r->elems.v[k]->e.nterm->regex_term) { |
| r->elems.v[k]->e.term = r->elems.v[k]->e.nterm->regex_term; |
| r->elems.v[k]->kind = ELEM_TERM; |
| } |
| } |
| } |
| } |
| } |
| |
| static void |
| check_default_actions(Grammar *g) { |
| Production *pdefault; |
| |
| pdefault = lookup_production(g, "_", 1); |
| if (pdefault && pdefault->rules.n > 1) |
| d_fail("number of rules in default action != 1"); |
| } |
| |
| typedef struct { |
| struct State *eq; |
| struct Rule *diff_rule; |
| struct State *diff_state; |
| } EqState; |
| |
| void |
| build_eq(Grammar *g) { |
| int i, j, k, changed = 1, x, xx; |
| State *s, *ss; |
| EqState *eq, *e, *ee; |
| |
| eq = (EqState*)MALLOC(sizeof(EqState)*g->states.n); |
| memset(eq, 0, sizeof(EqState)*g->states.n); |
| while (changed) { |
| changed = 0; |
| for (i = 0; i < g->states.n; i++) { |
| s = g->states.v[i]; |
| e = &eq[s->index]; |
| for (j = i + 1; j < g->states.n; j++) { |
| ss = g->states.v[j]; |
| ee = &eq[ss->index]; |
| if (e->eq || ee->eq) |
| continue; |
| if (s->same_shifts != ss->same_shifts && ss->same_shifts != s) |
| continue; |
| /* check gotos */ |
| if (s->gotos.n != ss->gotos.n) |
| continue; |
| for (k = 0; k < s->gotos.n; k++) { |
| if (elem_symbol(g, s->gotos.v[k]->elem) != elem_symbol(g, ss->gotos.v[k]->elem)) |
| goto Lcontinue; |
| if (s->gotos.v[k]->state != ss->gotos.v[k]->state) { |
| EqState *ge = &eq[s->gotos.v[k]->state->index]; |
| EqState *gee = &eq[ss->gotos.v[k]->state->index]; |
| if (ge->eq != ss->gotos.v[k]->state && gee->eq != s->gotos.v[k]->state) |
| goto Lcontinue; |
| if ((ee->diff_state && ee->diff_state != eq[ss->gotos.v[k]->state->index].eq) || |
| (e->diff_state && e->diff_state != eq[s->gotos.v[k]->state->index].eq)) |
| goto Lcontinue; |
| /* allow one different state */ |
| ee->diff_state = ss->gotos.v[k]->state; |
| e->diff_state = s->gotos.v[k]->state; |
| } |
| } |
| /* check reductions */ |
| if (s->reduce_actions.n != ss->reduce_actions.n) |
| continue; |
| for (k = 0; k < s->reduce_actions.n; k++) { |
| if (s->reduce_actions.v[k]->rule == ss->reduce_actions.v[k]->rule) |
| continue; |
| if (s->reduce_actions.v[k]->rule->prod != |
| ss->reduce_actions.v[k]->rule->prod) |
| goto Lcontinue; |
| if ((x = s->reduce_actions.v[k]->rule->elems.n) != |
| (xx = ss->reduce_actions.v[k]->rule->elems.n)) { |
| if ((ee->diff_rule && ee->diff_rule != ss->reduce_actions.v[k]->rule) || |
| (e->diff_rule && e->diff_rule != s->reduce_actions.v[k]->rule)) |
| goto Lcontinue; |
| /* allow one different rule */ |
| ee->diff_rule = ss->reduce_actions.v[k]->rule; |
| e->diff_rule = s->reduce_actions.v[k]->rule; |
| } |
| } |
| ee->eq = s; |
| changed = 1; |
| Lcontinue:; |
| } |
| } |
| } |
| for (i = 0; i < g->states.n; i++) { |
| s = g->states.v[i]; |
| e = &eq[s->index]; |
| if (e->eq) { |
| if (verbose_level > 2) { |
| printf("eq %d %d ", s->index, e->eq->index); |
| if (e->diff_state) |
| printf("diff state (%d %d) ", |
| e->diff_state->index, |
| eq[e->eq->index].diff_state->index); |
| if (e->diff_rule) { |
| printf("diff rule "); |
| printf("[ "); |
| print_rule(e->diff_rule); |
| printf("][ "); |
| print_rule(eq[e->eq->index].diff_rule); |
| printf("]"); |
| } |
| printf("\n"); |
| } |
| } |
| } |
| for (i = 0; i < g->states.n; i++) { |
| s = g->states.v[i]; |
| e = &eq[s->index]; |
| if (e->eq && e->diff_state) { |
| if (eq[e->diff_state->index].diff_rule && |
| eq[e->diff_state->index].diff_rule->elems.n == 2) |
| { |
| s->reduces_to = e->eq; |
| s->reduces_with = eq[e->eq->index].diff_rule; |
| s->reduces_to_then_with = e->diff_rule; |
| } else if (eq[eq[e->eq->index].diff_state->index].diff_rule && |
| eq[eq[e->eq->index].diff_state->index].diff_rule->elems.n == 2) |
| { |
| e->eq->reduces_to = s; |
| s->reduces_with = e->diff_rule; |
| s->reduces_to_then_with = eq[e->eq->index].diff_rule; |
| } |
| } |
| } |
| for (i = 0; i < g->states.n; i++) { |
| s = g->states.v[i]; |
| if (s->reduces_to) |
| if (verbose_level) |
| printf("reduces_to %d %d\n", s->index, s->reduces_to->index); |
| } |
| FREE(eq); |
| } |
| |
| Grammar * |
| new_D_Grammar(char *pathname) { |
| Grammar *g = (Grammar *)MALLOC(sizeof(Grammar)); |
| memset(g, 0, sizeof(Grammar)); |
| g->pathname = dup_str(pathname, pathname + strlen(pathname)); |
| return g; |
| } |
| |
| void |
| free_D_Grammar(Grammar *g) { |
| FREE(g->pathname); |
| } |
| |
| int |
| parse_grammar(Grammar *g, D_ParserTables *t, int sizeof_ParseNode_User) { |
| FILE *fp = fopen(g->pathname, "r"); |
| char *s; |
| D_Parser *p; |
| |
| if (!fp) |
| return -1; |
| if (!(s = sbuf_read(g->pathname))) |
| return -1; |
| initialize_productions(g); |
| p = new_D_Parser(t, sizeof_ParseNode_User); |
| p->initial_globals = g; |
| p->loc.pathname = g->pathname; |
| if (dparse(p, s, strlen(s))) { |
| if (g->productions.n > 2) |
| finish_productions(g); |
| return 0; |
| } else |
| return -1; |
| } |
| |
| static int |
| scanner_declaration(Declaration *d) { |
| switch (d->kind) { |
| case DECLARE_TOKENIZE: |
| case DECLARE_LONGEST_MATCH: |
| case DECLARE_ALL_MATCHES: |
| return 1; |
| default: |
| return 0; |
| } |
| } |
| |
| static void |
| set_declaration_group(Production *p, Production *root, Declaration *d) { |
| int i, j; |
| if (p->declaration_group[d->kind] == root) |
| return; |
| if (d->kind == DECLARE_TOKENIZE && p->declaration_group[d->kind]) { |
| d_fail("shared tokenize subtrees"); |
| return; |
| } |
| p->declaration_group[d->kind] = root; |
| p->last_declaration[d->kind] = d; |
| for (i = 0; i < p->rules.n; i++) { |
| for (j = 0; j < p->rules.v[i]->elems.n; j++) |
| if (p->rules.v[i]->elems.v[j]->kind == ELEM_NTERM) |
| set_declaration_group(p->rules.v[i]->elems.v[j]->e.nterm, root, d); |
| } |
| } |
| |
| static void |
| propogate_declarations(Grammar *g) { |
| int i, j, k; |
| Production *p; |
| Rule *r; |
| Elem *e; |
| |
| /* global defaults */ |
| if (g->tokenizer) |
| new_declaration(g, new_elem_nterm(g->productions.v[0], NULL), DECLARE_TOKENIZE); |
| if (g->longest_match) |
| new_declaration(g, new_elem_nterm(g->productions.v[0], NULL), DECLARE_LONGEST_MATCH); |
| /* resolve declarations */ |
| for (i = 0; i < g->declarations.n; i++) { |
| e = g->declarations.v[i]->elem; |
| if (e->kind == ELEM_UNRESOLVED) { |
| if (!(p = lookup_production(g, e->e.unresolved.string, e->e.unresolved.len))) |
| d_fail("unresolved declaration '%s'", e->e.unresolved.string); |
| e->kind = ELEM_NTERM; |
| e->e.nterm = p; |
| } |
| } |
| /* build declaration groups (covering a production subtrees) */ |
| for (i = 0; i < g->declarations.n; i++) { |
| if (scanner_declaration(g->declarations.v[i])) { |
| p = g->declarations.v[i]->elem->e.nterm; |
| set_declaration_group(p, p, g->declarations.v[i]); |
| } |
| } |
| /* set terminal scan_kind */ |
| for (i = 0; i < g->productions.n; i++) { |
| p = g->productions.v[i]; |
| for (j = 0; j < p->rules.n; j++) { |
| r = p->rules.v[j]; |
| for (k = 0; k < r->elems.n; k++) { |
| e = r->elems.v[k]; |
| if (e->kind == ELEM_TERM) { |
| if (!p->declaration_group[DECLARE_LONGEST_MATCH] && |
| !p->declaration_group[DECLARE_ALL_MATCHES]) |
| e->e.term->scan_kind = D_SCAN_DEFAULT; |
| else if (p->declaration_group[DECLARE_LONGEST_MATCH] && |
| !p->declaration_group[DECLARE_ALL_MATCHES]) |
| e->e.term->scan_kind = D_SCAN_LONGEST; |
| else if (!p->declaration_group[DECLARE_LONGEST_MATCH] && |
| p->declaration_group[DECLARE_ALL_MATCHES]) |
| e->e.term->scan_kind = D_SCAN_ALL; |
| else { |
| if (p->last_declaration[DECLARE_LONGEST_MATCH]->index > |
| p->last_declaration[DECLARE_ALL_MATCHES]->index) |
| e->e.term->scan_kind = D_SCAN_LONGEST; |
| else |
| e->e.term->scan_kind = D_SCAN_ALL; |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| static void |
| merge_shift_actions(State *to, State *from) { |
| int i, j; |
| for (i = 0; i < from->shift_actions.n; i++) { |
| for (j = 0; j < to->shift_actions.n; j++) |
| if (from->shift_actions.v[i]->term == to->shift_actions.v[j]->term) |
| goto Lnext; |
| vec_add(&to->shift_actions, from->shift_actions.v[i]); |
| Lnext:; |
| } |
| } |
| |
| static void |
| compute_declaration_states(Grammar *g, Production *p, Declaration *d) { |
| State *s, *base_s = NULL; |
| int j, k, scanner = scanner_declaration(d); |
| |
| for (j = 0; j < g->states.n; j++) { |
| s = g->states.v[j]; |
| if (d->kind == DECLARE_TOKENIZE) { |
| if (!base_s) |
| base_s = s; |
| else { |
| s->same_shifts = base_s; |
| merge_shift_actions(base_s, s); |
| } |
| } |
| if (scanner) { |
| for (k = 0; k < s->items.n; k++) |
| if (s->items.v[k]->kind == ELEM_TERM) |
| switch (s->items.v[k]->e.term->scan_kind) { |
| case D_SCAN_LONGEST: |
| if (s->scan_kind == D_SCAN_RESERVED || |
| s->scan_kind == D_SCAN_LONGEST) |
| s->scan_kind = D_SCAN_LONGEST; |
| else |
| s->scan_kind = D_SCAN_MIXED; |
| break; |
| case D_SCAN_ALL: |
| if (s->scan_kind == D_SCAN_RESERVED || |
| s->scan_kind == D_SCAN_ALL) |
| s->scan_kind = D_SCAN_ALL; |
| else |
| s->scan_kind = D_SCAN_MIXED; |
| break; |
| default: |
| break; |
| } |
| } |
| } |
| } |
| |
| static void |
| map_declarations_to_states(Grammar *g) { |
| int i; |
| State *s; |
| |
| for (i = 0; i < g->states.n; i++) { |
| s = g->states.v[i]; |
| s->scan_kind = D_SCAN_RESERVED; |
| } |
| /* map groups to sets of states */ |
| for (i = 0; i < g->declarations.n; i++) |
| if (scanner_declaration(g->declarations.v[i])) |
| compute_declaration_states(g, g->declarations.v[i]->elem->e.nterm, |
| g->declarations.v[i]); |
| for (i = 0; i < g->states.n; i++) { |
| s = g->states.v[i]; |
| if (s->scan_kind == D_SCAN_RESERVED) |
| s->scan_kind = D_SCAN_DEFAULT; /* set the default */ |
| } |
| } |
| |
| int |
| build_grammar(Grammar *g) { |
| resolve_grammar(g); |
| convert_regex_productions(g); |
| propogate_declarations(g); |
| merge_identical_terminals(g); |
| make_elems_for_productions(g); |
| check_default_actions(g); |
| build_LR_tables(g); |
| map_declarations_to_states(g); |
| if (verbose_level) { |
| printf("%d productions %d terminals %d states %d declarations\n", |
| g->productions.n, g->terminals.n, g->states.n, |
| g->declarations.n); |
| } |
| if (verbose_level > 1) { |
| print_grammar(g); |
| print_states(g); |
| } |
| build_scanners(g); |
| build_eq(g); |
| return 0; |
| } |