| /* |
| Copyright 2002-2003 John Plevyak, All Rights Reserved |
| */ |
| |
| #include "d.h" |
| |
| typedef struct NFAState { |
| uint index; |
| Vec(struct NFAState*) chars[256]; |
| Vec(struct NFAState*) epsilon; |
| Vec(Action*) accepts; |
| Vec(Action*) live; |
| } NFAState; |
| |
| typedef struct DFAState { |
| Vec(struct NFAState*) states; |
| struct DFAState *chars[256]; |
| ScanState *scan; |
| } DFAState; |
| |
| typedef Vec(DFAState *) VecDFAState; |
| typedef Vec(NFAState *) VecNFAState; |
| |
| typedef struct LexState { |
| uint nfa_index; |
| VecNFAState allnfas; |
| uint transitions; |
| uint scanners; |
| uint ignore_case; |
| } LexState; |
| |
| static NFAState * |
| new_NFAState(LexState *ls) { |
| NFAState *n = MALLOC(sizeof(NFAState)); |
| memset(n, 0, sizeof(NFAState)); |
| n->index = ls->nfa_index++; |
| vec_add(&ls->allnfas, n); |
| return n; |
| } |
| |
| static DFAState * |
| new_DFAState() { |
| DFAState *n = MALLOC(sizeof(DFAState)); |
| memset(n, 0, sizeof(DFAState)); |
| return n; |
| } |
| |
| static void |
| free_DFAState(DFAState *y) { |
| vec_free(&y->states); |
| FREE(y); |
| } |
| |
| static void |
| free_VecDFAState(VecDFAState *dfas) { |
| int i; |
| for (i = 0; i < dfas->n; i++) |
| free_DFAState(dfas->v[i]); |
| vec_free(dfas); |
| } |
| |
| static void |
| free_NFAState(NFAState *y) { |
| int i; |
| for (i = 0; i < 256; i++) |
| vec_free(&y->chars[i]); |
| vec_free(&y->epsilon); |
| vec_free(&y->accepts); |
| FREE(y); |
| } |
| |
| static void |
| free_VecNFAState(VecNFAState *nfas) { |
| int i; |
| for (i = 0; i < nfas->n; i++) |
| free_NFAState(nfas->v[i]); |
| vec_free(nfas); |
| } |
| |
| static ScanState * |
| new_ScanState() { |
| ScanState *n = MALLOC(sizeof(ScanState)); |
| memset(n, 0, sizeof(ScanState)); |
| return n; |
| } |
| |
| static int |
| nfacmp(const void *ai, const void *aj) { |
| uint32 i = (*(NFAState**)ai)->index; |
| uint32 j = (*(NFAState**)aj)->index; |
| return (i > j) ? 1 : ((i < j) ? -1 : 0); |
| } |
| |
| static void |
| nfa_closure(DFAState *x) { |
| int i, j, k; |
| |
| for (i = 0; i < x->states.n; i++) |
| for (j = 0; j < x->states.v[i]->epsilon.n; j++) { |
| for (k = 0; k < x->states.n; k++) |
| if (x->states.v[i]->epsilon.v[j] == x->states.v[k]) |
| goto Lbreak; |
| vec_add(&x->states, x->states.v[i]->epsilon.v[j]); |
| Lbreak:; |
| } |
| qsort(x->states.v, x->states.n, sizeof(x->states.v[0]), nfacmp); |
| } |
| |
| static int |
| eq_dfa_state(DFAState *x, DFAState *y) { |
| int i; |
| |
| if (x->states.n != y->states.n) |
| return 0; |
| for (i = 0; i < x->states.n; i++) |
| if (x->states.v[i] != y->states.v[i]) |
| return 0; |
| return 1; |
| } |
| |
| static void |
| dfa_to_scanner(VecDFAState *alldfas, VecScanState *scanner) { |
| int i, j, k, highest, p; |
| |
| vec_clear(scanner); |
| for (i = 0; i < alldfas->n; i++) { |
| alldfas->v[i]->scan = new_ScanState(); |
| alldfas->v[i]->scan->index = i; |
| vec_add(scanner, alldfas->v[i]->scan); |
| } |
| for (i = 0; i < alldfas->n; i++) { |
| for (j = 0; j < 256; j++) |
| if (alldfas->v[i]->chars[j]) |
| alldfas->v[i]->scan->chars[j] = alldfas->v[i]->chars[j]->scan; |
| highest = INT_MIN; |
| for (j = 0; j < alldfas->v[i]->states.n; j++) |
| for (k = 0; k < alldfas->v[i]->states.v[j]->accepts.n; k++) { |
| p = alldfas->v[i]->states.v[j]->accepts.v[k]->term->term_priority; |
| if (highest < p) |
| highest = p; |
| } |
| for (j = 0; j < alldfas->v[i]->states.n; j++) |
| for (k = 0; k < alldfas->v[i]->states.v[j]->accepts.n; k++) { |
| p = alldfas->v[i]->states.v[j]->accepts.v[k]->term->term_priority; |
| if (p == highest) |
| vec_add(&alldfas->v[i]->scan->accepts, |
| alldfas->v[i]->states.v[j]->accepts.v[k]); |
| } |
| } |
| } |
| |
| static void |
| nfa_to_scanner(NFAState *n, Scanner *s) { |
| DFAState *x = new_DFAState(), *y; |
| VecDFAState alldfas; |
| int i, i_alldfas, i_states, i_char; |
| VecScanState *scanner = &s->states; |
| |
| memset(&alldfas, 0, sizeof(alldfas)); |
| vec_add(&x->states, n); |
| nfa_closure(x); |
| vec_add(&alldfas, x); |
| for (i_alldfas = 0; i_alldfas < alldfas.n; i_alldfas++) { |
| x = alldfas.v[i_alldfas]; |
| for (i_char = 0; i_char < 256; i_char++) { |
| y = NULL; |
| for (i_states = 0; i_states < x->states.n; i_states++) { |
| for (i = 0; i < x->states.v[i_states]->chars[i_char].n; i++) { |
| if (!y) |
| y = new_DFAState(); |
| set_add(&y->states, x->states.v[i_states]->chars[i_char].v[i]); |
| } |
| } |
| if (y) { |
| set_to_vec(&y->states); |
| nfa_closure(y); |
| for (i = 0; i < alldfas.n; i++) |
| if (eq_dfa_state(y, alldfas.v[i])) { |
| free_DFAState(y); |
| y = alldfas.v[i]; |
| goto Lnext; |
| } |
| vec_add(&alldfas, y); |
| Lnext: |
| x->chars[i_char] = y; |
| } |
| } |
| } |
| dfa_to_scanner(&alldfas, scanner); |
| free_VecDFAState(&alldfas); |
| } |
| |
| /* build a NFA for the regular expression */ |
| static void |
| build_regex_nfa(LexState *ls, uint8 **areg, NFAState *pp, NFAState *nn) { |
| uint8 c, pc, *reg = *areg; |
| NFAState *p = pp, *s, *x, *n = nn; |
| int reversed, i; |
| uint8 mark[256]; |
| |
| s = p; |
| while ((c = *reg++)) { |
| switch(c) { |
| case '(': |
| build_regex_nfa(ls, ®, s, (x = new_NFAState(ls))); |
| p = s; |
| s = x; |
| break; |
| case ')': |
| goto Lreturn; |
| case '|': |
| vec_add(&s->epsilon, nn); |
| vec_add(&pp->epsilon, (s = new_NFAState(ls))); |
| break; |
| case '[': |
| if (*reg == '^') { |
| reg++; |
| reversed = 1; |
| } else |
| reversed = 0; |
| memset(mark, 0, sizeof(mark)); |
| pc = UCHAR_MAX; |
| while ((c = *reg++)) { |
| switch(c) { |
| case ']': |
| goto Lsetdone; |
| case '-': |
| c = *reg++; |
| if (!c) |
| goto Lerror; |
| if (c == '\\') |
| c = *reg++; |
| if (!c) |
| goto Lerror; |
| for (;pc <= c; pc++) |
| mark[pc] = 1; |
| break; |
| case '\\': |
| c = *reg++; |
| /* fall through */ |
| default: |
| pc = c; |
| mark[c] = 1; |
| break; |
| } |
| } |
| Lsetdone: |
| x = new_NFAState(ls); |
| for (i = 1; i < 256; i++) |
| if ((!reversed && mark[i]) || (reversed && !mark[i])) |
| vec_add(&s->chars[i], x); |
| p = s; |
| s = x; |
| break; |
| case '?': |
| vec_add(&p->epsilon, s); |
| break; |
| case '*': |
| vec_add(&p->epsilon, s); |
| vec_add(&s->epsilon, p); |
| break; |
| case '+': |
| vec_add(&s->epsilon, p); |
| break; |
| case '\\': |
| c = *reg++; |
| if (!c) |
| goto Lerror; |
| /* fall through */ |
| default: |
| if (!ls->ignore_case || !isalpha(c)) |
| vec_add(&s->chars[c], (x = new_NFAState(ls))); |
| else { |
| vec_add(&s->chars[tolower(c)], (x = new_NFAState(ls))); |
| vec_add(&s->chars[toupper(c)], x); |
| } |
| p = s; |
| s = x; |
| break; |
| } |
| } |
| Lreturn: |
| vec_add(&s->epsilon, n); |
| *areg = reg; |
| return; |
| Lerror: |
| d_fail("bad (part of) regex: %s\n", *areg); |
| } |
| |
| static void |
| action_diff(VecAction *a, VecAction *b, VecAction *c) { |
| int bb = 0, cc = 0; |
| while (1) { |
| if (bb >= b->n) |
| break; |
| Lagainc: |
| if (cc >= c->n) { |
| while (bb < b->n) |
| vec_add(a, b->v[bb++]); |
| break; |
| } |
| Lagainb: |
| if (b->v[bb]->index == c->v[cc]->index) { |
| bb++; |
| cc++; |
| continue; |
| } |
| if (b->v[bb]->index < c->v[cc]->index) { |
| vec_add(a, b->v[bb++]); |
| if (bb >= b->n) |
| break; |
| goto Lagainb; |
| } |
| cc++; |
| goto Lagainc; |
| } |
| } |
| |
| static void |
| action_intersect(VecAction *a, VecAction *b, VecAction *c) { |
| int bb = 0, cc = 0; |
| while (1) { |
| if (bb >= b->n) |
| break; |
| Lagainc: |
| if (cc >= c->n) |
| break; |
| Lagainb: |
| if (b->v[bb]->index == c->v[cc]->index) { |
| vec_add(a, b->v[bb++]); |
| cc++; |
| continue; |
| } |
| if (b->v[bb]->index < c->v[cc]->index) { |
| bb++; |
| if (bb >= b->n) |
| break; |
| goto Lagainb; |
| } |
| cc++; |
| goto Lagainc; |
| } |
| } |
| |
| static void |
| compute_liveness(Scanner *scanner) { |
| int i, j, changed = 1; |
| ScanState *ss, *sss; |
| VecScanState *states = &scanner->states; |
| |
| /* basis */ |
| for (i = 0; i < states->n; i++) { |
| ss = states->v[i]; |
| set_union(&ss->live, &ss->accepts); |
| } |
| while (changed) { |
| changed = 0; |
| for (i = 0; i < states->n; i++) { |
| ss = states->v[i]; |
| for (j = 0; j < 256; j++) { |
| if ((sss = ss->chars[j])) { |
| if (ss != sss) |
| if (set_union(&ss->live, &sss->live)) |
| changed = 1; |
| } |
| } |
| } |
| } |
| for (i = 0; i < states->n; i++) { |
| ss = states->v[i]; |
| set_to_vec(&ss->live); |
| sort_VecAction(&ss->live); |
| } |
| } |
| |
| static uint32 |
| trans_hash_fn(ScanStateTransition *a, hash_fns_t *fns) { |
| uint h = 0; |
| int i; |
| |
| if (!fns->data[0]) |
| for (i = 0; i < a->live_diff.n; i++) |
| h += 3 * a->live_diff.v[i]->index; |
| for (i = 0; i < a->accepts_diff.n; i++) |
| h += 3 * a->accepts_diff.v[i]->index; |
| return h; |
| } |
| |
| static int |
| trans_cmp_fn(ScanStateTransition *a, ScanStateTransition *b, hash_fns_t *fns) { |
| int i; |
| |
| if (!fns->data[0]) |
| if (a->live_diff.n != b->live_diff.n) |
| return 1; |
| if (a->accepts_diff.n != b->accepts_diff.n) |
| return 1; |
| if (!fns->data[0]) |
| for (i = 0; i < a->live_diff.n; i++) |
| if (a->live_diff.v[i] != b->live_diff.v[i]) |
| return 1; |
| for (i = 0; i < a->accepts_diff.n; i++) |
| if (a->accepts_diff.v[i] != b->accepts_diff.v[i]) |
| return 1; |
| return 0; |
| } |
| |
| static hash_fns_t trans_hash_fns = { |
| (hash_fn_t)trans_hash_fn, |
| (cmp_fn_t)trans_cmp_fn, |
| { 0, 0 } |
| }; |
| |
| static void |
| build_transitions(LexState *ls, Scanner *s) { |
| int i, j; |
| ScanState *ss; |
| ScanStateTransition *trans = NULL, *x; |
| VecScanState *states = &s->states; |
| |
| #ifdef LIVE_DIFF_IN_TRANSITIONS |
| trans_hash_fns.data[0] = (void*)0; |
| #else |
| trans_hash_fns.data[0] = (void*)1; |
| #endif |
| for (i = 0; i < states->n; i++) { |
| ss = states->v[i]; |
| for (j = 0; j < 256; j++) { |
| if (!trans) { |
| trans = MALLOC(sizeof(*trans)); |
| memset(trans, 0, sizeof(*trans)); |
| } |
| if (ss->chars[j]) { |
| action_diff(&trans->live_diff, &ss->live, &ss->chars[j]->live); |
| action_intersect(&trans->accepts_diff, &ss->accepts, |
| &trans->live_diff); |
| } |
| if ((x = set_add_fn(&s->transitions, trans, &trans_hash_fns)) == trans) |
| trans = NULL; |
| else { |
| vec_free(&trans->live_diff); |
| vec_free(&trans->accepts_diff); |
| } |
| ss->transition[j] = x; |
| } |
| } |
| j = 0; |
| set_to_vec(&s->transitions); |
| for (i = 0; i < s->transitions.n; i++) |
| s->transitions.v[i]->index = i; |
| ls->transitions += s->transitions.n; |
| } |
| |
| static void |
| compute_transitions(LexState *ls, Scanner *s) { |
| compute_liveness(s); |
| build_transitions(ls, s); |
| } |
| |
| static void |
| build_state_scanner(LexState *ls, State *s) { |
| NFAState *n, *nn, *nnn; |
| Action *a; |
| uint8 *c, *reg; |
| int j, one; |
| |
| one = 0; |
| n = new_NFAState(ls); |
| /* first strings since they can be trivially combined as a tree */ |
| for (j = 0; j < s->shift_actions.n; j++) { |
| a = s->shift_actions.v[j]; |
| if (a->kind == ACTION_ACCEPT) { |
| one = 1; |
| if (!n->chars[0].n) |
| vec_add(&n->chars[0], (nnn = new_NFAState(ls))); |
| else |
| nnn = n->chars[0].v[0]; |
| vec_add(&nnn->accepts, a); |
| } else if (a->kind == ACTION_SHIFT && a->term->kind == TERM_STRING) { |
| one = 1; |
| nn = n; |
| if (!a->term->ignore_case) { |
| for (c = a->term->string; *c; c++) { |
| if (!nn->chars[*c].n) |
| vec_add(&nn->chars[*c], (nnn = new_NFAState(ls))); |
| else |
| nnn = nn->chars[*c].v[0]; |
| nn = nnn; |
| } |
| } else { /* use new states */ |
| for (c = a->term->string; *c; c++) { |
| if (isalpha(*c)) { |
| vec_add(&nn->chars[toupper(*c)], (nnn = new_NFAState(ls))); |
| vec_add(&nn->chars[tolower(*c)], nnn); |
| } else |
| vec_add(&nn->chars[*c], (nnn = new_NFAState(ls))); |
| nn = nnn; |
| } |
| } |
| vec_add(&nn->accepts, a); |
| } |
| } |
| /* now regexes */ |
| for (j = 0; j < s->shift_actions.n; j++) { |
| a = s->shift_actions.v[j]; |
| if (a->kind == ACTION_SHIFT && a->term->kind == TERM_REGEX) { |
| one = 1; |
| reg = a->term->string; |
| vec_add(&n->epsilon, (nnn = new_NFAState(ls))); |
| nn = new_NFAState(ls); |
| ls->ignore_case = a->term->ignore_case; |
| build_regex_nfa(ls, ®, nnn, nn); |
| vec_add(&nn->accepts, a); |
| } |
| } |
| if (one) { |
| nfa_to_scanner(n, &s->scanner); |
| compute_transitions(ls, &s->scanner); |
| } |
| free_VecNFAState(&ls->allnfas); |
| ls->scanners++; |
| } |
| |
| static LexState * |
| new_LexState() { |
| LexState *ls = MALLOC(sizeof(LexState)); |
| memset(ls, 0, sizeof(LexState)); |
| vec_clear(&ls->allnfas); |
| return ls; |
| } |
| |
| void |
| build_scanners(Grammar *g) { |
| int i, j, k; |
| State *s; |
| LexState *ls = new_LexState(); |
| |
| /* detect identical scanners */ |
| for (i = 0; i < g->states.n; i++) { |
| s = g->states.v[i]; |
| if (s->same_shifts) |
| continue; |
| for (j = 0; j < i; j++) { |
| if (g->states.v[j]->same_shifts) |
| continue; |
| if (g->states.v[j]->shift_actions.n != g->states.v[i]->shift_actions.n) |
| continue; |
| for (k = 0; k < g->states.v[j]->shift_actions.n; k++) |
| if (g->states.v[i]->shift_actions.v[k]->term != |
| g->states.v[j]->shift_actions.v[k]->term) |
| break; |
| if (k >= g->states.v[j]->shift_actions.n) { |
| s->same_shifts = g->states.v[j]; |
| break; |
| } |
| } |
| } |
| /* build scanners */ |
| for (i = 0; i < g->states.n; i++) { |
| s = g->states.v[i]; |
| if (s->shift_actions.n) { |
| if (s->same_shifts) |
| s->scanner = s->same_shifts->scanner; |
| else |
| build_state_scanner(ls, s); |
| } |
| } |
| if (verbose_level) |
| printf("%d scanners %d transitions\n", ls->scanners, ls->transitions); |
| FREE(ls); |
| } |
| |