/* FIXME: summary
 * decide whether we enforce valid UTF-8, right now it's enforced in certain
 *     parts of the script, but not the input...
 * nul bytes cause explosions due to use of libc string functions. thoughts?
 * lack of newline at end of file, currently we add one. what should we do?
 * allow "\\t" for "\t" etc. in regex? in replacement text?
 * POSIX says don't flush on N when out of input, but GNU and busybox do.
 */

#include <ctype.h>
#include <errno.h>
#include <regex.h>
#include <stdlib.h>
#include <string.h>

#include "utf.h"
#include "util.h"

/* Types */

/* Growable array of untyped pointers.
 * used as queue for writes and stack for {,:,b,t
 */
typedef struct {
	void **data; /* element storage, grown by push() */
	size_t size; /* number of elements currently stored */
	size_t cap;  /* number of elements data has room for */
} Vec;

/* used for arbitrary growth, str is a C string
 * cap is the allocated size of str in bytes; the string length itself is not
 * stored and is recomputed with strlen() by the stra* helpers
 * FIXME: does it make sense to keep track of length? or just rely on libc
 *        string functions? If we want to support nul bytes everything changes
 */
typedef struct {
	char  *str; /* nul terminated contents, NULL while cap is 0 */
	size_t cap; /* bytes allocated for str */
} String;

typedef struct Cmd Cmd;
/* Per-function dispatch information, one entry per sed function character
 * (see the fns[] table)
 */
typedef struct {
	void  (*fn)(Cmd *);             /* runs the command */
	char *(*getarg)(Cmd *, char *); /* parses the argument while compiling, NULL if the function takes none */
	void  (*freearg)(Cmd *);        /* frees what getarg stored in the Cmd, NULL if nothing to free */
	unsigned char naddr;            /* maximum number of addresses the function accepts */
} Fninfo;

/* A single address; which member of u is valid depends on type */
typedef struct {
	union {
		size_t   lineno; /* valid when type == LINE  */
		regex_t *re;     /* valid when type == REGEX */
	} u;
	enum {
		IGNORE, /* empty address, ignore        */
		EVERY , /* every line                   */
		LINE  , /* line number                  */
		LAST  , /* last line ($)                */
		REGEX , /* use included regex           */
		LASTRE, /* use most recently used regex */
	} type;
} Addr;

/* Address range of a command; naddr is filled in by make_range().
 * DISCUSS: naddr is not strictly necessary, but very helpful
 * naddr == 0 iff beg.type == EVERY  && end.type == IGNORE
 * naddr == 1 iff beg.type != IGNORE && end.type == IGNORE
 * naddr == 2 iff beg.type != IGNORE && end.type != IGNORE
 */
typedef struct {
	Addr          beg;   /* first address */
	Addr          end;   /* second address, type IGNORE when absent */
	unsigned char naddr; /* number of addresses given (0, 1 or 2) */
} Range;

/* Parsed argument of the s command, filled in by get_s_arg() */
typedef struct {
	regex_t      *re; /* if NULL use last regex */
	String        repl; /* replacement text */
	FILE         *file; /* target of the w flag, NULL if no w flag */
	size_t        occurrence; /* 0 for all (g flag) */
	Rune          delim; /* delimiter rune, kept for continuation lines */
	unsigned int  p:1; /* p flag given */
} Sarg;

/* Parsed argument of the y command: two nul terminated Rune arrays of equal
 * length, built by get_y_arg()
 */
typedef struct {
	Rune *set1; /* runes to look for */
	Rune *set2; /* runes to substitute, same index as in set1 */
} Yarg;

/* Argument shared by the a, c, i and r commands */
typedef struct {
	String str; /* a,c,i text. r file path */
	void  (*print)(char *, FILE *); /* check_puts for a, write_file for r, unused for c,i */
} ACIRarg;

/* One compiled command: address range, function info and the function's
 * argument. Which member of u is valid depends on the function.
 */
struct Cmd {
	Range   range;
	Fninfo *fninfo; /* entry of fns[] for this command's function */
	union {
		Cmd      *jump;   /* used for   b,t when running  */
		char     *label;  /* used for :,b,t when building */
		ptrdiff_t offset; /* used for { (pointers break during realloc) */
		FILE     *file;   /* used for w */

		/* FIXME: Should the following be in the union? or pointers and malloc? */
		Sarg      s;
		Yarg      y;
		ACIRarg   acir;
	} u; /* I find your lack of anonymous unions disturbing */
	unsigned int in_match:1; /* inside a two address range, maintained by in_range() */
	unsigned int negate  :1; /* command was prefixed with ! */
};

/* Files for w command (and s' w flag)
 * get_w_arg() keeps one entry per distinct path so a path is opened only once
 */
typedef struct {
	char *path; /* file name as written in the script */
	FILE *file; /* stream opened for writing */
} Wfile;

/*
 * Function Declarations
 *
 * Forward declarations so the definitions below can appear in any order.
 */

/* Dynamically allocated arrays and strings */
static void resize(void **ptr, size_t *nmemb, size_t size, size_t new_nmemb, void **next);
static void *pop(Vec *v);
static void push(Vec *v, void *p);
static void stracat(String *dst, char *src);
static void strnacat(String *dst, char *src, size_t n);
static void stracpy(String *dst, char *src);

/* Cleanup and errors */
static void usage(void);

/* Parsing functions and related utilities */
static void compile(char *s, int isfile);
static int read_line(FILE *f, String *s);
static char *make_range(Range *range, char *s);
static char *make_addr(Addr *addr, char *s);
static char *find_delim(char *s, Rune delim, int do_brackets);
static char *chompr(char *s, Rune rune);
static char *chomp(char *s);
static Rune *strtorunes(char *s, size_t nrunes);
static long stol(char *s, char **endp);
static size_t escapes(char *beg, char *end, Rune delim, int n_newline);
static size_t echarntorune(Rune *r, char *s, size_t n);
static void insert_labels(void);

/* Get and Free arg and related utilities (the getarg/freearg members of Fninfo) */
static char *get_aci_arg(Cmd *c, char *s);
static void aci_append(Cmd *c, char *s);
static void free_acir_arg(Cmd *c);
static char *get_bt_arg(Cmd *c, char *s);
static char *get_r_arg(Cmd *c, char *s);
static char *get_s_arg(Cmd *c, char *s);
static void free_s_arg(Cmd *c);
static char *get_w_arg(Cmd *c, char *s);
static char *get_y_arg(Cmd *c, char *s);
static void free_y_arg(Cmd *c);
static char *get_colon_arg(Cmd *c, char *s);
static char *get_lbrace_arg(Cmd *c, char *s);
static char *get_rbrace_arg(Cmd *c, char *s);
static char *semicolon_arg(char *s);

/* Running */
static void run(void);
static int in_range(Cmd *c);
static int match_addr(Addr *a);
static int next_file(void);
static int is_eof(FILE *f);
static void do_writes(void);
static void write_file(char *path, FILE *out);
static void check_puts(char *s, FILE *f);
static void update_ranges(Cmd *beg, Cmd *end);

/* Sed functions (the fn member of Fninfo) */
static void cmd_y(Cmd *c);
static void cmd_x(Cmd *c);
static void cmd_w(Cmd *c);
static void cmd_t(Cmd *c);
static void cmd_s(Cmd *c);
static void cmd_r(Cmd *c);
static void cmd_q(Cmd *c);
static void cmd_P(Cmd *c);
static void cmd_p(Cmd *c);
static void cmd_N(Cmd *c);
static void cmd_n(Cmd *c);
static void cmd_l(Cmd *c);
static void cmd_i(Cmd *c);
static void cmd_H(Cmd *c);
static void cmd_h(Cmd *c);
static void cmd_G(Cmd *c);
static void cmd_g(Cmd *c);
static void cmd_D(Cmd *c);
static void cmd_d(Cmd *c);
static void cmd_c(Cmd *c);
static void cmd_b(Cmd *c);
static void cmd_a(Cmd *c);
static void cmd_colon(Cmd *c);
static void cmd_equal(Cmd *c);
static void cmd_lbrace(Cmd *c);
static void cmd_rbrace(Cmd *c);
static void cmd_last(Cmd *c);

/* Actions */
static void new_line(void);
static void app_line(void);
static void new_next(void);
static void old_next(void);

/*
 * Globals
 */
/* braces, labels and branches store indexes into prog cast to void *, not
 * pointers, because prog may move when it is resized during compilation */
static Vec braces, labels, branches; /* holds ptrdiff_t. addrs of {, :, bt */
static Vec writes; /* holds cmd*. writes scheduled by a and r commands */
static Vec wfiles; /* holds Wfile*. files for w and s///w commands */

static Cmd   *prog, *pc; /* Program, program counter */
static size_t pcap;      /* number of Cmds prog has room for */
static size_t lineno;    /* script line while compiling, reset to 0 by run() */

static regex_t *lastre; /* last used regex for empty regex search */
static char   **files;  /* list of file names from argv */
static FILE    *file;   /* current file we are reading */

/* pattern space, hold space, and a scratch buffer used for reading lines */
static String patt, hold, genbuf;

static struct {
	unsigned int n       :1; /* -n (no print) */
	unsigned int E       :1; /* -E (extended re) */
	unsigned int s       :1; /* s/// replacement happened */
	unsigned int aci_cont:1; /* a,c,i text continuation */
	unsigned int s_cont  :1; /* s/// replacement text continuation */
	unsigned int halt    :1; /* halt execution */
} gflags;

/* Dispatch table indexed by the function character.
 * Columns: run function, argument parser, argument free function, maximum
 * number of addresses. Characters without an entry are zero initialized, so
 * their fn is NULL, which compile() reports as "bad sed function".
 * FIXME: move character inside Fninfo and only use 26*sizeof(Fninfo) instead of 127*sizeof(Fninfo) bytes */
static Fninfo fns[] = {
	['a'] = { cmd_a     , get_aci_arg   , free_acir_arg , 1 }, /* schedule write of text for later                                                      */
	['b'] = { cmd_b     , get_bt_arg    , NULL          , 2 }, /* branch to label char *label when building, Cmd *jump when running                     */
	['c'] = { cmd_c     , get_aci_arg   , free_acir_arg , 2 }, /* delete pattern space. at 0 or 1 addr or end of 2 addr, write text                     */
	['d'] = { cmd_d     , NULL          , NULL          , 2 }, /* delete pattern space                                                                  */
	['D'] = { cmd_D     , NULL          , NULL          , 2 }, /* delete to first newline and start new cycle without reading (if no newline, d)        */
	['g'] = { cmd_g     , NULL          , NULL          , 2 }, /* replace pattern space with hold space                                                 */
	['G'] = { cmd_G     , NULL          , NULL          , 2 }, /* append newline and hold space to pattern space                                        */
	['h'] = { cmd_h     , NULL          , NULL          , 2 }, /* replace hold space with pattern space                                                 */
	['H'] = { cmd_H     , NULL          , NULL          , 2 }, /* append newline and pattern space to hold space                                        */
	['i'] = { cmd_i     , get_aci_arg   , free_acir_arg , 1 }, /* write text                                                                            */
	['l'] = { cmd_l     , NULL          , NULL          , 2 }, /* write pattern space in 'visually unambiguous form'                                    */
	['n'] = { cmd_n     , NULL          , NULL          , 2 }, /* write pattern space (unless -n) read to replace pattern space (if no input, quit)     */
	['N'] = { cmd_N     , NULL          , NULL          , 2 }, /* append to pattern space separated by newline, line number changes (if no input, quit) */
	['p'] = { cmd_p     , NULL          , NULL          , 2 }, /* write pattern space                                                                   */
	['P'] = { cmd_P     , NULL          , NULL          , 2 }, /* write pattern space up to first newline                                               */
	['q'] = { cmd_q     , NULL          , NULL          , 1 }, /* quit                                                                                  */
	['r'] = { cmd_r     , get_r_arg     , free_acir_arg , 1 }, /* write contents of file (unable to open/read treated as empty file)                    */
	['s'] = { cmd_s     , get_s_arg     , free_s_arg    , 2 }, /* find/replace/all that crazy s stuff                                                   */
	['t'] = { cmd_t     , get_bt_arg    , NULL          , 2 }, /* if s/// succeeded (since input or last t) branch to label (branch to end if no label) */
	['w'] = { cmd_w     , get_w_arg     , NULL          , 2 }, /* append pattern space to file                                                          */
	['x'] = { cmd_x     , NULL          , NULL          , 2 }, /* exchange pattern and hold spaces                                                      */
	['y'] = { cmd_y     , get_y_arg     , free_y_arg    , 2 }, /* replace runes in set1 with runes in set2                                              */
	[':'] = { cmd_colon , get_colon_arg , NULL          , 0 }, /* defines label for later b and t commands                                              */
	['='] = { cmd_equal , NULL          , NULL          , 1 }, /* printf("%d\n", line_number);                                                          */
	['{'] = { cmd_lbrace, get_lbrace_arg, NULL          , 2 }, /* if we match, run commands, otherwise jump to close                                    */
	['}'] = { cmd_rbrace, get_rbrace_arg, NULL          , 0 }, /* noop, hold onto open for ease of building scripts                                     */

	[0x7f] = { NULL, NULL, NULL, 0 }, /* index is checked with isascii(3p). fill out rest of array */
};

/*
 * Function Definitions
 */

/* Reallocate the array at *ptr, which currently holds *nmemb members of size
 * bytes each, so that it holds new_nmemb members. On return *nmemb equals
 * new_nmemb and, when next is not NULL, *next points one past the old end
 * within the new allocation. If the reallocation fails...explode
 */
static void
resize(void **ptr, size_t *nmemb, size_t size, size_t new_nmemb, void **next)
{
	size_t old_bytes = *nmemb * size;
	void  *fresh = NULL;

	if (new_nmemb)
		fresh = ereallocarray(*ptr, new_nmemb, size);
	else /* turns out realloc(*ptr, 0) != free(*ptr) */
		free(*ptr);

	*ptr   = fresh;
	*nmemb = new_nmemb;
	if (next)
		*next = (char *)fresh + old_bytes;
}

/* remove and return the most recently pushed element, NULL if v is empty */
static void *
pop(Vec *v)
{
	void *top = NULL;

	if (v->size) {
		v->size--;
		top = v->data[v->size];
	}
	return top;
}

/* append p to v, growing the storage to 2 * cap + 1 elements when it is full */
static void
push(Vec *v, void *p)
{
	size_t grown;

	if (v->size == v->cap) {
		grown = v->cap * 2 + 1;
		resize((void **)&v->data, &v->cap, sizeof(*v->data), grown, NULL);
	}
	v->data[v->size] = p;
	v->size++;
}

/* append src to dst, growing dst as needed
 * a String with cap 0 is treated as empty and gets allocated here
 */
static void
stracat(String *dst, char *src)
{
	int    fresh = (dst->cap == 0);
	size_t need  = strlen(src) + 1;

	if (!fresh)
		need += strlen(dst->str);
	if (need > dst->cap)
		resize((void **)&dst->str, &dst->cap, 1, need * 2, NULL);
	if (fresh)
		dst->str[0] = '\0';
	strcat(dst->str, src);
}

/* append at most n bytes of src to dst, growing dst as needed
 * a String with cap 0 is treated as empty and gets allocated here
 */
static void
strnacat(String *dst, char *src, size_t n)
{
	int    fresh = (dst->cap == 0);
	size_t take  = MIN(n, strlen(src));
	size_t need  = take + 1;

	if (!fresh)
		need += strlen(dst->str);
	if (need > dst->cap)
		resize((void **)&dst->str, &dst->cap, 1, need * 2, NULL);
	if (fresh)
		dst->str[0] = '\0';
	/* need is the total size limit, which caps the copy at take bytes */
	strlcat(dst->str, src, need);
}

/* replace the contents of dst with a copy of src, growing dst as needed */
static void
stracpy(String *dst, char *src)
{
	size_t need = strlen(src) + 1; /* including the nul byte */

	if (need > dst->cap)
		resize((void **)&dst->str, &dst->cap, 1, need * 2, NULL);
	memcpy(dst->str, src, need);
}

/* report error s through eprintf prefixed with the current line number,
 * followed by the errno description when errno is set
 */
static void
leprintf(char *s)
{
	if (!errno)
		eprintf("%zu: %s\n", lineno, s);
	else
		eprintf("%zu: %s: %s\n", lineno, s, strerror(errno));
}

/* report incorrect invocation through eprintf
 * FIXME: write usage message
 */
static void
usage(void)
{
	eprintf("USAGE\n");
}

/* Compile a script into prog, appending Cmds at pc.
 * s is a file name when isfile is set, otherwise the script text itself.
 * The script is read line by line into genbuf; every line may hold several
 * commands separated by semicolons.
 *
 * Differences from POSIX
 * we allow semicolons and trailing blanks inside {}
 * we allow spaces after ! (and in between !s)
 * we allow extended regular expressions (-E)
 */
static void
compile(char *s, int isfile)
{
	FILE *f;

	if (!isfile && !*s) /* empty string script */
		return;

	f = isfile ? fopen(s, "r") : fmemopen(s, strlen(s), "r");
	if (!f)
		eprintf("fopen/fmemopen:");

	/* NOTE: get arg functions can't use genbuf */
	while (read_line(f, &genbuf) != EOF) {
		s = genbuf.str;

		/* if the first two characters of the script are "#n" default output shall be suppressed */
		if (++lineno == 1 && *s == '#' && s[1] == 'n') {
			gflags.n = 1;
			continue;
		}

		/* the previous command's a,c,i text continues: the whole line is text */
		if (gflags.aci_cont) {
			aci_append(pc - 1, s);
			continue;
		}
		/* the previous s command's replacement continues on this line, more
		 * commands may follow it */
		if (gflags.s_cont)
			s = (pc - 1)->fninfo->getarg(pc - 1, s);

		while (*s) {
			/* skip whitespace and empty commands, stop at comments */
			s = chompr(s, ';');
			if (!*s || *s == '#')
				break;

			/* grow the program; resize() also moves pc into the new array */
			if ((size_t)(pc - prog) == pcap)
				resize((void **)&prog, &pcap, sizeof(*prog), pcap * 2 + 1, (void **)&pc);

			pc->range.beg.type = pc->range.end.type = IGNORE;
			pc->fninfo = NULL;
			pc->in_match = 0;

			s = make_range(&pc->range, s);
			s = chomp(s);
			pc->negate = *s == '!';
			s = chompr(s, '!');

			/* fns[] only covers ASCII, unused slots have a NULL fn */
			if (!isascii(*s) || !(pc->fninfo = &fns[(unsigned)*s])->fn)
				leprintf("bad sed function");
			if (pc->range.naddr > pc->fninfo->naddr)
				leprintf("wrong number of addresses");
			s++;

			if (pc->fninfo->getarg)
				s = pc->fninfo->getarg(pc, s);

			pc++;
		}
	}

	fshut(f, s);
}

/* Read the next line of f into s without its trailing newline.
 * Returns 0 on success and EOF when f is NULL or no more input is available;
 * a read error is reported through eprintf.
 * FIXME: if we decide to honor lack of trailing newline, set/clear a global
 * flag when reading a line
 */
static int
read_line(FILE *f, String *s)
{
	ssize_t last;

	if (!f)
		return EOF;

	last = getline(&s->str, &s->cap, f);
	if (last < 0) {
		if (ferror(f))
			eprintf("getline:");
		return EOF;
	}

	/* index of the final byte read; drop it if it is the newline */
	last--;
	if (s->str[last] == '\n')
		s->str[last] = '\0';
	return 0;
}

/* parse the address range at the start of s into range (including naddr)
 * return pointer to one past the end of the range
 */
static char *
make_range(Range *range, char *s)
{
	int has_beg, has_end;

	s = make_addr(&range->beg, s);

	if (*s != ',')
		range->end.type = IGNORE;
	else
		s = make_addr(&range->end, s + 1);

	has_beg = (range->beg.type != IGNORE);
	has_end = (range->end.type != IGNORE);

	if (!has_end && range->beg.type == EVERY)
		range->naddr = 0;
	else if (has_beg && !has_end)
		range->naddr = 1;
	else if (has_beg && has_end)
		range->naddr = 2;
	else
		leprintf("this is impossible...");

	return s;
}

/* read first addr from s, return pointer to one past end of addr
 *
 * recognized forms:
 *   $         last line
 *   digits    line number
 *   /re/      regex, \cREc uses c as the delimiter instead of /
 *   //        (empty regex) most recently used regex
 * anything else leaves s untouched and yields an address of type EVERY
 *
 * a regex is terminated in place (its closing delimiter is overwritten with a
 * nul byte) and compiled, so s is modified
 */
static char *
make_addr(Addr *addr, char *s)
{
	Rune r;
	char *p = s + strlen(s);
	size_t rlen = echarntorune(&r, s, p - s);

	if (r == '$') {
		addr->type = LAST;
		s += rlen;
	} else if (isdigitrune(r)) {
		addr->type = LINE;
		addr->u.lineno = stol(s, &s);
	} else if (r == '/' || r == '\\') {
		Rune delim;
		/* a leading backslash introduces a custom delimiter */
		if (r == '\\') {
			s += rlen;
			rlen = echarntorune(&r, s, p - s);
		}
		if (r == '\\')
			leprintf("bad delimiter '\\'");
		delim = r;
		s += rlen;
		rlen = echarntorune(&r, s, p - s);
		if (r == delim) { /* empty regex */
			addr->type = LASTRE;
			s += rlen;
		} else {
			addr->type = REGEX;
			p = find_delim(s, delim, 1);
			if (!*p)
				leprintf("unclosed regex");
			/* escapes() shortens the regex in place, p follows the delimiter */
			p -= escapes(s, p, delim, 0);
			*p++ = '\0';
			addr->u.re = emalloc(sizeof(*addr->u.re));
			eregcomp(addr->u.re, s, gflags.E ? REG_EXTENDED : 0);
			s = p;
		}
	} else {
		addr->type = EVERY;
	}

	return s;
}

/* return pointer to first delim in s that is not escaped
 * and if do_brackets is set, not in [] (note possible [::], [..], [==], inside [])
 * return pointer to trailing nul byte if no delim found
 *
 * any escaped character that is not special is just itself (POSIX undefined)
 * FIXME: pull out into some util thing, will be useful for ed as well
 */
static char *
find_delim(char *s, Rune delim, int do_brackets)
{
	enum {
		OUTSIDE         , /* not in brackets */
		BRACKETS_OPENING, /* last char was first [ or last two were first [^ */
		BRACKETS_INSIDE , /* inside [] */
		INSIDE_OPENING  , /* inside [] and last char was [ */
		CLASS_INSIDE    , /* inside class [::], or collating element [..] or [==], inside [] */
		CLASS_CLOSING   , /* inside class [::], or collating element [..] or [==], and last character was the respective : . or = */
	} state = OUTSIDE;

	Rune r, c = 0; /* no c won't be used uninitialized, shutup -Wall */
	size_t rlen;
	int escape = 0; /* previous rune was an unescaped backslash (only tracked OUTSIDE) */
	char *end = s + strlen(s);

	for (; *s; s += rlen) {
		rlen = echarntorune(&r, s, end - s);

		/* right after the opening [ : a ^ keeps us in the opening state and a
		 * ] is a literal member; any other rune falls through to the chain
		 * below as a regular member of the bracket expression */
		if      (state == BRACKETS_OPENING       &&  r == '^'  ) {                            continue; }
		else if (state == BRACKETS_OPENING       &&  r == ']'  ) { state  = BRACKETS_INSIDE ; continue; }
		else if (state == BRACKETS_OPENING                     ) { state  = BRACKETS_INSIDE ;           }

		/* c remembers which of : . = opened the class so only the matching
		 * rune followed by ] closes it */
		if      (state == CLASS_CLOSING          &&  r == ']'  ) { state  = BRACKETS_INSIDE ;           }
		else if (state == CLASS_CLOSING                        ) { state  = CLASS_INSIDE    ;           }
		else if (state == CLASS_INSIDE           &&  r ==  c   ) { state  = CLASS_CLOSING   ;           }
		else if (state == INSIDE_OPENING         && (r == ':'  ||
		                                             r == '.'  ||
		                                             r == '=') ) { state  = CLASS_INSIDE    ; c = r;    }
		else if (state == INSIDE_OPENING         &&  r == ']'  ) { state  = OUTSIDE         ;           }
		else if (state == BRACKETS_INSIDE        &&  r == '['  ) { state  = INSIDE_OPENING  ;           }
		else if (state == BRACKETS_INSIDE        &&  r == ']'  ) { state  = OUTSIDE         ;           }
		else if (state == OUTSIDE                &&  escape    ) { escape = 0               ;           }
		else if (state == OUTSIDE                &&  r == '\\' ) { escape = 1               ;           }
		else if (state == OUTSIDE && do_brackets &&  r == '['  ) { state  = BRACKETS_OPENING;           }
		else if (state == OUTSIDE                &&  r == delim) return s;
	}
	return s;
}

/* eat all leading whitespace, return pointer to the first other character */
static char *
chomp(char *s)
{
	/* rune 0 can never match: chompr() stops at the nul byte first */
	return chompr(s, 0);
}

/* skip leading whitespace and occurrences of rune
 * return pointer to the first character that is neither (or the nul byte)
 */
static char *
chompr(char *s, Rune rune)
{
	char  *stop = s + strlen(s);
	Rune   cur;
	size_t width;

	for (;;) {
		if (!*s)
			break;
		width = echarntorune(&cur, s, stop - s);
		if (!width)
			break;
		if (!isspacerune(cur) && cur != rune)
			break;
		s += width;
	}
	return s;
}

/* decode the first nrunes runes of UTF-8 string s into a freshly allocated,
 * zero terminated Rune array
 * NOTE: sequence must be valid UTF-8, check first */
static Rune *
strtorunes(char *s, size_t nrunes)
{
	Rune  *runes = ereallocarray(NULL, nrunes + 1, sizeof(*runes));
	size_t i;

	for (i = 0; i < nrunes; i++)
		s += chartorune(&runes[i], s);

	runes[nrunes] = '\0';
	return runes;
}

/* strtol(3) in base 10 with error reporting: a conversion error or a string
 * that does not start with a number is reported through leprintf
 * *endp is set to the first character after the number
 */
static long
stol(char *s, char **endp)
{
	long val;

	errno = 0;
	val = strtol(s, endp, 10);

	if (errno != 0)
		leprintf("strtol:");
	if (s == *endp)
		leprintf("strtol: invalid number");

	return val;
}

/* from beg to end replace "\\d" with "d" and "\\n" with "\n" (where d is delim)
 * if delim is 'n' and n_newline is 0 then "\\n" is replaced with "n" (normal)
 * if delim is 'n' and n_newline is 1 then "\\n" is replaced with "\n" (y command)
 * if delim is 0 all escaped characters represent themselves (aci text)
 * memmove rest of string (beyond end) into place
 * return the number of converted escapes (backslashes removed)
 * note: with delim 0 the removed backslashes are not counted, the return
 * value is then always 0
 * FIXME: this has had too many corner cases slapped on and is ugly. rewrite better
 */
static size_t
escapes(char *beg, char *end, Rune delim, int n_newline)
{
	size_t num = 0;
	char *src = beg, *dst = beg; /* read and write positions, dst <= src */

	while (src < end) {
		/* handle escaped backslash specially so we don't think the second
		 * backslash is escaping something */
		if (*src == '\\' && src[1] == '\\') {
			*dst++ = *src++;
			/* keep both backslashes unless this is aci text (delim 0) */
			if (delim)
				*dst++ = *src++;
			else
				src++;
		} else if (*src == '\\' && !delim) {
			/* aci text: drop the backslash, the next byte is copied as is */
			src++;
		} else if (*src == '\\' && src[1]) {
			Rune r;
			size_t rlen;
			num++;
			src++;
			rlen = echarntorune(&r, src, end - src);

			if (r == 'n' && delim == 'n') {
				*src = n_newline ? '\n' : 'n'; /* src so we can still memmove() */
			} else if (r == 'n') {
				*src = '\n';
			} else if (r != delim) {
				/* not an escape we convert: keep the backslash, undo the count */
				*dst++ = '\\';
				num--;
			}

			memmove(dst, src, rlen);
			dst += rlen;
			src += rlen;
		} else {
			*dst++ = *src++;
		}
	}
	/* close the gap left by removed backslashes, including the nul byte */
	memmove(dst, src, strlen(src) + 1);
	return num;
}

/* charntorune() with error reporting: a zero length or a Runeerror result is
 * reported as invalid UTF-8 through leprintf
 */
static size_t
echarntorune(Rune *r, char *s, size_t n)
{
	size_t width = charntorune(r, s, n);

	if (width == 0 || *r == Runeerror)
		leprintf("invalid UTF-8");
	return width;
}

static void
insert_labels(void)
{
	size_t i;
	Cmd *from, *to;

	while (branches.size) {
		from = prog + (ptrdiff_t)pop(&branches);

		if (!from->u.label) {/* no label branch to end of script */
			from->u.jump = pc - 1;
		} else {
			for (i = 0; i < labels.size; i++) {
				to = prog + (ptrdiff_t)labels.data[i];
				if (!strcmp(from->u.label, to->u.label)) {
					from->u.jump = to;
					break;
				}
			}
			if (i == labels.size)
				leprintf("bad label");
		}
	}
}

/*
 * Getargs / Freeargs
 * Read argument from s, return pointer to one past last character of argument
 */

/* Parse the text argument of a, c and i. The rest of the line belongs to the
 * command, so the returned pointer is always the end of s.
 *
 * POSIX compliant
 * i\
 * foobar
 *
 * also allow the following non POSIX compliant
 * i        # empty line
 * ifoobar
 * ifoobar\
 * baz
 *
 * FIXME: GNU and busybox discard leading spaces
 * i  foobar
 * i foobar
 * ifoobar
 * are equivalent in GNU and busybox. We don't. Should we?
 */
static char *
get_aci_arg(Cmd *c, char *s)
{
	/* a lone backslash: the text starts on the next line */
	int posix_form = (s[0] == '\\' && s[1] == '\0');

	c->u.acir.print   = check_puts;
	c->u.acir.str.str = NULL;
	c->u.acir.str.cap = 0;

	/* no continue flag if empty string */
	gflags.aci_cont = (*s != '\0');

	/* text on the command line itself; aci_append() updates aci_cont */
	if (*s && !posix_form)
		aci_append(c, s);

	return s + strlen(s);
}

/* Append one script line s to the text of the a, c or i command c.
 * An odd number of trailing backslashes means the text continues on the next
 * line: gflags.aci_cont is set and the final backslash becomes a newline.
 * Remaining escapes are resolved in place, so s is modified.
 */
static void
aci_append(Cmd *c, char *s)
{
	char *end = s + strlen(s), *p;

	/* count trailing backslashes by toggling; p never moves before s, so an
	 * empty line is handled without forming an out of bounds pointer */
	gflags.aci_cont = 0;
	for (p = end; p > s && p[-1] == '\\'; p--)
		gflags.aci_cont = !gflags.aci_cont;

	if (gflags.aci_cont)
		*--end = '\n';

	escapes(s, end, 0, 0);
	stracat(&c->u.acir.str, s);
}

/* free the text (a,c,i) or file path (r) stored by get_aci_arg/get_r_arg */
static void
free_acir_arg(Cmd *c)
{
	free(c->u.acir.str.str);
}

/* Parse the optional label of b and t and queue the command on branches so
 * insert_labels() can resolve it later. A missing label is stored as NULL.
 *
 * POSIX dictates that label is rest of line, including semicolons, trailing
 * whitespace, closing braces, etc. and can be limited to 8 bytes
 *
 * I allow a semicolon or closing brace to terminate a label name, it's not
 * POSIX compliant, but it's useful and every sed version I've tried to date
 * does the same.
 *
 * FIXME: POSIX dictates that leading whitespace is ignored but trailing
 * whitespace is not. This is annoying and we should probably get rid of it.
 */
static char *
get_bt_arg(Cmd *c, char *s)
{
	char *name = chomp(s);
	char *stop = semicolon_arg(name);

	c->u.label = (stop != name) ? estrndup(name, stop - name) : NULL;

	/* store the index, prog may still move while compiling */
	push(&branches, (void *)(c - prog));

	return stop;
}

/* Parse the file name of the r command into c->u.acir and select write_file
 * as its print function. A missing file name is an error.
 *
 * POSIX dictates file name is rest of line including semicolons, trailing
 * whitespace, closing braces, etc. and file name must be preceded by a space
 *
 * I allow a semicolon or closing brace to terminate a file name and don't
 * enforce leading space.
 *
 * FIXME: decide whether trailing whitespace should be included and fix
 * accordingly
 */
static char *
get_r_arg(Cmd *c, char *s)
{
	char *p = semicolon_arg(s = chomp(s));

	if (p == s)
		leprintf("no file name");

	c->u.acir.str.str = estrndup(s, p - s);
	/* Cmd memory is not zeroed; record the size so the String is fully
	 * initialized (assumes estrndup allocates the name plus nul byte) */
	c->u.acir.str.cap = (size_t)(p - s) + 1;
	c->u.acir.print = write_file;

	return p;
}

/* Parse the argument of the s command: s/Find/Replace/Flags
 *
 * The replacement text may span several script lines (backslash newline).
 * In that case gflags.s_cont is set and compile() calls this function again
 * with the next line, which resumes at the Replace step using the delimiter
 * saved in c->u.s.delim. s is modified in place.
 * Returns a pointer to the end of the argument (semicolon, closing brace or
 * nul byte), or to the end of the line when the replacement continues.
 *
 * we allow "\\n" in replacement text to mean "\n" (undefined in POSIX)
 *
 * FIXME: allow other escapes in regex and replacement? if so change escapes()
 */
static char *
get_s_arg(Cmd *c, char *s)
{
	Rune delim, r;
	Cmd buf;
	char *p;
	int esc, lastre;

	/* s/Find/Replace/Flags */

	/* Find */
	if (!gflags.s_cont) { /* NOT continuing from literal newline in replacement text */
		lastre = 0;
		c->u.s.repl = (String){ NULL, 0 };
		c->u.s.occurrence = 1;
		c->u.s.file = NULL;
		c->u.s.p = 0;

		if (!*s || *s == '\\')
			leprintf("bad delimiter");

		p = s + strlen(s);
		s += echarntorune(&delim, s, p - s);
		c->u.s.delim = delim;

		echarntorune(&r, s, p - s);
		if (r == delim) /* empty regex */
			lastre = 1;

		p = find_delim(s, delim, 1);
		if (!*p)
			leprintf("missing second delimiter");
		p -= escapes(s, p, delim, 0);
		*p = '\0';

		if (lastre) {
			c->u.s.re = NULL;
		} else {
			c->u.s.re = emalloc(sizeof(*c->u.s.re));
			/* FIXME: different eregcomp that calls fatal */
			eregcomp(c->u.s.re, s, gflags.E ? REG_EXTENDED : 0);
		}
		s = p + runelen(delim);
	}

	/* Replace */
	delim = c->u.s.delim;

	p = find_delim(s, delim, 0);
	p -= escapes(s, p, delim, 0);
	if (!*p) { /* no third delimiter */
		/* FIXME: same backslash counting as aci_append() */
		/* p == s: nothing on this line, so there is no backslash to look at
		 * (and on a continuation line p[-1] would be outside the buffer) */
		if (p == s || p[-1] != '\\')
			leprintf("missing third delimiter or <backslash><newline>");
		p[-1] = '\n';
		gflags.s_cont = 1;
	} else {
		gflags.s_cont = 0;
	}

	/* check for bad references in replacement text */
	*p = '\0';
	for (esc = 0, p = s; *p; p++) {
		if (esc) {
			esc = 0;
			if (isdigit((unsigned char)*p) && c->u.s.re && (size_t)(*p - '0') > c->u.s.re->re_nsub)
				leprintf("back reference number greater than number of groups");
		} else if (*p == '\\') {
			esc = 1;
		}
	}
	stracat(&c->u.s.repl, s);

	if (gflags.s_cont)
		return p;

	s = p + runelen(delim);

	/* Flags */
	p = semicolon_arg(s = chomp(s));

	/* FIXME: currently for simplicity take last of g or occurrence flags and
	 *        ignore multiple p flags. need to fix that */
	for (; s < p; s++) {
		if (isdigit((unsigned char)*s)) {
			c->u.s.occurrence = stol(s, &s);
			/* stol left s on the first character after the number; step
			 * back so the loop increment does not skip that flag */
			s--;
		} else {
			switch (*s) {
			case 'g': c->u.s.occurrence = 0; break;
			case 'p': c->u.s.p = 1;          break;
			case 'w':
				/* must be last flag, take everything up to newline/semicolon
				 * s == p after this
				 * the file name starts after the w itself */
				s = get_w_arg(&buf, s + 1);
				c->u.s.file = buf.u.file;
				break;
			}
		}
	}
	return p;
}

/* release the compiled regex (if the command has its own) and the
 * replacement text of an s command
 */
static void
free_s_arg(Cmd *c)
{
	regex_t *re = c->u.s.re;

	if (re != NULL)
		regfree(re);
	free(re);
	free(c->u.s.repl.str);
}

/* Parse the file name of the w command (and of the w flag of s) and store the
 * stream in c->u.file. A path that was already used shares the stream recorded
 * in wfiles; a new path is opened for writing and remembered there.
 * see get_r_arg notes
 */
static char *
get_w_arg(Cmd *c, char *s)
{
	char  *name = chomp(s);
	char  *stop = semicolon_arg(name);
	size_t len  = stop - name;
	size_t i;
	Wfile *w;

	if (stop == name)
		leprintf("no file name");

	/* reuse the stream if this path was seen before */
	for (i = 0; i < wfiles.size; i++) {
		w = wfiles.data[i];
		if (strlen(w->path) == len && !strncmp(name, w->path, len)) {
			c->u.file = w->file;
			return stop;
		}
	}

	w = emalloc(sizeof(*w));
	w->path = estrndup(name, len);
	w->file = fopen(w->path, "w");
	if (!w->file)
		leprintf("fopen failed");

	c->u.file = w->file;
	push(&wfiles, w);

	return stop;
}

/* Parse the argument of the y command: y/set1/set2/
 * Both sets are converted to Rune arrays stored in c->u.y and must contain
 * the same number of runes. "\\n" stands for a newline even when the
 * delimiter is n. s is modified in place.
 * A missing second or third delimiter is reported as an error instead of
 * scanning past the end of the line.
 * Returns a pointer to one past the third delimiter.
 */
static char *
get_y_arg(Cmd *c, char *s)
{
	Rune delim;
	char *p = s + strlen(s);
	size_t rlen = echarntorune(&delim, s, p - s);
	size_t nrunes1, nrunes2;

	c->u.y.set1 = c->u.y.set2 = NULL;

	/* set1 */
	s += rlen;
	p = find_delim(s, delim, 0);
	if (!*p)
		leprintf("missing second delimiter");
	p -= escapes(s, p, delim, 1);
	nrunes1 = utfnlen(s, p - s);
	c->u.y.set1 = strtorunes(s, nrunes1);

	/* set2 */
	s = p + rlen;
	p = find_delim(s, delim, 0);
	if (!*p)
		leprintf("missing third delimiter");
	p -= escapes(s, p, delim, 1);
	nrunes2 = utfnlen(s, p - s);

	if (nrunes1 != nrunes2)
		leprintf("different set lengths");

	c->u.y.set2 = strtorunes(s, nrunes2);

	return p + rlen;
}

/* free both Rune arrays allocated by get_y_arg */
static void
free_y_arg(Cmd *c)
{
	free(c->u.y.set1);
	free(c->u.y.set2);
}

/* Parse the label name of the : command and record the command on labels so
 * insert_labels() can find it. A missing name is an error.
 * see get_bt_arg notes
 */
static char *
get_colon_arg(Cmd *c, char *s)
{
	char *name = chomp(s);
	char *stop = semicolon_arg(name);

	if (stop == name)
		leprintf("no label name");

	c->u.label = estrndup(name, stop - name);
	/* store the index, prog may still move while compiling */
	push(&labels, (void *)(c - prog));

	return stop;
}

/* { takes no argument: remember the command's index on the braces stack so
 * the matching } can find it (see get_rbrace_arg), return s unchanged
 */
static char *
get_lbrace_arg(Cmd *c, char *s)
{
	push(&braces, (void *)(c - prog));
	return s;
}

/* } takes no argument: pop the matching { off the braces stack and store the
 * index of this command in it. A } without an open { is an error.
 */
static char *
get_rbrace_arg(Cmd *c, char *s)
{
	ptrdiff_t open;

	if (braces.size == 0)
		leprintf("extra }");

	open = (ptrdiff_t)pop(&braces);
	prog[open].u.offset = c - prog;

	return s;
}

/* s points to the beginning of an argument that may be terminated by a
 * semicolon, or by a closing brace so that no ; is forced before }
 * return pointer to that terminator, or to the nul byte if there is none
 * FIXME: decide whether or not to eat trailing whitespace for arguments that
 *        we allow semicolon/brace termination that POSIX doesn't
 *        b, r, t, w, :
 *        POSIX says trailing whitespace is part of label name, file name, etc.
 *        we should probably eat it
 */
static char *
semicolon_arg(char *s)
{
	/* length of the leading run that contains neither ; nor } */
	return s + strcspn(s, ";}");
}

/* Execute the compiled program: check that all braces were closed, resolve
 * labels, open the first input, read the first line, then run commands
 * starting at prog until some command sets gflags.halt.
 */
static void
run(void)
{
	lineno = 0;
	if (braces.size != 0)
		leprintf("extra {");

	/* genbuf has already been initialized, patt will be in new_line
	 * (or we'll halt) */
	stracpy(&hold, "");

	insert_labels();
	next_file();
	new_line();

	pc = prog;
	while (!gflags.halt) {
		pc->fninfo->fn(pc); /* may change pc (branches, new cycle) */
		pc++;
	}
}

/* return true if command c applies to the current line (taking ! into
 * account), and keep c->in_match up to date for two address ranges
 */
static int
in_range(Cmd *c)
{
	Range *rg = &c->range;

	if (match_addr(&rg->beg)) {
		/* a range whose end is a line number we already reached covers
		 * only this line, otherwise the range is now open */
		if (rg->naddr == 2)
			c->in_match = !(rg->end.type == LINE && rg->end.u.lineno <= lineno);
		return !c->negate;
	}

	/* not inside an open range: only negated commands apply */
	if (!c->in_match)
		return c->negate;

	/* inside an open range: the line that matches the end closes it but
	 * still belongs to the range */
	if (match_addr(&rg->end))
		c->in_match = 0;
	return !c->negate;
}

/* return true if addr matches the current line
 * a REGEX address also becomes the most recently used regex
 */
static int
match_addr(Addr *a)
{
	switch (a->type) {
	case EVERY:
		return 1;
	case LINE:
		return a->u.lineno == lineno;
	case LAST:
		/* last line iff no input is left in this or any following file */
		for (;;) {
			if (!is_eof(file))
				break;
			if (next_file())
				break;
		}
		return file == NULL;
	case REGEX:
		lastre = a->u.re;
		return regexec(lastre, patt.str, 0, NULL, 0) == 0;
	case LASTRE:
		if (lastre == NULL)
			leprintf("no previous regex");
		return regexec(lastre, patt.str, 0, NULL, 0) == 0;
	case IGNORE:
	default:
		return 0;
	}
}

/* Close the current input and advance to the next file named in files.
 * "-" selects stdin; stdin is also used when this is the first call and no
 * file names were given. Files that cannot be opened produce a warning and
 * are skipped.
 * return 0 for success and 1 for no more files (file is then NULL)
 */
static int
next_file(void)
{
	static unsigned char first = 1;
	char *name;

	/* stdin is never closed, only reset */
	if (file == stdin)
		clearerr(file);
	else if (file != NULL)
		fshut(file, "<file>");
	file = NULL;

	for (;;) {
		if (*files == NULL) {
			/* given no files, default to stdin; otherwise we've used
			 * all our files, leave file = NULL */
			if (first)
				file = stdin;
		} else {
			name = *files++;
			if (strcmp(name, "-") == 0) {
				file = stdin;
			} else {
				file = fopen(name, "r");
				/* warn this file didn't open, but move on to next */
				if (file == NULL)
					weprintf("fopen:");
			}
		}
		if (file != NULL || *files == NULL)
			break;
	}
	first = 0;

	return file == NULL;
}

/* return true if stream f has no more input (a NULL stream counts as EOF)
 * peeks one character and pushes it back; read and pushback failures are
 * reported through eprintf
 */
static int
is_eof(FILE *f)
{
	int ch;

	if (f == NULL || feof(f))
		return 1;

	ch = fgetc(f);
	if (ch == EOF) {
		if (ferror(f))
			eprintf("fgetc:");
		return 1;
	}

	if (ungetc(ch, f) == EOF)
		eprintf("ungetc EOF\n");
	return 0;
}

/* flush the queue of scheduled writes to stdout in the order they were
 * queued, then empty the queue
 * for a this is check_puts(string, stdout)
 * for r this is write_file(path, stdout)
 */
static void
do_writes(void)
{
	size_t i = 0;

	while (i < writes.size) {
		Cmd *queued = writes.data[i++];

		queued->u.acir.print(queued->u.acir.str.str, stdout);
	}
	writes.size = 0;
}

/* copy the file at path to out line by line, using genbuf as the line buffer
 * used for r's u.acir.print()
 * FIXME: something like util's concat() would be better
 */
static void
write_file(char *path, FILE *out)
{
	FILE *in;

	/* no file is treated as empty file */
	if (!(in = fopen(path, "r")))
		return;

	for (;;) {
		if (read_line(in, &genbuf) == EOF)
			break;
		check_puts(genbuf.str, out);
	}

	fshut(in, path);
}

/* write s (if not NULL) followed by a newline to f
 * write failures are reported through eprintf
 */
static void
check_puts(char *s, FILE *f)
{
	if (s != NULL) {
		if (fputs(s, f) == EOF)
			eprintf("fputs:");
	}
	if (fputs("\n", f) == EOF)
		eprintf("fputs:");
}

/* evaluate in_range() for every command in [beg, end) so range state stays
 * correct for commands that are skipped over
 * e.g. sed -n '1d;1,3p' should still print lines 2 and 3
 */
static void
update_ranges(Cmd *beg, Cmd *end)
{
	Cmd *cur;

	for (cur = beg; cur < end; cur++)
		in_range(cur);
}

/*
 * Sed functions
 */
/* a: schedule the text to be written later by do_writes() */
static void
cmd_a(Cmd *c)
{
	if (!in_range(c))
		return;

	push(&writes, c);
}

/* b: branch to the command resolved by insert_labels() */
static void
cmd_b(Cmd *c)
{
	Cmd *target, *stop;

	if (!in_range(c))
		return;

	target = c->u.jump;
	/* if we jump backwards update to end, otherwise update to destination */
	stop = (target > c) ? target : prog + pcap;
	update_ranges(c + 1, stop);
	pc = target;
}

/* c: drop the pattern space and start the next cycle; the replacement text
 * is written only once c->in_match is clear, i.e. on the last line of the
 * match
 */
static void
cmd_c(Cmd *c)
{
	if (in_range(c)) {
		if (!c->in_match)
			check_puts(c->u.acir.str.str, stdout);

		/* next cycle starts without printing the pattern space, which
		 * effectively deletes the text */
		new_next();
	}
}

/* d: when selected, discard the pattern space and start a new cycle */
static void
cmd_d(Cmd *c)
{
	if (in_range(c))
		new_next();
}

/* D: remove the pattern space up to and including its first newline and
 * restart the cycle with what is left; with no newline behave like d
 */
static void
cmd_D(Cmd *c)
{
	char *nl, *rest;

	if (!in_range(c))
		return;

	nl = strchr(patt.str, '\n');
	if (!nl) {
		new_next();
		return;
	}

	/* slide the remainder, terminator included, to the front */
	rest = nl + 1;
	memmove(patt.str, rest, strlen(rest) + 1);
	old_next();
}

/* g: when selected, stracpy() the hold space into the pattern space */
static void
cmd_g(Cmd *c)
{
	if (!in_range(c))
		return;

	stracpy(&patt, hold.str);
}

/* G: when selected, append a newline and then the hold space to the
 * pattern space
 */
static void
cmd_G(Cmd *c)
{
	if (in_range(c)) {
		stracat(&patt, "\n");
		stracat(&patt, hold.str);
	}
}

/* h: when selected, stracpy() the pattern space into the hold space */
static void
cmd_h(Cmd *c)
{
	if (!in_range(c))
		return;

	stracpy(&hold, patt.str);
}

/* H: when selected, append a newline and then the pattern space to the
 * hold space
 */
static void
cmd_H(Cmd *c)
{
	if (in_range(c)) {
		stracat(&hold, "\n");
		stracat(&hold, patt.str);
	}
}

/* i: when selected, write the command's text to stdout right away */
static void
cmd_i(Cmd *c)
{
	if (!in_range(c))
		return;

	check_puts(c->u.acir.str.str, stdout);
}

/* l: write the pattern space to stdout in a visually unambiguous form,
 * followed by "$" and a newline
 *
 * backslash, \a, \b, \f, \r, \t and \v are written as two character escape
 * sequences. I think it makes sense to print invalid UTF-8 sequences in octal
 * to satisfy the "visually unambiguous form" sed(1p), so bytes that do not
 * decode are written as three digit octal escapes. every other rune is
 * copied through as is.
 */
static void
cmd_l(Cmd *c)
{
	Rune   r;
	char  *p, *end;
	size_t rlen;

	/* indexed by ASCII value. static const: the table never changes, so it
	 * is built once instead of filling 128 slots on every call
	 * FIXME: 7 entries and search instead of 127
	 */
	static const char *const escapes[] = {
		['\\'] = "\\\\", ['\a'] = "\\a", ['\b'] = "\\b",
		['\f'] = "\\f" , ['\r'] = "\\r", ['\t'] = "\\t",
		['\v'] = "\\v" , [0x7f] = NULL, /* fill out the table */
	};

	if (!in_range(c))
		return;

	/* FIXME: line wrapping. sed(1p) says "length at which folding occurs is
	 * unspecified, but should be appropriate for the output device"
	 * just wrap at 80 Runes?
	 */
	for (p = patt.str, end = p + strlen(p); p < end; p += rlen) {
		/* isascii() guarantees the byte is a valid index into escapes */
		if (isascii(*p) && escapes[(unsigned char)*p]) {
			fputs(escapes[(unsigned char)*p], stdout);
			rlen = 1;
		} else if (!(rlen = charntorune(&r, p, end - p))) {
			/* ran out of chars, print the bytes of the short sequence */
			for (; p < end; p++)
				printf("\\%03hho", (unsigned char)*p);
			break;
		} else if (r == Runeerror) {
			/* this loop advances p itself and leaves rlen at 0, so the
			 * outer p += rlen adds nothing */
			for (; rlen; rlen--, p++)
				printf("\\%03hho", (unsigned char)*p);
		} else {
			/* retry the write if it was interrupted */
			while (fwrite(p, rlen, 1, stdout) < 1 && errno == EINTR)
				;
			if (ferror(stdout))
				eprintf("fwrite:");
		}
	}
	check_puts("$", stdout);
}

/* n: when selected, write the pattern space unless -n was given, emit the
 * queued writes, then read the next line and carry on with the current cycle
 */
static void
cmd_n(Cmd *c)
{
	if (in_range(c)) {
		if (!gflags.n)
			check_puts(patt.str, stdout);
		do_writes();
		new_line();
	}
}

/* N: when selected, emit the queued writes and append the next input line
 * to the pattern space
 */
static void
cmd_N(Cmd *c)
{
	if (in_range(c)) {
		do_writes();
		app_line();
	}
}

/* p: when selected, write the pattern space to stdout */
static void
cmd_p(Cmd *c)
{
	if (!in_range(c))
		return;

	check_puts(patt.str, stdout);
}

/* P: when selected, write the pattern space up to its first newline; the
 * pattern space itself is left unchanged
 */
static void
cmd_P(Cmd *c)
{
	char *nl;

	if (!in_range(c))
		return;

	nl = strchr(patt.str, '\n');
	if (!nl) {
		/* single line, print all of it */
		check_puts(patt.str, stdout);
		return;
	}

	/* cut the string at the newline just for the write, then restore it */
	*nl = '\0';
	check_puts(patt.str, stdout);
	*nl = '\n';
}

/* q: when selected, write the pattern space unless -n was given, emit the
 * queued writes and raise gflags.halt
 */
static void
cmd_q(Cmd *c)
{
	if (in_range(c)) {
		if (!gflags.n)
			check_puts(patt.str, stdout);
		do_writes();
		gflags.halt = 1;
	}
}

/* r: queue this command on writes; do_writes() copies its file later */
static void
cmd_r(Cmd *c)
{
	if (!in_range(c))
		return;

	push(&writes, c);
}

/* s: replace matches of the regex in the pattern space with the replacement
 * text
 *
 * the result is assembled in genbuf and swapped with patt only when a
 * replacement was actually made. c->u.s.occurrence selects which match is
 * replaced, 0 meaning every match (g flag). a NULL c->u.s.re means reuse
 * lastre; either way the regex used becomes the new lastre.
 * after a replacement gflags.s is set, and the new pattern space is written
 * to stdout if c->u.s.p is set and to c->u.s.file if that is set.
 */
static void
cmd_s(Cmd *c)
{
	String tmp;
	Rune r;
	size_t plen, rlen, len;
	char *p, *s, *end;
	unsigned int matches = 0, last_empty = 1, qflag = 0, cflags = 0;
	regex_t *re;
	regmatch_t *rm, *pmatch = NULL;

	if (!in_range(c))
		return;

	if (!c->u.s.re && !lastre)
		leprintf("no previous regex");

	re = c->u.s.re ? c->u.s.re : lastre;
	lastre = re;

	/* one slot for the whole match plus one per parenthesized group */
	plen = re->re_nsub + 1;
	pmatch = ereallocarray(NULL, plen, sizeof(regmatch_t));

	*genbuf.str = '\0';
	s = patt.str;

	/* s walks the unconsumed tail of the pattern space; match offsets in
	 * pmatch are relative to s */
	while (!qflag && !regexec(re, s, plen, pmatch, cflags)) {
		cflags = REG_NOTBOL; /* match against beginning of line first time, but not again */
		if (!*s) /* match against empty string first time, but not again */
			qflag = 1;

		/* don't substitute if last match was not empty but this one is.
		 * s_a*_._g
		 * foobar -> .f.o.o.b.r.
		 */
		if ((last_empty || pmatch[0].rm_eo) &&
		    (++matches == c->u.s.occurrence || !c->u.s.occurrence)) {
			/* copy over everything before the match */
			strnacat(&genbuf, s, pmatch[0].rm_so);

			/* copy over replacement text, taking into account &, backreferences, and \ escapes */
			for (p = c->u.s.repl.str, len = strcspn(p, "\\&"); *p; len = strcspn(++p, "\\&")) {
				strnacat(&genbuf, p, len);
				p += len;
				switch (*p) {
				default: leprintf("this shouldn't be possible");
				case '\0':
					/* we're at the end, back up one so the ++p will put us on
					 * the null byte to break out of the loop */
					--p;
					break;
				case '&':
					strnacat(&genbuf, s + pmatch[0].rm_so, pmatch[0].rm_eo - pmatch[0].rm_so);
					break;
				case '\\':
					/* cast: the byte after the backslash may be >= 0x80
					 * (e.g. UTF-8) and handing a negative char to
					 * isdigit() is undefined */
					if (isdigit((unsigned char)*++p)) { /* backreference */
						/* only need to check here if using lastre, otherwise we checked when building */
						if (!c->u.s.re && (size_t)(*p - '0') > re->re_nsub)
							leprintf("back reference number greater than number of groups");
						rm = &pmatch[*p - '0'];
						strnacat(&genbuf, s + rm->rm_so, rm->rm_eo - rm->rm_so);
					} else { /* character after backslash taken literally (well one byte, but it works) */
						strnacat(&genbuf, p, 1);
					}
					break;
				}
			}
		} else {
			/* not replacing, copy over everything up to and including the match */
			strnacat(&genbuf, s, pmatch[0].rm_eo);
		}

		if (!pmatch[0].rm_eo) { /* empty match, advance one rune and add it to output */
			end = s + strlen(s);
			rlen = charntorune(&r, s, end - s);

			if (!rlen) { /* ran out of bytes, copy short sequence */
				stracat(&genbuf, s);
				s = end;
			} else { /* copy whether or not it's a good rune */
				strnacat(&genbuf, s, rlen);
				s += rlen;
			}
		}
		last_empty = !pmatch[0].rm_eo;
		s += pmatch[0].rm_eo;
	}
	free(pmatch);

	if (!(matches && matches >= c->u.s.occurrence)) /* no replacement */
		return;

	gflags.s = 1;

	/* whatever was not consumed by the matching loop is kept verbatim */
	stracat(&genbuf, s);

	/* the finished text becomes the pattern space, the old one is recycled
	 * as the scratch buffer */
	tmp    = patt;
	patt   = genbuf;
	genbuf = tmp;

	if (c->u.s.p)
		check_puts(patt.str, stdout);
	if (c->u.s.file)
		check_puts(patt.str, c->u.s.file);
}

/* t: when selected and gflags.s is set, continue execution at c->u.jump
 * and clear gflags.s
 */
static void
cmd_t(Cmd *c)
{
	Cmd *stop;

	if (!in_range(c))
		return;
	if (!gflags.s)
		return;

	/* a forward jump only skips the commands up to the destination; a
	 * backward jump leaves everything up to the end of the program unvisited
	 */
	if (c->u.jump > c)
		stop = c->u.jump;
	else
		stop = prog + pcap;

	update_ranges(c + 1, stop);
	pc = c->u.jump;
	gflags.s = 0;
}

/* w: when selected, write the pattern space to the command's file */
static void
cmd_w(Cmd *c)
{
	if (!in_range(c))
		return;

	check_puts(patt.str, c->u.file);
}

/* x: when selected, exchange the pattern space and the hold space */
static void
cmd_x(Cmd *c)
{
	String saved;

	if (in_range(c)) {
		saved = hold;
		hold  = patt;
		patt  = saved;
	}
}

/* y: when selected, replace every rune of the pattern space that occurs in
 * c->u.y.set1 with the rune at the same position in c->u.y.set2
 * the result is built in genbuf, which is then swapped with patt. bytes that
 * do not decode as UTF-8 are copied through unchanged
 */
static void
cmd_y(Cmd *c)
{
	String swap;
	Rune r, *hit;
	size_t nbytes, rlen;
	char *s, *end, buf[UTFmax];

	if (!in_range(c))
		return;

	*genbuf.str = '\0';
	s   = patt.str;
	end = s + strlen(s);

	while (*s) {
		rlen = charntorune(&r, s, end - s);
		if (!rlen) { /* ran out of chars, copy rest */
			stracat(&genbuf, s);
			break;
		}

		if (r == Runeerror) { /* bad UTF-8 sequence, copy bytes */
			strnacat(&genbuf, s, rlen);
		} else {
			/* stop on the matching rune or on set1's terminator */
			for (hit = c->u.y.set1; *hit && *hit != r; hit++)
				;

			if (*hit) { /* found r in set1, replace with Rune from set2 */
				nbytes = runetochar(buf, c->u.y.set2 + (hit - c->u.y.set1));
				strnacat(&genbuf, buf, nbytes);
			} else {
				strnacat(&genbuf, s, rlen);
			}
		}
		s += rlen;
	}

	swap   = genbuf;
	genbuf = patt;
	patt   = swap;
}

/* ':' has no run-time action */
static void
cmd_colon(Cmd *c)
{
	(void)c;
}

/* =: when selected, write the current line number and a newline to stdout */
static void
cmd_equal(Cmd *c)
{
	if (!in_range(c))
		return;

	printf("%zu\n", lineno);
}

/* {: when NOT selected, move pc to prog + c->u.offset; when selected do
 * nothing so execution simply continues with the next command
 */
static void
cmd_lbrace(Cmd *c)
{
	Cmd *target;

	if (!in_range(c)) {
		/* update ranges on all commands we skip */
		target = prog + c->u.offset;
		update_ranges(c + 1, target);
		pc = target;
	}
}

/* '}' has no run-time action */
static void
cmd_rbrace(Cmd *c)
{
	(void)c;
}

/* not a real sed function, but it behaves like one and sits in the last
 * slot of the script: write the pattern space unless -n was given, emit the
 * queued writes and start a new cycle. it never consults in_range()
 */
static void
cmd_last(Cmd *c)
{
	if (!gflags.n) {
		check_puts(patt.str, stdout);
	}

	do_writes();
	new_next();
}

/*
 * Actions
 */

/* read the next input line into the pattern space, continue current cycle
 * when the current file is exhausted move on with next_file(); once that
 * reports no more input raise gflags.halt and leave everything else alone
 */
static void
new_line(void)
{
	for (;;) {
		if (read_line(file, &patt) != EOF)
			break;
		if (next_file()) {
			gflags.halt = 1;
			return;
		}
	}

	lineno++;
	gflags.s = 0;
}

/* read the next input line into genbuf and append it, after a newline, to
 * the pattern space; continue current cycle
 * when no input is left raise gflags.halt and leave the pattern space alone
 * FIXME: used for N, POSIX specifies do not print pattern space when out of
 *        input, but GNU does so busybox does as well. Currently we don't.
 *        Should we?
 */
static void
app_line(void)
{
	for (;;) {
		if (read_line(file, &genbuf) != EOF)
			break;
		if (next_file()) {
			gflags.halt = 1;
			return;
		}
	}

	stracat(&patt, "\n");
	stracat(&patt, genbuf.str);
	lineno++;
	gflags.s = 0;
}

/* read new line, start new cycle */
static void
new_next(void)
{
	*patt.str = '\0';
	update_ranges(pc + 1, prog + pcap);
	new_line();
	pc = prog - 1;
}

/* keep old pattern space, start new cycle */
static void
old_next(void)
{
	update_ranges(pc + 1, prog + pcap);
	pc = prog - 1;
}

int
main(int argc, char *argv[])
{
	char *arg;
	int ret = 0, script = 0;

	ARGBEGIN {
	case 'n':
		gflags.n = 1;
		break;
	case 'r':
	case 'E':
		gflags.E = 1;
		break;
	case 'e':
		arg = EARGF(usage());
		compile(arg, 0);
		script = 1;
		break;
	case 'f':
		arg = EARGF(usage());
		compile(arg, 1);
		script = 1;
		break;
	default : usage();
	} ARGEND

	/* no script to run */
	if (!script && !argc)
		usage();

	/* no script yet, next argument is script */
	if (!script)
		compile(*argv++, 0);

	/* shrink/grow memory to fit and add our last instruction */
	resize((void **)&prog, &pcap, sizeof(*prog), pc - prog + 1, NULL);
	pc = prog + pcap - 1;
	pc->fninfo = &(Fninfo){ cmd_last, NULL, NULL, 0 };

	files = argv;
	run();

	ret |= fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>");

	return ret;
}
