cmd/agg: new command
diff --git a/cmd/agg/doc.go b/cmd/agg/doc.go
new file mode 100644
index 0000000..3a9ec2d
--- /dev/null
+++ b/cmd/agg/doc.go
@@ -0,0 +1,73 @@
+/*
+
+Agg computes aggregate values over tabular text.
+It behaves somewhat like the SQL “GROUP BY” clause.
+
+Usage:
+
+ agg [function...]
+
+It reads input from stdin as a sequence of records, one per line.
+It treats each line as a set of fields separated by white space.
+One field (the first, by default) is designated as the key.
+Successive lines with equal keys are grouped into a group,
+and agg produces one line of output for each group.
+(Note that only contiguous input lines can form a group.
+If you need to make sure that all records for a given key
+are grouped together, sort the input first.)
+
+For each remaining field,
+agg applies a function to all the values in the group,
+producing a single output value.
+The command line arguments specify which functions to use,
+one per field in the input table.
+
+Functions
+
+The available functions are:
+
+ key group by this field (default for field 1)
+ first value from first line of group (default for rest)
+ last value from last line of group
+ sample value from any line of group, uniformly at random
+ prefix longest common string prefix
+ join:sep concatenate strings with given sep
+ smin lexically least string
+ smax lexically greatest string
+ min numerically least value
+ max numerically greatest value
+ sum numeric sum
+ mean arithmetic mean
+ count number of records (ignores input value)
+ const:val print val, ignoring input
+ drop omit the column entirely
+
+The numeric functions skip items that don't parse as numbers.
+
+Examples
+
+Using the following input:
+
+ $ cat >input
+ -rwx alice 100 /home/alice/bin/crdt
+ -rw- alice 210002 /home/alice/thesis.tex
+ -rw- bob 10051 /home/bob/expenses.tab
+ -rwx kr 862060 /home/kr/bin/blog
+ -rwx kr 304608 /home/kr/bin/agg
+
+Disk usage for each user, plus where that disk usage occurs
+(longest common prefix of filesystem paths):
+
+ $ agg <input drop key sum prefix
+ alice 210153 /home/alice/
+ bob 10051 /home/bob/expenses.tab
+ kr 1166668 /home/kr/
+
+Disk usage for executable vs non-executable files:
+
+ $ sort input | agg key drop sum join:,
+ -rw- 220053 /home/alice/thesis.tex,/home/bob/expenses.tab
+ -rwx 1166768 /home/alice/bin/crdt,/home/kr/bin/agg,/home/kr/bin/blog
+
+*/
+package main
diff --git a/cmd/agg/main.go b/cmd/agg/main.go
new file mode 100644
index 0000000..4239aa4
--- /dev/null
+++ b/cmd/agg/main.go
@@ -0,0 +1,112 @@
+package main
+
+// TODO(kr): tests
+
+import (
+ "bufio"
+ "fmt"
+ "log"
+ "math/rand"
+ "os"
+ "strings"
+ "time"
+)
+
+type agg interface {
+ merge(string)
+ String() string
+}
+
+var (
+ key = 0
+ funcmap = make(map[int]func(init, arg string) agg)
+ argmap = make(map[int]string)
+ symtab = map[string]func(init, arg string) agg{
+ "first": first,
+ "last": last,
+ "prefix": prefix,
+ "sample": sample,
+ "join": join,
+ "smin": smin,
+ "smax": smax,
+ "min": min,
+ "max": max,
+ "sum": sum,
+ "mean": mean,
+ "count": count,
+ "const": constf,
+ "drop": nil,
+ }
+)
+
+func main() {
+ log.SetPrefix("agg: ")
+ log.SetFlags(0)
+ rand.Seed(time.Now().UnixNano())
+ for i, sym := range os.Args[1:] {
+ if p := strings.IndexByte(sym, ':'); p >= 0 {
+ sym, argmap[i] = sym[:p], sym[p+1:]
+ }
+ if sym == "key" {
+ key, sym = i, "first"
+ }
+ f, ok := symtab[sym]
+ if !ok {
+ log.Fatalf("bad function: %q", sym)
+ }
+ funcmap[i] = f
+ }
+
+ sc := bufio.NewScanner(os.Stdin)
+ var g *group
+ for sc.Scan() {
+ ss := strings.Fields(sc.Text())
+ if !matches(g, ss) {
+ emit(g)
+ g = &group{key: ss[key]}
+ }
+ mergeLine(g, ss)
+ }
+ emit(g)
+}
+
+type group struct {
+ key string
+ agg []agg
+}
+
+func matches(g *group, ss []string) bool {
+ return g != nil && g.key == ss[key]
+}
+
+func emit(g *group) {
+ if g == nil {
+ return
+ }
+ rest := false
+ for i, a := range g.agg {
+ if f, ok := funcmap[i]; ok && f == nil {
+ continue
+ }
+ if rest {
+ fmt.Print("\t")
+ }
+ rest = true
+ fmt.Print(a)
+ }
+ fmt.Println()
+}
+
+func mergeLine(g *group, ss []string) {
+ for i, s := range ss {
+ if i >= len(g.agg) {
+ f := funcmap[i]
+ if f == nil {
+ f = first
+ }
+ g.agg = append(g.agg, f(s, argmap[i]))
+ } else {
+ g.agg[i].merge(s)
+ }
+ }
+}
diff --git a/cmd/agg/num.go b/cmd/agg/num.go
new file mode 100644
index 0000000..93ac3fe
--- /dev/null
+++ b/cmd/agg/num.go
@@ -0,0 +1,99 @@
+package main
+
+import (
+ "math/big"
+ "strconv"
+)
+
+func min(s, arg string) agg { return newBinop(s, opmin) }
+func max(s, arg string) agg { return newBinop(s, opmax) }
+func sum(s, arg string) agg { return newBinop(s, opsum) }
+
+type binop struct {
+ v *big.Float
+ f func(a, b *big.Float) *big.Float
+}
+
+func newBinop(s string, f func(a, b *big.Float) *big.Float) *binop {
+ v, _ := parseFloat(s)
+ return &binop{v, f}
+}
+
+func (o *binop) String() string {
+ if o.v == nil {
+ return "NaN"
+ }
+ return o.v.Text('f', -1)
+}
+
+func (o *binop) merge(s string) {
+ v, ok := parseFloat(s)
+ if !ok {
+ return
+ }
+ o.v = o.f(o.v, v)
+}
+
+func opmin(a, b *big.Float) *big.Float {
+ if a != nil && (b == nil || a.Cmp(b) <= 0) {
+ return a
+ }
+ return b
+}
+
+func opmax(a, b *big.Float) *big.Float {
+ if a != nil && (b == nil || a.Cmp(b) >= 0) {
+ return a
+ }
+ return b
+}
+
+func opsum(a, b *big.Float) *big.Float {
+ if a == nil {
+ return b
+ } else if b == nil {
+ return a
+ }
+ return a.Add(a, b)
+}
+
+type meanagg struct {
+ v *big.Float
+ d float64 // actually an integer
+}
+
+func mean(s, arg string) agg {
+ v, ok := parseFloat(s)
+ if !ok {
+ return &meanagg{new(big.Float), 0}
+ }
+ return &meanagg{v, 1}
+}
+
+func (m *meanagg) String() string {
+ if m.d == 0 {
+ return "NaN"
+ }
+ v := new(big.Float).Quo(m.v, big.NewFloat(m.d))
+ return v.Text('f', -1)
+}
+
+func (m *meanagg) merge(s string) {
+ v, ok := parseFloat(s)
+ if !ok {
+ return
+ }
+ m.v.Add(m.v, v)
+ m.d++
+}
+
+func parseFloat(s string) (*big.Float, bool) {
+ v, _, err := big.ParseFloat(s, 0, 1000, big.ToNearestEven)
+ return v, err == nil
+}
+
+type counter int
+
+func count(init, arg string) agg { return new(counter) }
+func (c *counter) String() string { return strconv.Itoa(int(*c) + 1) }
+func (c *counter) merge(string) { *c++ }
diff --git a/cmd/agg/string.go b/cmd/agg/string.go
new file mode 100644
index 0000000..9a8cf78
--- /dev/null
+++ b/cmd/agg/string.go
@@ -0,0 +1,74 @@
+package main
+
+import (
+ "math/rand"
+ "strings"
+)
+
+func first(s, arg string) agg { return &sbinop{s, opfirst} }
+func last(s, arg string) agg { return &sbinop{s, oplast} }
+func prefix(s, arg string) agg { return &sbinop{s, opprefix} }
+func join(s, arg string) agg { return &sbinop{s, opjoin(arg)} }
+func smin(s, arg string) agg { return &sbinop{s, opsmin} }
+func smax(s, arg string) agg { return &sbinop{s, opsmax} }
+
+type sbinop struct {
+ s string
+ f func(a, b string) string
+}
+
+func (o *sbinop) String() string { return o.s }
+
+func (o *sbinop) merge(s string) { o.s = o.f(o.s, s) }
+
+func opfirst(a, b string) string { return a }
+func oplast(a, b string) string { return b }
+
+func opprefix(a, b string) string {
+ for i := range a {
+ if i >= len(b) || a[i] != b[i] {
+ return a[:i]
+ }
+ }
+ return a
+}
+
+func opjoin(sep string) func(a, b string) string {
+ return func(a, b string) string {
+ return a + sep + b // TODO(kr): too slow? maybe strings.Join?
+ }
+}
+
+func opsmin(a, b string) string {
+ if strings.Compare(a, b) <= 0 {
+ return a
+ }
+ return b
+}
+
+func opsmax(a, b string) string {
+ if strings.Compare(a, b) >= 0 {
+ return a
+ }
+ return b
+}
+
+type sampler struct {
+ n int
+ s string
+}
+
+func sample(s, arg string) agg { return &sampler{1, s} }
+func (p *sampler) String() string { return p.s }
+func (p *sampler) merge(s string) {
+ p.n++
+ if rand.Intn(p.n) == 0 {
+ p.s = s
+ }
+}
+
+type constant string
+
+func constf(init, arg string) agg { return constant(arg) }
+func (c constant) String() string { return string(c) }
+func (c constant) merge(string) {}