Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions tests/aggregators-cols.vdj
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!vd -p
{"sheet": "global", "col": null, "row": "disp_date_fmt", "longname": "set-option", "input": "%b %d, %Y", "keystrokes": "", "comment": null}
{"longname": "open-file", "input": "sample_data/test.jsonl", "keystrokes": "o"}
{"sheet": "test", "col": "key2", "row": "", "longname": "key-col", "input": "", "keystrokes": "!", "comment": "toggle current column as a key column"}
{"sheet": "test", "col": "key2", "row": "", "longname": "addcol-aggregate", "input": "count", "comment": "add column(s) with aggregator of rows grouped by key columns"}
{"sheet": "test", "col": "qty", "row": "", "longname": "type-float", "input": "", "keystrokes": "%", "comment": "set type of current column to float"}
{"sheet": "test", "col": "qty", "row": "", "longname": "addcol-aggregate", "input": "rank sum", "comment": "add column(s) with aggregator of rows grouped by key columns"}
{"sheet": "test", "col": "qty_sum", "row": "", "longname": "addcol-rank-sheet", "input": "", "comment": "add column with the rank of each row based on its key columns"}
11 changes: 11 additions & 0 deletions tests/golden/aggregators-cols.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
key2 key2_count key1 qty qty_rank qty_sum test_sheetrank amt
foo 2 2016-01-01 11:00:00 1.00 1 31.00 5
0 2016-01-01 1:00 2.00 1 66.00 2 3
baz 3 4.00 1 292.00 4 43.2
#ERR 0 #ERR #ERR 1 0.00 1 #ERR #ERR
bar 2 2017-12-25 8:44 16.00 2 16.00 3 .3
baz 3 32.00 2 292.00 4 3.3
0 2018-07-27 4:44 64.00 2 66.00 2 9.1
bar 2 2018-07-27 16:44 1 16.00 3
baz 3 2018-07-27 18:44 256.00 3 292.00 4 .01
foo 2 2018-10-20 18:44 30.00 2 31.00 5 .01
15 changes: 15 additions & 0 deletions tests/golden/rank-sheetrank-sorted-cols.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
deptno job mgr sheetrank_1asc2asc3desc sheetrank_1asc2desc sheetrank_default_sort empno ename hiredate sal comm
10 CLERK 7782 1 3 1 7934 MILLER Jan 23, 1982 1300.00000
10 MANAGER 7839 2 2 2 7782 CLARK Jun 19, 1981 2450.00000
10 PRESIDENT 3 1 3 7839 KING Nov 17, 1981 5000.00000
20 ANALYST 7566 4 7 4 7788 SCOTT Jul 13, 1987 3000.00000
20 ANALYST 7566 4 7 4 7902 FORD Dec 03, 1981 3000.00000
20 CLERK 7902 5 6 6 7369 SMITH Feb 17, 1981 800.00000
20 CLERK 7788 6 5 5 7876 ADAMS Jul 13, 1987 1100.00000
20 MANAGER 7839 7 4 7 7566 JONES Apr 12, 1981 2975.00000
30 CLERK 7698 8 10 8 7900 JAMES Dec 03, 1981 950.00000
30 MANAGER 7839 9 9 9 7698 BLAKE May 11, 1981 2850.00000
30 SALESMAN 7698 10 8 10 7499 ALLEN Feb 20, 1981 1600.00000 300.00000
30 SALESMAN 7698 10 8 10 7521 WARD Feb 22, 1981 1250.00000 500.00000
30 SALESMAN 7698 10 8 10 7654 MARTIN Sep 28, 1981 1250.00000 1400.00000
30 SALESMAN 7698 10 8 10 7844 TURNER Sep 18, 1981 1500.00000 0.00000
21 changes: 21 additions & 0 deletions tests/rank-sheetrank-sorted-cols.vdj
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!vd -p
{"sheet": "global", "col": null, "row": "disp_date_fmt", "longname": "set-option", "input": "%b %d, %Y", "keystrokes": "", "comment": null, "replayable": null}
{"sheet": "global", "row": "disp_float_fmt", "longname": "set-option", "input": "%.05f", "keystrokes": ""}
{"sheet": "global", "row": "default_width", "longname": "set-option", "input": "50", "keystrokes": ""}
{"col": "", "row": "", "longname": "open-file", "input": "sample_data/employees.sqlite", "keystrokes": "o", "replayable": true}
{"sheet": "employees", "col": "", "row": 1, "longname": "open-row", "input": "", "keystrokes": "Enter", "comment": "open current row with sheet-specific dive", "replayable": true}
{"sheet": "emp", "col": "deptno", "row": "", "longname": "key-col-on", "input": "", "comment": "set current column as a key column", "replayable": true}
{"sheet": "emp", "col": "job", "row": "", "longname": "key-col-on", "input": "", "comment": "set current column as a key column", "replayable": true}
{"sheet": "emp", "col": "mgr", "row": "", "longname": "key-col-on", "input": "", "comment": "set current column as a key column", "replayable": true}
{"sheet": "emp", "col": "empno", "row": "", "longname": "key-col-off", "input": "", "comment": "unset current column as a key column", "replayable": true}
{"sheet": "emp", "col": "", "row": "", "longname": "addcol-rank-sheet", "input": "", "comment": "add column with the rank of each row based on its key columns", "replayable": true}
{"sheet": "emp", "col": "emp_sheetrank", "row": "", "longname": "rename-col", "input": "sheetrank_default_sort", "keystrokes": "^", "comment": "rename current column", "replayable": true}
{"sheet": "emp", "col": "deptno", "row": "", "longname": "sort-asc", "input": "", "keystrokes": "[", "comment": "sort ascending by current column; replace any existing sort criteria", "replayable": true}
{"sheet": "emp", "col": "job", "row": "", "longname": "sort-desc-add", "input": "", "keystrokes": "z]", "comment": "sort descending by current column; add to existing sort criteria", "replayable": true}
{"sheet": "emp", "col": "", "row": "", "longname": "addcol-rank-sheet", "input": "", "comment": "add column with the rank of each row based on its key columns", "replayable": true}
{"sheet": "emp", "col": "emp_sheetrank", "row": "", "longname": "rename-col", "input": "sheetrank_1asc2desc", "keystrokes": "^", "comment": "rename current column", "replayable": true}
{"sheet": "emp", "col": "deptno", "row": "", "longname": "sort-asc", "input": "", "keystrokes": "[", "comment": "sort ascending by current column; replace any existing sort criteria", "replayable": true}
{"sheet": "emp", "col": "job", "row": "", "longname": "sort-asc-add", "input": "", "keystrokes": "z[", "comment": "sort ascending by current column; add to existing sort criteria", "replayable": true}
{"sheet": "emp", "col": "mgr", "row": "", "longname": "sort-desc-add", "input": "", "keystrokes": "z]", "comment": "sort descending by current column; add to existing sort criteria", "replayable": true}
{"sheet": "emp", "col": "", "row": "", "longname": "addcol-rank-sheet", "input": "", "comment": "add column with the rank of each row based on its key columns", "replayable": true}
{"sheet": "emp", "col": "emp_sheetrank", "row": "", "longname": "rename-col", "input": "sheetrank_1asc2asc3desc", "keystrokes": "^", "comment": "rename current column", "replayable": true}
131 changes: 120 additions & 11 deletions visidata/aggregators.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@
import collections
import statistics
from copy import copy
import itertools

from visidata import Progress, Sheet, Column, ColumnsSheet, VisiData
from visidata import vd, anytype, vlen, asyncthread, wrapply, AttrDict, date, INPROGRESS, dispwidth
from visidata import Progress, Sheet, Column, ColumnsSheet, VisiData, SettableColumn
from visidata import vd, anytype, vlen, asyncthread, wrapply, AttrDict, date, INPROGRESS, dispwidth, stacktrace, TypedExceptionWrapper

vd.help_aggregators = '''# Choose Aggregators
Start typing an aggregator name or description.
Expand Down Expand Up @@ -77,7 +78,7 @@ def aggregators_set(col, aggs):


class Aggregator:
def __init__(self, name, type, funcValues=None, helpstr='foo'):
def __init__(self, name, type, funcValues=None, helpstr=''):
'Define aggregator `name` that calls funcValues(values)'
self.type = type
self.funcValues = funcValues # funcValues(values)
Expand All @@ -93,13 +94,48 @@ def aggregate(self, col, rows): # wrap builtins so they can have a .type
return None
raise e

class ListAggregator(Aggregator):
    '''An aggregator whose result is a list of values, generally one value per
    input row, unlike ordinary aggregators that operate on rows and return
    only a single value.

    To implement a new list aggregator, subclass ListAggregator,
    and override aggregate() and aggregate_list().'''
    def __init__(self, name, type, helpstr='', listtype=None):
        '''*type* is passed on to Aggregator as the type of the whole result.
        *listtype* determines the type of the column created by addcol_aggregate()
        for list aggrs. If it is None, then the new column will match the type of the input column.'''
        super().__init__(name, type, helpstr=helpstr)
        self.listtype = listtype

    def aggregate(self, col, rows) -> list:
        '''Return the values from aggregate_list() with every value rejected by the
        sheet's null predicate removed, so the result can be shorter than *rows*.
        (The original comment describes the removed values as nulls and errors.)
        Override in subclass.'''
        vals = self.aggregate_list(col, rows)
        # build the null predicate once instead of once per value
        is_null = col.sheet.isNullFunc()
        return [v for v in vals if not is_null(v)]

    def aggregate_list(self, col, row_group) -> list:
        '''Return a list of results, which will be one result per input row.
        *row_group* is an iterable that holds a "group" of rows to run the aggregator on.
        rows in *row_group* are not necessarily in the same order they are in the sheet.
        The base implementation returns the typed value of *col* for each row.
        Override in subclass.'''
        return [col.getTypedValue(r) for r in row_group]

@VisiData.api
def aggregator(vd, name, funcValues, helpstr='', *, type=None):
    '''Register a simple aggregator under *name* in ``vd.aggregators``.
    The aggregator calls ``funcValues(values)`` to aggregate *values*.
    Pass *type* to force the type of the aggregated column; when it is None
    the type of the source column is used instead.'''
    simple_aggr = Aggregator(name, type, funcValues=funcValues, helpstr=helpstr)
    vd.aggregators[name] = simple_aggr

@VisiData.api
def aggregator_list(vd, name, helpstr='', type=anytype, listtype=anytype):
    '''Define list aggregator *name* and register it in ``vd.aggregators`` as a ListAggregator.
    Unlike ``vd.aggregator()``, no ``funcValues`` is taken: the values are produced by
    ``ListAggregator.aggregate_list()``.
    *type* is the type of the whole aggregated result, i.e. the list itself (default: anytype).
    *listtype* is the type of the new column created by addcol-aggregate, where each
    cell holds one element of the list (default: anytype).
    If *listtype* is None, the new column will match the type of the source column.'''
    vd.aggregators[name] = ListAggregator(name, type, helpstr=helpstr, listtype=listtype)

## specific aggregator implementations

def mean(vals):
Expand All @@ -110,6 +146,16 @@ def mean(vals):
def vsum(vals):
return sum(vals, start=type(vals[0] if len(vals) else 0)()) #1996

def stdev(vals):
    '''Return the standard deviation of *vals* via statistics.stdev.
    A statistics.StatisticsError is not propagated: it is returned wrapped in a
    TypedExceptionWrapper so the stdev aggregator can show it as an error string
    (e.g. at the bottom of the sheet as part of allAggregators).'''
    try:
        result = statistics.stdev(vals)
    except statistics.StatisticsError as err:  # e.g. when vals holds only 1 element
        # attach the stacktrace to the exception before wrapping it
        err.stacktrace = stacktrace()
        result = TypedExceptionWrapper(None, exception=err)
    return result

# http://code.activestate.com/recipes/511478-finding-the-percentile-of-the-values/
def _percentile(N, percent, key=lambda x:x):
"""
Expand Down Expand Up @@ -141,10 +187,49 @@ def __init__(self, pct, helpstr=''):
def aggregate(self, col, rows):
return _percentile(sorted(col.getValues(rows)), self.pct/100, key=float)


def quantiles(q, helpstr):
    '''Return a list of the q-1 PercentileAggregators that split values into *q* parts.
    Each cut point i (1 <= i < q) uses the percentile round(100*i/q); all share *helpstr*.'''
    cut_aggrs = []
    for i in range(1, q):
        pct = round(100*i/q)
        cut_aggrs.append(PercentileAggregator(pct, helpstr))
    return cut_aggrs

def aggregate_groups(sheet, col, rows, aggr) -> list:
    '''Return a list holding the result of the aggregator for each row, in the order of *rows*.
    *col* is the column whose typed values are aggregated (and which order rows within a group).
    *rows* is an indexable list of visidata rows.
    *aggr* is an Aggregator object.
    Rows are grouped by their key columns, via sheet.rowkey().
    For a ListAggregator, each row receives its own element of the group's result;
    for any other aggregator, every row in a group receives the same value.
    As designed by the author: null key cells are considered equal, so nulls group
    together, while each cell holding an exception forms a group of its own.
    '''
    def _ticking_identity(progress):
        'Return an identity function that advances *progress* by 1 on every call.'
        def _tick(item):
            progress.addProgress(1)
            return item
        return _tick

    with Progress(gerund='ranking', total=4*sheet.nRows) as prog:
        tick = _ticking_identity(prog)
        # one record per row: (group_key, rank_key, rownum)
        records = []
        for idx, row in enumerate(rows):
            records.append((sheet.rowkey(row), col.getTypedValue(row), tick(idx)))

        # order by row key, then column value, so that groupby sees each group contiguously
        try:
            records.sort(key=tick)
        except TypeError as e:
            vd.fail(f'elements in a ranking column must be comparable: {e.args[0]}')

        indexed_results = []  # (rownum, value) pairs
        for _, members in itertools.groupby(records, key=lambda rec: rec[0]):
            # members of a group are already ordered by column value
            members = list(members)
            member_rownums = [rec[2] for rec in members]
            member_rows = [rows[i] for i in member_rownums]
            if isinstance(aggr, ListAggregator):
                # list aggregators give each row its own value
                per_row_vals = aggr.aggregate_list(col, member_rows)
                indexed_results.extend(zip(member_rownums, per_row_vals))
            else:
                # ordinary aggregators give the whole group one shared value
                shared_val = aggr.aggregate(col, member_rows)
                indexed_results.extend((i, shared_val) for i in member_rownums)
            prog.addProgress(len(members))

        # restore the original row order using the unique rownum
        indexed_results.sort(key=tick)
        ordered_vals = [val for _, val in indexed_results]
    return ordered_vals

vd.aggregator('min', min, 'minimum value')
vd.aggregator('max', max, 'maximum value')
Expand All @@ -155,8 +240,8 @@ def quantiles(q, helpstr):
vd.aggregator('sum', vsum, 'sum of values')
vd.aggregator('distinct', set, 'distinct values', type=vlen)
vd.aggregator('count', lambda values: sum(1 for v in values), 'number of values', type=int)
vd.aggregator('list', list, 'list of values', type=anytype)
vd.aggregator('stdev', statistics.stdev, 'standard deviation of values', type=float)
vd.aggregator_list('list', 'list of values', type=anytype, listtype=None)
vd.aggregator('stdev', stdev, 'standard deviation of values', type=float)

vd.aggregators['q3'] = quantiles(3, 'tertiles (33/66th pctile)')
vd.aggregators['q4'] = quantiles(4, 'quartiles (25/50/75th pctile)')
Expand Down Expand Up @@ -243,7 +328,8 @@ def memo_aggregate(col, agg_choices, rows):
for agg in aggs:
aggval = agg.aggregate(col, rows)
typedval = wrapply(agg.type or col.type, aggval)
dispval = col.format(typedval)
# limit width to limit formatting time when typedval is a long list
dispval = col.format(typedval, width=1000)
k = col.name+'_'+agg.name
vd.status(f'{k}={dispval}')
vd.memory[k] = typedval
Expand All @@ -254,14 +340,13 @@ def aggregator_choices(vd):
return [
AttrDict(key=agg, desc=v[0].helpstr if isinstance(v, list) else v.helpstr)
for agg, v in vd.aggregators.items()
if not agg.startswith('p') # skip all the percentiles, user should use q# instead
if not (agg.startswith('p') and agg[1:].isdigit()) # skip all the percentiles like 'p10', user should use q# instead
]


@VisiData.api
def chooseAggregators(vd):
def chooseAggregators(vd, prompt = 'choose aggregators: '):
'''Return a list of aggregator name strings chosen or entered by the user. User-entered names may be invalid.'''
prompt = 'choose aggregators: '
def _fmt_aggr_summary(match, row, trigger_key):
formatted_aggrname = match.formatted.get('key', row.key) if match else row.key
r = ' '*(dispwidth(prompt)-3)
Expand All @@ -288,10 +373,34 @@ def _fmt_aggr_summary(match, row, trigger_key):
vd.warning(f'aggregator does not exist: {aggr}')
return aggrs

Sheet.addCommand('+', 'aggregate-col', 'addAggregators([cursorCol], chooseAggregators())', 'add aggregator to current column')
@Sheet.api
@asyncthread
def addcol_aggregate(sheet, col, aggrnames):
for aggrname in aggrnames:
aggrs = vd.aggregators.get(aggrname)
aggrs = aggrs if isinstance(aggrs, list) else [aggrs]
if not aggrs: continue
for aggr in aggrs:
rows = aggregate_groups(sheet, col, sheet.rows, aggr)
if isinstance(aggr, ListAggregator):
t = aggr.listtype or col.type
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need a separate listtype? Seems like we could just use the same aggr.type in both cases, and remove this isinstance (which is usually a code smell for me).

Copy link
Contributor Author

@midichef midichef Jan 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The way list aggregators work now, there is a need for two distinct types, type and listtype. type is for the result of the aggregator. For example, this is used by memo-aggregate. That's why type is anytype for ListAggregators. This type would be used whenever we want to hold the entire result (a list) in a cell.

But we also need a separate type for the elements of the list. This is for when the aggregator result goes in a column, like for addcol-aggregate, where each cell holds not the result itself, but an element of the list result.

If I try to get rid of either type, I run into problems. For RankAggregator, if I get rid of listtype=int and switch to type=int, I get an error in the statusbar for z+ rank:
'''
text_rank=int() argument must be a string, a bytes-like object or a real number, not 'list'
'''
But if instead I make RankAggregator use type=anytype, the column added by addcol-aggregate rank does not get the type int.

The need for two types is awkward. And I see your point about isinstance being a code smell. (That is a helpful heuristic, and I'll use it in the future.) It's accurately pointing out strain in the design: most aggregators produce a single value, list aggregators produce a list.

Maybe rank should not be an aggregator. It's unlikely people want a list object holding the ranks. Most people want a column holding the ranks. What if we replace addcol-aggregate+rank with an equivalent command addcol-grouprank (in addition to the existing addcol-sheetrank)? And we would reserve addcol-aggregate for finding group values like sum, mean, median, as you suggested earlier. What do you think?
(I would also consider changing the name addcol-aggregate. Maybe to addcol-group-aggregate.)

else:
t = aggr.type or col.type
c = SettableColumn(name=f'{col.name}_{aggr.name}', type=t)
sheet.addColumnAtCursor(c)
c.setValues(sheet.rows, *rows)

# Command registrations for aggregators.
# NOTE(review): arguments appear to be (keystroke, longname, expression to run, help text) -- confirm against addCommand.
Sheet.addCommand('+', 'aggregate-col', 'addAggregators([cursorCol], chooseAggregators())', 'Add aggregator to current column')
Sheet.addCommand('z+', 'memo-aggregate', 'cursorCol.memo_aggregate(chooseAggregators(), selectedRows or rows)', 'memo result of aggregator over values in selected rows for current column')
ColumnsSheet.addCommand('g+', 'aggregate-cols', 'addAggregators(selectedRows or source[0].nonKeyVisibleCols, chooseAggregators())', 'add aggregators to selected source columns')
# addcol-aggregate is registered with an empty first argument and uses its own prompt text.
Sheet.addCommand('', 'addcol-aggregate', 'addcol_aggregate(cursorCol, chooseAggregators(prompt="aggregator for groups: "))', 'add column(s) with aggregator of rows grouped by key columns')

# Expose ListAggregator through vd.addGlobals (presumably so plugins can subclass it -- confirm).
vd.addGlobals(
ListAggregator=ListAggregator
)

# Menu entries: each line is a menu path ending in the longname of the command it runs.
vd.addMenuItems('''
Column > Add aggregator > aggregate-col
Column > Add column > aggregate > addcol-aggregate
''')

Loading