saulpw · anjakefala · Jun 13, 2025 · May 31, 2024 · Jul 18, 2024 · Jul 16, 2024
diff --git a/tests/aggregators-cols.vdj b/tests/aggregators-cols.vdj
@@ -0,0 +1,8 @@
+#!vd -p
+{"sheet": "global", "col": null, "row": "disp_date_fmt", "longname": "set-option", "input": "%b %d, %Y", "keystrokes": "", "comment": null}
+{"longname": "open-file", "input": "sample_data/test.jsonl", "keystrokes": "o"}
+{"sheet": "test", "col": "key2", "row": "", "longname": "key-col", "input": "", "keystrokes": "!", "comment": "toggle current column as a key column"}
+{"sheet": "test", "col": "key2", "row": "", "longname": "addcol-aggregate", "input": "count", "comment": "add column(s) with aggregator of rows grouped by key columns"}
+{"sheet": "test", "col": "qty", "row": "", "longname": "type-float", "input": "", "keystrokes": "%", "comment": "set type of current column to float"}
+{"sheet": "test", "col": "qty", "row": "", "longname": "addcol-aggregate", "input": "rank sum", "comment": "add column(s) with aggregator of rows grouped by key columns"}
+{"sheet": "test", "col": "qty_sum", "row": "", "longname": "addcol-rank-sheet", "input": "", "comment": "add column with the rank of each row based on its key columns"}
diff --git a/tests/golden/aggregators-cols.tsv b/tests/golden/aggregators-cols.tsv
@@ -0,0 +1,11 @@
+key2	key2_count	key1	qty	qty_rank	qty_sum	test_sheetrank	amt	
+foo	2	2016-01-01 11:00:00	1.00	1	31.00	5		
+	0	2016-01-01 1:00	2.00	1	66.00	2	3	
+baz	3		4.00	1	292.00	4	43.2	
+#ERR	0	#ERR	#ERR	1	0.00	1	#ERR	#ERR
+bar	2	2017-12-25 8:44	16.00	2	16.00	3	.3	
+baz	3		32.00	2	292.00	4	3.3	
+	0	2018-07-27 4:44	64.00	2	66.00	2	9.1	
+bar	2	2018-07-27 16:44		1	16.00	3		
+baz	3	2018-07-27 18:44	256.00	3	292.00	4	.01	
+foo	2	2018-10-20 18:44	30.00	2	31.00	5	.01	
diff --git a/tests/golden/rank-sheetrank-sorted-cols.tsv b/tests/golden/rank-sheetrank-sorted-cols.tsv
@@ -0,0 +1,15 @@
+deptno	job	mgr	sheetrank_1asc2asc3desc	sheetrank_1asc2desc	sheetrank_default_sort	empno	ename	hiredate	sal	comm
+10	CLERK	7782	1	3	1	7934	MILLER	Jan 23, 1982	1300.00000	
+10	MANAGER	7839	2	2	2	7782	CLARK	Jun 19, 1981	2450.00000	
+10	PRESIDENT		3	1	3	7839	KING	Nov 17, 1981	5000.00000	
+20	ANALYST	7566	4	7	4	7788	SCOTT	Jul 13, 1987	3000.00000	
+20	ANALYST	7566	4	7	4	7902	FORD	Dec 03, 1981	3000.00000	
+20	CLERK	7902	5	6	6	7369	SMITH	Feb 17, 1981	800.00000	
+20	CLERK	7788	6	5	5	7876	ADAMS	Jul 13, 1987	1100.00000	
+20	MANAGER	7839	7	4	7	7566	JONES	Apr 12, 1981	2975.00000	
+30	CLERK	7698	8	10	8	7900	JAMES	Dec 03, 1981	950.00000	
+30	MANAGER	7839	9	9	9	7698	BLAKE	May 11, 1981	2850.00000	
+30	SALESMAN	7698	10	8	10	7499	ALLEN	Feb 20, 1981	1600.00000	300.00000
+30	SALESMAN	7698	10	8	10	7521	WARD	Feb 22, 1981	1250.00000	500.00000
+30	SALESMAN	7698	10	8	10	7654	MARTIN	Sep 28, 1981	1250.00000	1400.00000
+30	SALESMAN	7698	10	8	10	7844	TURNER	Sep 18, 1981	1500.00000	0.00000
diff --git a/tests/rank-sheetrank-sorted-cols.vdj b/tests/rank-sheetrank-sorted-cols.vdj
@@ -0,0 +1,21 @@
+#!vd -p
+{"sheet": "global", "col": null, "row": "disp_date_fmt", "longname": "set-option", "input": "%b %d, %Y", "keystrokes": "", "comment": null, "replayable": null}
+{"sheet": "global", "row": "disp_float_fmt", "longname": "set-option", "input": "%.05f", "keystrokes": ""}
+{"sheet": "global", "row": "default_width", "longname": "set-option", "input": "50", "keystrokes": ""}
+{"col": "", "row": "", "longname": "open-file", "input": "sample_data/employees.sqlite", "keystrokes": "o", "replayable": true}
+{"sheet": "employees", "col": "", "row": 1, "longname": "open-row", "input": "", "keystrokes": "Enter", "comment": "open current row with sheet-specific dive", "replayable": true}
+{"sheet": "emp", "col": "deptno", "row": "", "longname": "key-col-on", "input": "", "comment": "set current column as a key column", "replayable": true}
+{"sheet": "emp", "col": "job", "row": "", "longname": "key-col-on", "input": "", "comment": "set current column as a key column", "replayable": true}
+{"sheet": "emp", "col": "mgr", "row": "", "longname": "key-col-on", "input": "", "comment": "set current column as a key column", "replayable": true}
+{"sheet": "emp", "col": "empno", "row": "", "longname": "key-col-off", "input": "", "comment": "unset current column as a key column", "replayable": true}
+{"sheet": "emp", "col": "", "row": "", "longname": "addcol-rank-sheet", "input": "", "comment": "add column with the rank of each row based on its key columns", "replayable": true}
+{"sheet": "emp", "col": "emp_sheetrank", "row": "", "longname": "rename-col", "input": "sheetrank_default_sort", "keystrokes": "^", "comment": "rename current column", "replayable": true}
+{"sheet": "emp", "col": "deptno", "row": "", "longname": "sort-asc", "input": "", "keystrokes": "[", "comment": "sort ascending by current column; replace any existing sort criteria", "replayable": true}
+{"sheet": "emp", "col": "job", "row": "", "longname": "sort-desc-add", "input": "", "keystrokes": "z]", "comment": "sort descending by current column; add to existing sort criteria", "replayable": true}
+{"sheet": "emp", "col": "", "row": "", "longname": "addcol-rank-sheet", "input": "", "comment": "add column with the rank of each row based on its key columns", "replayable": true}
+{"sheet": "emp", "col": "emp_sheetrank", "row": "", "longname": "rename-col", "input": "sheetrank_1asc2desc", "keystrokes": "^", "comment": "rename current column", "replayable": true}
+{"sheet": "emp", "col": "deptno", "row": "", "longname": "sort-asc", "input": "", "keystrokes": "[", "comment": "sort ascending by current column; replace any existing sort criteria", "replayable": true}
+{"sheet": "emp", "col": "job", "row": "", "longname": "sort-asc-add", "input": "", "keystrokes": "z[", "comment": "sort ascending by current column; add to existing sort criteria", "replayable": true}
+{"sheet": "emp", "col": "mgr", "row": "", "longname": "sort-desc-add", "input": "", "keystrokes": "z]", "comment": "sort descending by current column; add to existing sort criteria", "replayable": true}
+{"sheet": "emp", "col": "", "row": "", "longname": "addcol-rank-sheet", "input": "", "comment": "add column with the rank of each row based on its key columns", "replayable": true}
+{"sheet": "emp", "col": "emp_sheetrank", "row": "", "longname": "rename-col", "input": "sheetrank_1asc2asc3desc", "keystrokes": "^", "comment": "rename current column", "replayable": true}
diff --git a/visidata/aggregators.py b/visidata/aggregators.py
@@ -4,9 +4,10 @@
 import collections
 import statistics
 from copy import copy
+import itertools
 
-from visidata import Progress, Sheet, Column, ColumnsSheet, VisiData
-from visidata import vd, anytype, vlen, asyncthread, wrapply, AttrDict, date, INPROGRESS, dispwidth
+from visidata import Progress, Sheet, Column, ColumnsSheet, VisiData, SettableColumn
+from visidata import vd, anytype, vlen, asyncthread, wrapply, AttrDict, date, INPROGRESS, dispwidth, stacktrace, TypedExceptionWrapper
 
 vd.help_aggregators = '''# Choose Aggregators
 Start typing an aggregator name or description.
@@ -77,7 +78,7 @@ def aggregators_set(col, aggs):
 
 
 class Aggregator:
-    def __init__(self, name, type, funcValues=None, helpstr='foo'):
+    def __init__(self, name, type, funcValues=None, helpstr=''):
         'Define aggregator `name` that calls funcValues(values)'
         self.type = type
         self.funcValues = funcValues  # funcValues(values)
@@ -93,13 +94,48 @@ def aggregate(self, col, rows):  # wrap builtins so they can have a .type
                 return None
             raise e
 
+class ListAggregator(Aggregator):
+    '''A list aggregator is an aggregator that returns a list of values, generally
+    one value per input row, unlike ordinary aggregators that operate on rows
+    and return only a single value.
+    To implement a new list aggregator, subclass ListAggregator,
+    and override aggregate() and aggregate_list().'''
+    def __init__(self, name, type, helpstr='', listtype=None):
+        '''*listtype* determines the type of the column created by addcol_aggregate()
+        for list aggrs. If it is None, then the new column will match the type of the input column'''
+        super().__init__(name, type, helpstr=helpstr)
+        self.listtype = listtype
+
+    def aggregate(self, col, rows) -> list:
+        '''Return the values from aggregate_list() for which the sheet's null test is false.
+        The result can therefore be shorter than *rows*. Override in subclass.'''
+        vals = self.aggregate_list(col, rows)
+        # build the null predicate once, instead of calling isNullFunc() again for every value
+        isnull = col.sheet.isNullFunc()
+        return [v for v in vals if not isnull(v)]
+
+    def aggregate_list(self, col, row_group) -> list:
+        '''Return a list of results, which will be one result per input row.
+        *row_group* is an iterable that holds a "group" of rows to run the aggregator on.
+        rows in *row_group* are not necessarily in the same order they are in the sheet.
+        Override in subclass.'''
+        # typed values of *col*, in the iteration order of row_group
+        return [col.getTypedValue(r) for r in row_group]
 
 @VisiData.api
 def aggregator(vd, name, funcValues, helpstr='', *, type=None):
     '''Define simple aggregator *name* that calls ``funcValues(values)`` to aggregate *values*.
        Use *type* to force type of aggregated column (default to use type of source column).'''
     vd.aggregators[name] = Aggregator(name, type, funcValues=funcValues, helpstr=helpstr)
 
+@VisiData.api
+def aggregator_list(vd, name, helpstr='', type=anytype, listtype=anytype):
+    '''Define list aggregator *name*, registered in ``vd.aggregators`` as a ListAggregator.
+       Use *type* to force the type of the aggregated result (defaults to anytype).
+       Use *listtype* to force the type of the new column created by addcol-aggregate.
+       If *listtype* is None, it will match the type of the source column.'''
+    vd.aggregators[name] = ListAggregator(name, type, helpstr=helpstr, listtype=listtype)
+
 ## specific aggregator implementations
 
 def mean(vals):
@@ -110,6 +146,16 @@ def mean(vals):
 def vsum(vals):
     return sum(vals, start=type(vals[0] if len(vals) else 0)())  #1996
 
+def stdev(vals):
+    '''Standard deviation of *vals* via statistics.stdev. A StatisticsError is not raised;
+    it is returned inside a TypedExceptionWrapper so the aggregator can display it as an error.'''
+    try:
+        result = statistics.stdev(vals)
+    except statistics.StatisticsError as err:  # e.g. vals holds only 1 element
+        err.stacktrace = stacktrace()
+        result = TypedExceptionWrapper(None, exception=err)
+    return result
+
 # http://code.activestate.com/recipes/511478-finding-the-percentile-of-the-values/
 def _percentile(N, percent, key=lambda x:x):
     """
@@ -141,10 +187,49 @@ def __init__(self, pct, helpstr=''):
     def aggregate(self, col, rows):
         return _percentile(sorted(col.getValues(rows)), self.pct/100, key=float)
 
-
 def quantiles(q, helpstr):
     return [PercentileAggregator(round(100*i/q), helpstr) for i in range(1, q)]
 
+def aggregate_groups(sheet, col, rows, aggr) -> list:
+    '''Return a list holding the result of *aggr* for each row, ordered to match *rows*.
+    *col* is the column whose typed values are aggregated (and ordered within a group).
+    *rows* is a list of visidata rows; *aggr* is an Aggregator object.
+    Rows are grouped by their key columns. Null key column cells are considered equal,
+    so nulls are grouped together. Cells with exceptions do not group together.
+    Each exception cell is grouped by itself, with only one row in the group.
+    Calls vd.fail() if sorting raises TypeError (values that are not comparable).
+    NOTE(review): progress total uses sheet.nRows, which assumes *rows* is sheet.rows.
+    '''
+    def _key_progress(prog):
+        def identity(val):
+            prog.addProgress(1)
+            return val
+        return identity
+
+    with Progress(gerund='ranking', total=4*sheet.nRows) as prog:
+        p = _key_progress(prog) # increment progress every time p() is called
+        # compile row data: one tuple per row, (group_key, rank_key, rownum)
+        rowdata = [(sheet.rowkey(r), col.getTypedValue(r), p(rownum)) for rownum, r in enumerate(rows)]
+        # sort the tuples so that rows with equal row keys are adjacent, as groupby() requires
+        try:
+            rowdata.sort(key=p)
+        except TypeError as e:
+            vd.fail(f'elements in a ranking column must be comparable: {e.args[0]}')
+        rowvals = []
+        # group by row key (first element of each tuple)
+        for _, group in itertools.groupby(rowdata, key=lambda v: v[0]):
+            # within a group, the rows have already been sorted by col_val
+            group = list(group)
+            if isinstance(aggr, ListAggregator): # for list aggregators, each row gets its own value
+                aggr_vals = aggr.aggregate_list(col, [rows[rownum] for _, _, rownum in group])
+                rowvals += [(rownum, v) for (_, _, rownum), v in zip(group, aggr_vals)]
+            else:             # for normal aggregators, each row in the group gets the same value
+                aggr_val = aggr.aggregate(col, [rows[rownum] for _, _, rownum in group])
+                rowvals += [(rownum, aggr_val) for _, _, rownum in group]
+            prog.addProgress(len(group))
+        # sort by unique rownum, to make the results match the original row order
+        rowvals.sort(key=p)
+        rowvals = [ v for rownum, v in rowvals ]
+        return rowvals
 
 vd.aggregator('min', min, 'minimum value')
 vd.aggregator('max', max, 'maximum value')
@@ -155,8 +240,8 @@ def quantiles(q, helpstr):
 vd.aggregator('sum', vsum, 'sum of values')
 vd.aggregator('distinct', set, 'distinct values', type=vlen)
 vd.aggregator('count', lambda values: sum(1 for v in values), 'number of values', type=int)
-vd.aggregator('list', list, 'list of values', type=anytype)
-vd.aggregator('stdev', statistics.stdev, 'standard deviation of values', type=float)
+vd.aggregator_list('list', 'list of values', type=anytype, listtype=None)
+vd.aggregator('stdev', stdev, 'standard deviation of values', type=float)
 
 vd.aggregators['q3'] = quantiles(3, 'tertiles (33/66th pctile)')
 vd.aggregators['q4'] = quantiles(4, 'quartiles (25/50/75th pctile)')
@@ -243,7 +328,8 @@ def memo_aggregate(col, agg_choices, rows):
         for agg in aggs:
             aggval = agg.aggregate(col, rows)
             typedval = wrapply(agg.type or col.type, aggval)
-            dispval = col.format(typedval)
+            # limit width to limit formatting time when typedval is a long list
+            dispval = col.format(typedval, width=1000)
             k = col.name+'_'+agg.name
             vd.status(f'{k}={dispval}')
             vd.memory[k] = typedval
@@ -254,14 +340,13 @@ def aggregator_choices(vd):
     return [
        AttrDict(key=agg, desc=v[0].helpstr if isinstance(v, list) else v.helpstr)
          for agg, v in vd.aggregators.items()
-            if not agg.startswith('p')  # skip all the percentiles, user should use q# instead
+           if not (agg.startswith('p') and agg[1:].isdigit())  # skip all the percentiles like 'p10', user should use q# instead
     ]
 
 
 @VisiData.api
-def chooseAggregators(vd):
+def chooseAggregators(vd, prompt = 'choose aggregators: '):
     '''Return a list of aggregator name strings chosen or entered by the user. User-entered names may be invalid.'''
-    prompt = 'choose aggregators: '
     def _fmt_aggr_summary(match, row, trigger_key):
         formatted_aggrname = match.formatted.get('key', row.key) if match else row.key
         r = ' '*(dispwidth(prompt)-3)
@@ -288,10 +373,34 @@ def _fmt_aggr_summary(match, row, trigger_key):
             vd.warning(f'aggregator does not exist: {aggr}')
     return aggrs
 
-Sheet.addCommand('+', 'aggregate-col', 'addAggregators([cursorCol], chooseAggregators())', 'add aggregator to current column')
+@Sheet.api
+@asyncthread
+def addcol_aggregate(sheet, col, aggrnames):
+    '''For each aggregator named in *aggrnames*, add a column at the cursor holding
+    that aggregator's result for *col*, computed over rows grouped by key columns.
+    Names that are not in vd.aggregators are skipped.'''
+    for aggrname in aggrnames:
+        aggrs = vd.aggregators.get(aggrname)
+        if not aggrs: continue  # test before wrapping: [None] is truthy and would slip through
+        for aggr in (aggrs if isinstance(aggrs, list) else [aggrs]):
+            vals = aggregate_groups(sheet, col, sheet.rows, aggr)
+            # list aggregators type the new column by listtype, others by type; else use col.type
+            t = (aggr.listtype if isinstance(aggr, ListAggregator) else aggr.type) or col.type
+            c = SettableColumn(name=f'{col.name}_{aggr.name}', type=t)
+            sheet.addColumnAtCursor(c)
+            c.setValues(sheet.rows, *vals)
+
+Sheet.addCommand('+', 'aggregate-col', 'addAggregators([cursorCol], chooseAggregators())', 'Add aggregator to current column')
 Sheet.addCommand('z+', 'memo-aggregate', 'cursorCol.memo_aggregate(chooseAggregators(), selectedRows or rows)', 'memo result of aggregator over values in selected rows for current column')
 ColumnsSheet.addCommand('g+', 'aggregate-cols', 'addAggregators(selectedRows or source[0].nonKeyVisibleCols, chooseAggregators())', 'add aggregators to selected source columns')
+Sheet.addCommand('', 'addcol-aggregate', 'addcol_aggregate(cursorCol, chooseAggregators(prompt="aggregator for groups: "))', 'add column(s) with aggregator of rows grouped by key columns')
+
+vd.addGlobals(
+    ListAggregator=ListAggregator
+)
 
 vd.addMenuItems('''
     Column > Add aggregator > aggregate-col
+    Column > Add column > aggregate > addcol-aggregate
 ''')
+