diff --git a/visidata/aggregators.py b/visidata/aggregators.py index 3b424a5ae..73312c9f5 100644 --- a/visidata/aggregators.py +++ b/visidata/aggregators.py @@ -4,6 +4,7 @@ import collections import statistics from copy import copy +import datetime from visidata import Progress, Sheet, Column, ColumnsSheet, VisiData from visidata import vd, anytype, vlen, asyncthread, wrapply, AttrDict, date, INPROGRESS @@ -105,10 +106,46 @@ def aggregator(vd, name, funcValues, helpstr='', *, type=None): def mean(vals): vals = list(vals) if vals: - return float(sum(vals))/len(vals) + if type(vals[0]) is date: + vals = [d.timestamp() for d in vals] + ans = float(sum(vals))/len(vals) + return datetime.date.fromtimestamp(ans) + elif isinstance(vals[0], datetime.timedelta): + return datetime.timedelta(seconds=vsum(vals)/datetime.timedelta(seconds=len(vals))) + else: + return float(sum(vals))/len(vals) def vsum(vals): - return sum(vals, start=type(vals[0] if len(vals) else 0)()) #1996 + if vals: + if type(vals[0]) is date: + vd.error('dates cannot be summed') + return None + return sum(vals, start=type(vals[0])()) #1996 + else: + return 0 + +def median(vals): + if not vals: + return None + if type(vals[0]) is date: + # when the length is even, statistics.median needs to add + # two midpoints to average them, so convert to timestamps + vals = [d.timestamp() for d in vals] + return datetime.date.fromtimestamp(statistics.median(vals)) + return statistics.median(vals) + +def stdev(vals): + if vals and len(vals) >= 2: + if type(vals[0]) is date: + vals = [d.timestamp() for d in vals] + return datetime.timedelta(seconds=statistics.stdev(vals)) + elif isinstance(vals[0], datetime.timedelta): + vals = [d.total_seconds() for d in vals] + return datetime.timedelta(seconds=statistics.stdev(vals)) + return statistics.stdev(vals) + else: + vd.error('stdev requires at least two data points') + return None # http://code.activestate.com/recipes/511478-finding-the-percentile-of-the-values/ def _percentile(N, percent, key=lambda x:x): @@ -146,17 +183,17 @@ def quantiles(q, helpstr): return [PercentileAggregator(round(100*i/q), helpstr) for i in range(1, q)] -vd.aggregator('min', min, 'minimum value') -vd.aggregator('max', max, 'maximum value') -vd.aggregator('avg', mean, 'arithmetic mean of values', type=float) -vd.aggregator('mean', mean, 'arithmetic mean of values', type=float) -vd.aggregator('median', statistics.median, 'median of values') +vd.aggregator('min', min, 'minimum value', type=anytype) +vd.aggregator('max', max, 'maximum value', type=anytype) +vd.aggregator('avg', mean, 'arithmetic mean of values', type=anytype) +vd.aggregator('mean', mean, 'arithmetic mean of values', type=anytype) +vd.aggregator('median', median, 'median of values', type=anytype) vd.aggregator('mode', statistics.mode, 'mode of values') -vd.aggregator('sum', vsum, 'sum of values') +vd.aggregator('sum', vsum, 'sum of values', type=anytype) vd.aggregator('distinct', set, 'distinct values', type=vlen) vd.aggregator('count', lambda values: sum(1 for v in values), 'number of values', type=int) vd.aggregator('list', list, 'list of values', type=anytype) -vd.aggregator('stdev', statistics.stdev, 'standard deviation of values', type=float) +vd.aggregator('stdev', stdev, 'standard deviation of values', type=anytype) vd.aggregators['q3'] = quantiles(3, 'tertiles (33/66th pctile)') vd.aggregators['q4'] = quantiles(4, 'quartiles (25/50/75th pctile)') @@ -236,6 +273,8 @@ def _aggregateTotalAsync(col, agg): @asyncthread def memo_aggregate(col, agg_choices, rows): 'Show aggregated value in status, and add to memory.' + if not rows: + vd.fail('no rows to aggregate') for agg_choice in agg_choices: agg = vd.aggregators.get(agg_choice) if not agg: continue @@ -243,7 +282,11 @@ def memo_aggregate(col, agg_choices, rows): for agg in aggs: aggval = agg.aggregate(col, rows) typedval = wrapply(agg.type or col.type, aggval) - dispval = col.format(typedval) + if agg.name == 'stdev' and (col.type is date): + # col type is a date, but typedval is a timedelta + dispval = str(typedval) + else: + dispval = col.format(typedval) k = col.name+'_'+agg.name vd.status(f'{k}={dispval}') vd.memory[k] = typedval diff --git a/visidata/features/describe.py b/visidata/features/describe.py index be26311c8..1181cbf10 100644 --- a/visidata/features/describe.py +++ b/visidata/features/describe.py @@ -1,11 +1,12 @@ from copy import copy -from statistics import mode, median, mean, stdev +from statistics import mode +import datetime -from visidata import vd, Column, ColumnAttr, vlen, RowColorizer, asyncthread, Progress, wrapply +from visidata import vd, Column, ColumnAttr, vlen, RowColorizer, asyncthread, Progress, wrapply, anytype, date from visidata import BaseSheet, TableSheet, ColumnsSheet, SheetsSheet -vd.option('describe_aggrs', 'mean stdev', 'numeric aggregators to calculate on Describe sheet', help=vd.help_aggregators) +vd.option('describe_aggrs', 'min max sum median mean stdev', 'numeric aggregators to calculate on Describe sheet', help=vd.help_aggregators) @Column.api @@ -44,10 +45,6 @@ class DescribeSheet(ColumnsSheet): DescribeColumn('nulls', type=vlen), DescribeColumn('distinct',type=vlen), DescribeColumn('mode', type=str), - DescribeColumn('min', type=str), - DescribeColumn('max', type=str), - DescribeColumn('sum'), - DescribeColumn('median', type=str), ] colorizers = [ RowColorizer(7, 'color_key_col', lambda s,c,r,v: r and r in r.sheet.keyCols), @@ -61,7 +58,8 @@ def loader(self): self.resetCols() for aggrname in vd.options.describe_aggrs.split(): - self.addColumn(DescribeColumn(aggrname, type=float)) + aggrtype = vd.aggregators[aggrname].type + self.addColumn(DescribeColumn(aggrname, type=aggrtype)) for srccol in Progress(self.rows, 'categorizing'): if not srccol.hidden: @@ -87,12 +85,15 @@ def reloadColumn(self, srccol): d['distinct'].add(v) except Exception as e: d['errors'].append(sr) + if not vals: + return d['mode'] = self.calcStatistic(d, mode, vals) - if vd.isNumeric(srccol): - for func in [min, max, sum, median]: # use type - d[func.__name__] = self.calcStatistic(d, func, vals) + if vd.isNumeric(srccol) or \ + isinstance(vals[0], (datetime.timedelta, datetime.date)): for aggrname in vd.options.describe_aggrs.split(): + if aggrname == 'sum' and (srccol.type is date or isinstance(vals[0], datetime.date)): + continue aggr = vd.aggregators[aggrname].funcValues d[aggrname] = self.calcStatistic(d, aggr, vals)