From 91e77c6260b87f9691af586756635bbf8d3533a7 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sat, 5 Dec 2015 13:01:50 -0500 Subject: [PATCH] int-preservation feature for mlr top --- c/containers/mlr_val.c | 144 ++++++++++++++++++++++++- c/containers/mlr_val.h | 11 ++ c/containers/top_keeper.c | 15 ++- c/containers/top_keeper.h | 9 +- c/mapping/mapper_top.c | 29 +++-- c/reg_test/expected/out | 144 +++++++++++++------------ c/reg_test/run | 20 ++-- c/todo.txt | 2 - c/unit_test/test_multiple_containers.c | 50 +++++---- doc/mlr.1.premade | 1 + doc/reference.html | 1 + 11 files changed, 303 insertions(+), 123 deletions(-) diff --git a/c/containers/mlr_val.c b/c/containers/mlr_val.c index 272600d816..52d0a94b1b 100644 --- a/c/containers/mlr_val.c +++ b/c/containers/mlr_val.c @@ -35,6 +35,9 @@ // For some Linux distros, in spite of including time.h: char *strptime(const char *s, const char *format, struct tm *tm); +typedef int mv_i_nn_comparator_func_t(mv_t* pa, mv_t* pb); +typedef int mv_i_cncn_comparator_func_t(const mv_t* pa, const mv_t* pb); + // ---------------------------------------------------------------- mv_t MV_NULL = { .type = MT_NULL, @@ -1805,6 +1808,103 @@ mv_t ge_op_func(mv_t* pval1, mv_t* pval2) { return (ge_dispositions[pval1->type] mv_t lt_op_func(mv_t* pval1, mv_t* pval2) { return (lt_dispositions[pval1->type][pval2->type])(pval1, pval2); } mv_t le_op_func(mv_t* pval1, mv_t* pval2) { return (le_dispositions[pval1->type][pval2->type])(pval1, pval2); } +// ---------------------------------------------------------------- +static int eq_i_ii(mv_t* pa, mv_t* pb) { return pa->u.intv == pb->u.intv; } +static int ne_i_ii(mv_t* pa, mv_t* pb) { return pa->u.intv != pb->u.intv; } +static int gt_i_ii(mv_t* pa, mv_t* pb) { return pa->u.intv > pb->u.intv; } +static int ge_i_ii(mv_t* pa, mv_t* pb) { return pa->u.intv >= pb->u.intv; } +static int lt_i_ii(mv_t* pa, mv_t* pb) { return pa->u.intv < pb->u.intv; } +static int le_i_ii(mv_t* pa, mv_t* pb) { return pa->u.intv 
<= pb->u.intv; } + +static int eq_i_ff(mv_t* pa, mv_t* pb) { return pa->u.fltv == pb->u.fltv; } +static int ne_i_ff(mv_t* pa, mv_t* pb) { return pa->u.fltv != pb->u.fltv; } +static int gt_i_ff(mv_t* pa, mv_t* pb) { return pa->u.fltv > pb->u.fltv; } +static int ge_i_ff(mv_t* pa, mv_t* pb) { return pa->u.fltv >= pb->u.fltv; } +static int lt_i_ff(mv_t* pa, mv_t* pb) { return pa->u.fltv < pb->u.fltv; } +static int le_i_ff(mv_t* pa, mv_t* pb) { return pa->u.fltv <= pb->u.fltv; } + +static int eq_i_fi(mv_t* pa, mv_t* pb) { return pa->u.fltv == pb->u.intv; } +static int ne_i_fi(mv_t* pa, mv_t* pb) { return pa->u.fltv != pb->u.intv; } +static int gt_i_fi(mv_t* pa, mv_t* pb) { return pa->u.fltv > pb->u.intv; } +static int ge_i_fi(mv_t* pa, mv_t* pb) { return pa->u.fltv >= pb->u.intv; } +static int lt_i_fi(mv_t* pa, mv_t* pb) { return pa->u.fltv < pb->u.intv; } +static int le_i_fi(mv_t* pa, mv_t* pb) { return pa->u.fltv <= pb->u.intv; } + +static int eq_i_if(mv_t* pa, mv_t* pb) { return pa->u.intv == pb->u.fltv; } +static int ne_i_if(mv_t* pa, mv_t* pb) { return pa->u.intv != pb->u.fltv; } +static int gt_i_if(mv_t* pa, mv_t* pb) { return pa->u.intv > pb->u.fltv; } +static int ge_i_if(mv_t* pa, mv_t* pb) { return pa->u.intv >= pb->u.fltv; } +static int lt_i_if(mv_t* pa, mv_t* pb) { return pa->u.intv < pb->u.fltv; } +static int le_i_if(mv_t* pa, mv_t* pb) { return pa->u.intv <= pb->u.fltv; } + +static mv_i_nn_comparator_func_t* ieq_dispositions[MT_MAX][MT_MAX] = { + // NULL ERROR BOOL FLOAT INT STRING + /*NULL*/ {NULL, NULL, NULL, NULL, NULL, NULL}, + /*ERROR*/ {NULL, NULL, NULL, NULL, NULL, NULL}, + /*BOOL*/ {NULL, NULL, NULL, NULL, NULL, NULL}, + /*FLOAT*/ {NULL, NULL, NULL, eq_i_ff, eq_i_fi, NULL}, + /*INT*/ {NULL, NULL, NULL, eq_i_if, eq_i_ii, NULL}, + /*STRING*/ {NULL, NULL, NULL, NULL, NULL, NULL}, +}; + +static mv_i_nn_comparator_func_t* ine_dispositions[MT_MAX][MT_MAX] = { + // NULL ERROR BOOL FLOAT INT STRING + /*NULL*/ {NULL, NULL, NULL, NULL, NULL, NULL}, + 
/*ERROR*/ {NULL, NULL, NULL, NULL, NULL, NULL}, + /*BOOL*/ {NULL, NULL, NULL, NULL, NULL, NULL}, + /*FLOAT*/ {NULL, NULL, NULL, ne_i_ff, ne_i_fi, NULL}, + /*INT*/ {NULL, NULL, NULL, ne_i_if, ne_i_ii, NULL}, + /*STRING*/ {NULL, NULL, NULL, NULL, NULL, NULL}, +}; + +static mv_i_nn_comparator_func_t* igt_dispositions[MT_MAX][MT_MAX] = { + // NULL ERROR BOOL FLOAT INT STRING + /*NULL*/ {NULL, NULL, NULL, NULL, NULL, NULL}, + /*ERROR*/ {NULL, NULL, NULL, NULL, NULL, NULL}, + /*BOOL*/ {NULL, NULL, NULL, NULL, NULL, NULL}, + /*FLOAT*/ {NULL, NULL, NULL, gt_i_ff, gt_i_fi, NULL}, + /*INT*/ {NULL, NULL, NULL, gt_i_if, gt_i_ii, NULL}, + /*STRING*/ {NULL, NULL, NULL, NULL, NULL, NULL}, +}; + +static mv_i_nn_comparator_func_t* ige_dispositions[MT_MAX][MT_MAX] = { + // NULL ERROR BOOL FLOAT INT STRING + /*NULL*/ {NULL, NULL, NULL, NULL, NULL, NULL}, + /*ERROR*/ {NULL, NULL, NULL, NULL, NULL, NULL}, + /*BOOL*/ {NULL, NULL, NULL, NULL, NULL, NULL}, + /*FLOAT*/ {NULL, NULL, NULL, ge_i_ff, ge_i_fi, NULL}, + /*INT*/ {NULL, NULL, NULL, ge_i_if, ge_i_ii, NULL}, + /*STRING*/ {NULL, NULL, NULL, NULL, NULL, NULL}, +}; + +static mv_i_nn_comparator_func_t* ilt_dispositions[MT_MAX][MT_MAX] = { + // NULL ERROR BOOL FLOAT INT STRING + /*NULL*/ {NULL, NULL, NULL, NULL, NULL, NULL}, + /*ERROR*/ {NULL, NULL, NULL, NULL, NULL, NULL}, + /*BOOL*/ {NULL, NULL, NULL, NULL, NULL, NULL}, + /*FLOAT*/ {NULL, NULL, NULL, lt_i_ff, lt_i_fi, NULL}, + /*INT*/ {NULL, NULL, NULL, lt_i_if, lt_i_ii, NULL}, + /*STRING*/ {NULL, NULL, NULL, NULL, NULL, NULL}, +}; + +static mv_i_nn_comparator_func_t* ile_dispositions[MT_MAX][MT_MAX] = { + // NULL ERROR BOOL FLOAT INT STRING + /*NULL*/ {NULL, NULL, NULL, NULL, NULL, NULL}, + /*ERROR*/ {NULL, NULL, NULL, NULL, NULL, NULL}, + /*BOOL*/ {NULL, NULL, NULL, NULL, NULL, NULL}, + /*FLOAT*/ {NULL, NULL, NULL, le_i_ff, le_i_fi, NULL}, + /*INT*/ {NULL, NULL, NULL, le_i_if, le_i_ii, NULL}, + /*STRING*/ {NULL, NULL, NULL, NULL, NULL, NULL}, +}; + + +int mv_i_nn_eq(mv_t* pval1, 
mv_t* pval2) { return (ieq_dispositions[pval1->type][pval2->type])(pval1, pval2); } +int mv_i_nn_ne(mv_t* pval1, mv_t* pval2) { return (ine_dispositions[pval1->type][pval2->type])(pval1, pval2); } +int mv_i_nn_gt(mv_t* pval1, mv_t* pval2) { return (igt_dispositions[pval1->type][pval2->type])(pval1, pval2); } +int mv_i_nn_ge(mv_t* pval1, mv_t* pval2) { return (ige_dispositions[pval1->type][pval2->type])(pval1, pval2); } +int mv_i_nn_lt(mv_t* pval1, mv_t* pval2) { return (ilt_dispositions[pval1->type][pval2->type])(pval1, pval2); } +int mv_i_nn_le(mv_t* pval1, mv_t* pval2) { return (ile_dispositions[pval1->type][pval2->type])(pval1, pval2); } + // ---------------------------------------------------------------- // arg2 evaluates to string via compound expression; regexes compiled on each call. mv_t matches_no_precomp_func(mv_t* pval1, mv_t* pval2) { @@ -1849,7 +1949,6 @@ mv_t does_not_match_precomp_func(mv_t* pval1, regex_t* pregex, string_builder_t* } // ---------------------------------------------------------------- -typedef int mv_comparator_func_t(const mv_t* pa, const mv_t* pb); static int mv_ff_comparator(const mv_t* pa, const mv_t* pb) { double d = pa->u.fltv - pb->u.fltv; return (d < 0) ? -1 : (d > 0) ? 1 : 0; @@ -1867,7 +1966,7 @@ static int mv_ii_comparator(const mv_t* pa, const mv_t* pb) { return (d < 0) ? -1 : (d > 0) ? 1 : 0; } // We assume mv_t's coming into percentile keeper are int or double -- in particular, non-null. 
-static mv_comparator_func_t* mv_comparator_dispositions[MT_MAX][MT_MAX] = { +static mv_i_cncn_comparator_func_t* mv_comparator_dispositions[MT_MAX][MT_MAX] = { // NULL ERROR BOOL FLOAT INT STRING /*NULL*/ {NULL, NULL, NULL, NULL, NULL, NULL}, /*ERROR*/ {NULL, NULL, NULL, NULL, NULL, NULL}, @@ -1881,3 +1980,44 @@ int mv_nn_comparator(const void* pva, const void* pvb) { const mv_t* pb = pvb; return mv_comparator_dispositions[pa->type][pb->type](pa, pb); } + +// ---------------------------------------------------------------- +int mlr_bsearch_mv_n_for_insert(mv_t* array, int size, mv_t* pvalue) { + int lo = 0; + int hi = size-1; + int mid = (hi+lo)/2; + int newmid; + + if (size == 0) + return 0; + if (mv_i_nn_gt(pvalue, &array[0])) + return 0; + if (mv_i_nn_lt(pvalue, &array[hi])) + return size; + + while (lo < hi) { + mv_t* pa = &array[mid]; + if (mv_i_nn_eq(pvalue, pa)) { + return mid; + } + else if (mv_i_nn_gt(pvalue, pa)) { + hi = mid; + newmid = (hi+lo)/2; + } + else { + lo = mid; + newmid = (hi+lo)/2; + } + if (mid == newmid) { + if (mv_i_nn_ge(pvalue, &array[lo])) + return lo; + else if (mv_i_nn_ge(pvalue, &array[hi])) + return hi; + else + return hi+1; + } + mid = newmid; + } + + return lo; +} diff --git a/c/containers/mlr_val.h b/c/containers/mlr_val.h index 365783c541..7f5e4b43d3 100644 --- a/c/containers/mlr_val.h +++ b/c/containers/mlr_val.h @@ -280,6 +280,7 @@ mv_t does_not_match_no_precomp_func(mv_t* pval1, mv_t* pval2); mv_t matches_precomp_func(mv_t* pval1, regex_t* pregex, string_builder_t* psb); mv_t does_not_match_precomp_func(mv_t* pval1, regex_t* pregex, string_builder_t* psb); +// For filter/put DSLs: mv_t eq_op_func(mv_t* pval1, mv_t* pval2); mv_t ne_op_func(mv_t* pval1, mv_t* pval2); mv_t gt_op_func(mv_t* pval1, mv_t* pval2); @@ -287,8 +288,18 @@ mv_t ge_op_func(mv_t* pval1, mv_t* pval2); mv_t lt_op_func(mv_t* pval1, mv_t* pval2); mv_t le_op_func(mv_t* pval1, mv_t* pval2); +// For non-DSL comparison of mlrvals: +int mv_i_nn_eq(mv_t* pval1, 
mv_t* pval2); +int mv_i_nn_ne(mv_t* pval1, mv_t* pval2); +int mv_i_nn_gt(mv_t* pval1, mv_t* pval2); +int mv_i_nn_ge(mv_t* pval1, mv_t* pval2); +int mv_i_nn_lt(mv_t* pval1, mv_t* pval2); +int mv_i_nn_le(mv_t* pval1, mv_t* pval2); + // ---------------------------------------------------------------- // For qsort of numeric mlrvals. int mv_nn_comparator(const void* pva, const void* pvb); +int mlr_bsearch_mv_n_for_insert(mv_t* array, int size, mv_t* pvalue); + #endif // MLR_VAL_H diff --git a/c/containers/top_keeper.c b/c/containers/top_keeper.c index 08ee83c4f6..46946dba70 100644 --- a/c/containers/top_keeper.c +++ b/c/containers/top_keeper.c @@ -6,7 +6,7 @@ // ---------------------------------------------------------------- top_keeper_t* top_keeper_alloc(int capacity) { top_keeper_t* ptop_keeper = mlr_malloc_or_die(sizeof(top_keeper_t)); - ptop_keeper->top_values = mlr_malloc_or_die(capacity*sizeof(double)); + ptop_keeper->top_values = mlr_malloc_or_die(capacity*sizeof(mv_t)); ptop_keeper->top_precords = mlr_malloc_or_die(capacity*sizeof(lrec_t*)); ptop_keeper->size = 0; ptop_keeper->capacity = capacity; @@ -56,8 +56,8 @@ void top_keeper_free(top_keeper_t* ptop_keeper) { // [9 ] [9 ] [9 #] [9 #] // Our caller, mapper_top, feeds us records. We keep them or free them. 
-void top_keeper_add(top_keeper_t* ptop_keeper, double value, lrec_t* prec) { - int destidx = mlr_bsearch_double_for_insert(ptop_keeper->top_values, ptop_keeper->size, value); +void top_keeper_add(top_keeper_t* ptop_keeper, mv_t value, lrec_t* prec) { + int destidx = mlr_bsearch_mv_n_for_insert(ptop_keeper->top_values, ptop_keeper->size, &value); if (ptop_keeper->size < ptop_keeper->capacity) { for (int i = ptop_keeper->size-1; i >= destidx; i--) { ptop_keeper->top_values[i+1] = ptop_keeper->top_values[i]; @@ -84,8 +84,13 @@ void top_keeper_add(top_keeper_t* ptop_keeper, double value, lrec_t* prec) { // ---------------------------------------------------------------- void top_keeper_print(top_keeper_t* ptop_keeper) { printf("top_keeper dump:\n"); - for (int i = 0; i < ptop_keeper->size; i++) - printf("[%02d] %.8lf\n", i, ptop_keeper->top_values[i]); + for (int i = 0; i < ptop_keeper->size; i++) { + mv_t* pvalue = &ptop_keeper->top_values[i]; + if (pvalue->type == MT_FLOAT) + printf("[%02d] %.8lf\n", i, pvalue->u.fltv); + else + printf("[%02d] %lld\n", i, pvalue->u.intv); + } for (int i = ptop_keeper->size; i < ptop_keeper->capacity; i++) printf("[%02d] ---\n", i); } diff --git a/c/containers/top_keeper.h b/c/containers/top_keeper.h index 69b1b6ce62..cbd1acea92 100644 --- a/c/containers/top_keeper.h +++ b/c/containers/top_keeper.h @@ -4,18 +4,19 @@ #ifndef TOP_KEEPER_H #define TOP_KEEPER_H +#include "containers/mlr_val.h" #include "containers/lrec.h" typedef struct _top_keeper_t { - double* top_values; + mv_t* top_values; lrec_t** top_precords; - int size; - int capacity; + int size; + int capacity; } top_keeper_t; top_keeper_t* top_keeper_alloc(int capacity); void top_keeper_free(top_keeper_t* ptop_keeper); -void top_keeper_add(top_keeper_t* ptop_keeper, double value, lrec_t* prec); +void top_keeper_add(top_keeper_t* ptop_keeper, mv_t value, lrec_t* prec); // For debug/test void top_keeper_print(top_keeper_t* ptop_keeper); diff --git a/c/mapping/mapper_top.c 
b/c/mapping/mapper_top.c index 81a29ed2fd..821a9a0d3f 100644 --- a/c/mapping/mapper_top.c +++ b/c/mapping/mapper_top.c @@ -17,9 +17,10 @@ typedef struct _mapper_top_state_t { slls_t* pvalue_field_names; slls_t* pgroup_by_field_names; - int show_full_records; int top_count; - double sign; // for +1 for max; -1 for min + int show_full_records; + int allow_int_float; + mv_t sign; // +1 for max; -1 for min lhmslv_t* groups; } mapper_top_state_t; @@ -30,7 +31,7 @@ static void mapper_top_ingest(lrec_t* pinrec, mapper_top_state_t* pstate); static sllv_t* mapper_top_emit(mapper_top_state_t* pstate, context_t* pctx); static void mapper_top_free(void* pvstate); static mapper_t* mapper_top_alloc(slls_t* pvalue_field_names, slls_t* pgroup_by_field_names, - int top_count, int do_max, int show_full_records); + int top_count, int do_max, int show_full_records, int allow_int_float); static void mapper_top_usage(FILE* o, char* argv0, char* verb); static mapper_t* mapper_top_parse_cli(int* pargi, int argc, char** argv); @@ -51,6 +52,8 @@ static void mapper_top_usage(FILE* o, char* argv0, char* verb) { fprintf(o, " to print only value and group-by fields. 
Requires a single\n"); fprintf(o, " value-field name only.\n"); fprintf(o, "--min Print top smallest values; default is top largest values.\n"); + fprintf(o, "-F Keep top values as floats even if they look like integers.\n"); + fprintf(o, "Prints the n records with smallest/largest values at specified fields,\n"); fprintf(o, "optionally by category.\n"); } @@ -61,6 +64,7 @@ static mapper_t* mapper_top_parse_cli(int* pargi, int argc, char** argv) { slls_t* pgroup_by_field_names = slls_alloc(); int show_full_records = FALSE; int do_max = TRUE; + int allow_int_float = TRUE; char* verb = argv[(*pargi)++]; @@ -71,6 +75,7 @@ static mapper_t* mapper_top_parse_cli(int* pargi, int argc, char** argv) { ap_define_true_flag(pstate, "-a", &show_full_records); ap_define_true_flag(pstate, "--max", &do_max); ap_define_false_flag(pstate, "--min", &do_max); + ap_define_false_flag(pstate, "-F", &allow_int_float); if (!ap_parse(pstate, verb, pargi, argc, argv)) { mapper_top_usage(stderr, argv[0], verb); @@ -86,12 +91,12 @@ static mapper_t* mapper_top_parse_cli(int* pargi, int argc, char** argv) { } return mapper_top_alloc(pvalue_field_names, pgroup_by_field_names, - top_count, do_max, show_full_records); + top_count, do_max, show_full_records, allow_int_float); } // ---------------------------------------------------------------- static mapper_t* mapper_top_alloc(slls_t* pvalue_field_names, slls_t* pgroup_by_field_names, - int top_count, int do_max, int show_full_records) + int top_count, int do_max, int show_full_records, int allow_int_float) { mapper_t* pmapper = mlr_malloc_or_die(sizeof(mapper_t)); @@ -100,8 +105,9 @@ static mapper_t* mapper_top_alloc(slls_t* pvalue_field_names, slls_t* pgroup_by_ pstate->pvalue_field_names = slls_copy(pvalue_field_names); pstate->pgroup_by_field_names = slls_copy(pgroup_by_field_names); pstate->show_full_records = show_full_records; + pstate->allow_int_float = allow_int_float; pstate->top_count = top_count; - pstate->sign = do_max ? 
1.0 : -1.0; + pstate->sign = mv_from_int(do_max ? 1 : -1); pstate->groups = lhmslv_alloc(); pmapper->pvstate = pstate; @@ -161,7 +167,9 @@ static void mapper_top_ingest(lrec_t* pinrec, mapper_top_state_t* pstate) { for ( ; pa != NULL && pb != NULL; pa = pa->pnext, pb = pb->pnext) { char* value_field_name = pa->value; char* value_field_sval = pb->value; - double value_field_dval = mlr_double_from_string_or_die(value_field_sval); + mv_t value_field_nval = pstate->allow_int_float + ? mv_scan_number_or_die(value_field_sval) + : mv_from_float(mlr_double_from_string_or_die(value_field_sval)); top_keeper_t* ptop_keeper_for_group = lhmsv_get(group_to_acc_field, value_field_name); if (ptop_keeper_for_group == NULL) { @@ -171,7 +179,7 @@ static void mapper_top_ingest(lrec_t* pinrec, mapper_top_state_t* pstate) { // The top-keeper object will free the record if it isn't retained, or // keep it if it is. - top_keeper_add(ptop_keeper_for_group, value_field_dval * pstate->sign, + top_keeper_add(ptop_keeper_for_group, n_nn_times_func(&value_field_nval, &pstate->sign), pstate->show_full_records ? 
pinrec : NULL); } } @@ -221,10 +229,9 @@ static sllv_t* mapper_top_emit(mapper_top_state_t* pstate, context_t* pctx) { char* key = mlr_paste_2_strings(value_field_name, "_top"); if (i < ptop_keeper_for_group->size) { - double fltv = ptop_keeper_for_group->top_values[i] * pstate->sign; - char* strv = mlr_alloc_string_from_double(fltv, MLR_GLOBALS.ofmt); + mv_t numv = n_nn_times_func(&ptop_keeper_for_group->top_values[i], &pstate->sign); + char* strv = mv_format_val(&numv); lrec_put(poutrec, key, strv, LREC_FREE_ENTRY_KEY|LREC_FREE_ENTRY_VALUE); - free(strv); } else { lrec_put(poutrec, key, "", LREC_FREE_ENTRY_KEY); } diff --git a/c/reg_test/expected/out b/c/reg_test/expected/out index 9929072112..9e27e7fb35 100644 --- a/c/reg_test/expected/out +++ b/c/reg_test/expected/out @@ -548,6 +548,80 @@ a=eks,b=zee,top_idx=2,x_top=,y_top= a=pan,b=wye,top_idx=1,x_top=0.502626,y_top=0.952618 a=pan,b=wye,top_idx=2,x_top=,y_top= +mlr top -f x,y -n 2 ./reg_test/input/ints.dkvp +top_idx=1,x_top=9,y_top=9 +top_idx=2,x_top=8,y_top=9 + +mlr top -f x,y -n 2 -F ./reg_test/input/ints.dkvp +top_idx=1,x_top=9.000000,y_top=9.000000 +top_idx=2,x_top=8.000000,y_top=9.000000 + +mlr top -n 4 -f x ./reg_test/input/abixy-wide +top_idx=1,x_top=0.999730 +top_idx=2,x_top=0.999308 +top_idx=3,x_top=0.997339 +top_idx=4,x_top=0.997245 + +mlr top -n 1 -f x,y ./reg_test/input/abixy-wide +top_idx=1,x_top=0.999730,y_top=0.999522 + +mlr top -n 4 -f x -g a ./reg_test/input/abixy-wide +a=cat,top_idx=1,x_top=0.996359 +a=cat,top_idx=2,x_top=0.990588 +a=cat,top_idx=3,x_top=0.990160 +a=cat,top_idx=4,x_top=0.986548 +a=pan,top_idx=1,x_top=0.999730 +a=pan,top_idx=2,x_top=0.999308 +a=pan,top_idx=3,x_top=0.996583 +a=pan,top_idx=4,x_top=0.990853 +a=wye,top_idx=1,x_top=0.997339 +a=wye,top_idx=2,x_top=0.996525 +a=wye,top_idx=3,x_top=0.995414 +a=wye,top_idx=4,x_top=0.994219 +a=dog,top_idx=1,x_top=0.997245 +a=dog,top_idx=2,x_top=0.988581 +a=dog,top_idx=3,x_top=0.980266 +a=dog,top_idx=4,x_top=0.977280 
+a=hat,top_idx=1,x_top=0.997014 +a=hat,top_idx=2,x_top=0.995705 +a=hat,top_idx=3,x_top=0.992879 +a=hat,top_idx=4,x_top=0.989639 + +mlr top -n 1 -f x,y -g a ./reg_test/input/abixy-wide +a=cat,top_idx=1,x_top=0.996359,y_top=0.997929 +a=pan,top_idx=1,x_top=0.999730,y_top=0.999319 +a=wye,top_idx=1,x_top=0.997339,y_top=0.999450 +a=dog,top_idx=1,x_top=0.997245,y_top=0.999522 +a=hat,top_idx=1,x_top=0.997014,y_top=0.991400 + +mlr top -a -n 4 -f x ./reg_test/input/abixy-wide +a=pan,b=cat,i=1943,x=0.9997301774748071,y=0.8408385083267972,x2=0.9994604277538093,xy=0.8406116311572011,y2=0.7070093970852334 +a=pan,b=dog,i=552,x=0.9993077485337599,y=0.10019859749555626,x2=0.9986159762796124,xy=0.10012923486952477,y2=0.010039758940076493 +a=wye,b=pan,i=568,x=0.9973387930844961,y=0.5711453274816489,x2=0.9946846681912394,xy=0.569625391586397,y2=0.32620698510412 +a=dog,b=pan,i=1613,x=0.997245171384539,y=0.8188847898688517,x2=0.9944979318497786,xy=0.8166289026169553,y2=0.6705722990785534 + +mlr top -a -n 4 -f x -g a ./reg_test/input/abixy-wide +a=cat,b=dog,i=901,x=0.9963591865818606,y=0.9423606044154952,x2=0.9927316286860669,xy=0.9389296452822133,y2=0.8880435087543374 +a=cat,b=wye,i=135,x=0.9905881531288986,y=0.7789092765450115,x2=0.9812648891193222,xy=0.7715783017076895,y2=0.6066996610878732 +a=cat,b=hat,i=975,x=0.9901595235021967,y=0.8592860989306361,x2=0.9804158819820973,xy=0.8508303142692202,y2=0.738372599815431 +a=cat,b=dog,i=748,x=0.986548192318023,y=0.22327955924073406,x2=0.973277335765959,xy=0.2202760455505111,y2=0.04985376157473647 +a=pan,b=cat,i=1943,x=0.9997301774748071,y=0.8408385083267972,x2=0.9994604277538093,xy=0.8406116311572011,y2=0.7070093970852334 +a=pan,b=dog,i=552,x=0.9993077485337599,y=0.10019859749555626,x2=0.9986159762796124,xy=0.10012923486952477,y2=0.010039758940076493 +a=pan,b=pan,i=756,x=0.9965826588776338,y=0.8864833103309704,x2=0.9931769959756142,xy=0.883453894460285,y2=0.7858526594953555 
+a=pan,b=wye,i=1876,x=0.9908531016291003,y=0.318873165751998,x2=0.9817898690080082,xy=0.3159564653116574,y2=0.10168009583670118 +a=wye,b=pan,i=568,x=0.9973387930844961,y=0.5711453274816489,x2=0.9946846681912394,xy=0.569625391586397,y2=0.32620698510412 +a=wye,b=dog,i=1772,x=0.9965249307670624,y=0.8078766237661098,x2=0.9930619376402985,xy=0.8050691965668507,y2=0.6526646392277285 +a=wye,b=pan,i=450,x=0.9954136776717515,y=0.8564839236024028,x2=0.9908483896960015,xy=0.8525558122597992,y2=0.7335647113893665 +a=wye,b=dog,i=882,x=0.9942189496990836,y=0.3298671968153588,x2=0.988471319940749,xy=0.32796021795794694,y2=0.10881236753482267 +a=dog,b=pan,i=1613,x=0.997245171384539,y=0.8188847898688517,x2=0.9944979318497786,xy=0.8166289026169553,y2=0.6705722990785534 +a=dog,b=hat,i=1680,x=0.9885812365291148,y=0.048768165346359305,x2=0.9772928612174336,xy=0.04821129320136021,y2=0.0023783339512498405 +a=dog,b=wye,i=1331,x=0.9802656048578349,y=0.9431768957041479,x2=0.9609206560672968,xy=0.9245638701553616,y2=0.889582656590113 +a=dog,b=pan,i=978,x=0.9772797759762182,y=0.5328687567382687,x2=0.9550757605321272,xy=0.5207618592099011,y2=0.2839491119077882 +a=hat,b=dog,i=1894,x=0.9970136511026203,y=0.2054109087831395,x2=0.9940362204849775,xy=0.2047974801421852,y2=0.04219364144711526 +a=hat,b=hat,i=818,x=0.9957052214494239,y=0.026702922142899044,x2=0.9914288880216462,xy=0.026588239005642018,y2=0.0007130460509697281 +a=hat,b=hat,i=1513,x=0.9928788688650781,y=0.1805357299725343,x2=0.9858084482387971,xy=0.17925011136486105,y2=0.03259314979671582 +a=hat,b=dog,i=1768,x=0.9896393441122658,y=0.5323182982465756,x2=0.9793860314149557,xy=0.5268031315356986,y2=0.2833627706481302 + mlr --seed 12345 sample -k 2 ./reg_test/input/abixy-het aaa=wye,b=wye,i=3,x=0.20460330576630303,y=0.33831852551664776 a=eks,b=pan,i=2,x=0.7586799647899636,y=0.5221511083334797 @@ -884,76 +958,6 @@ x=1 a=3 -================================================================ -TOP - -mlr top -n 4 -f x ./reg_test/input/abixy-wide 
-top_idx=1,x_top=0.999730 -top_idx=2,x_top=0.999308 -top_idx=3,x_top=0.997339 -top_idx=4,x_top=0.997245 - -mlr top -n 1 -f x,y ./reg_test/input/abixy-wide -top_idx=1,x_top=0.999730,y_top=0.999522 - -mlr top -n 4 -f x -g a ./reg_test/input/abixy-wide -a=cat,top_idx=1,x_top=0.996359 -a=cat,top_idx=2,x_top=0.990588 -a=cat,top_idx=3,x_top=0.990160 -a=cat,top_idx=4,x_top=0.986548 -a=pan,top_idx=1,x_top=0.999730 -a=pan,top_idx=2,x_top=0.999308 -a=pan,top_idx=3,x_top=0.996583 -a=pan,top_idx=4,x_top=0.990853 -a=wye,top_idx=1,x_top=0.997339 -a=wye,top_idx=2,x_top=0.996525 -a=wye,top_idx=3,x_top=0.995414 -a=wye,top_idx=4,x_top=0.994219 -a=dog,top_idx=1,x_top=0.997245 -a=dog,top_idx=2,x_top=0.988581 -a=dog,top_idx=3,x_top=0.980266 -a=dog,top_idx=4,x_top=0.977280 -a=hat,top_idx=1,x_top=0.997014 -a=hat,top_idx=2,x_top=0.995705 -a=hat,top_idx=3,x_top=0.992879 -a=hat,top_idx=4,x_top=0.989639 - -mlr top -n 1 -f x,y -g a ./reg_test/input/abixy-wide -a=cat,top_idx=1,x_top=0.996359,y_top=0.997929 -a=pan,top_idx=1,x_top=0.999730,y_top=0.999319 -a=wye,top_idx=1,x_top=0.997339,y_top=0.999450 -a=dog,top_idx=1,x_top=0.997245,y_top=0.999522 -a=hat,top_idx=1,x_top=0.997014,y_top=0.991400 - -mlr top -a -n 4 -f x ./reg_test/input/abixy-wide -a=pan,b=cat,i=1943,x=0.9997301774748071,y=0.8408385083267972,x2=0.9994604277538093,xy=0.8406116311572011,y2=0.7070093970852334 -a=pan,b=dog,i=552,x=0.9993077485337599,y=0.10019859749555626,x2=0.9986159762796124,xy=0.10012923486952477,y2=0.010039758940076493 -a=wye,b=pan,i=568,x=0.9973387930844961,y=0.5711453274816489,x2=0.9946846681912394,xy=0.569625391586397,y2=0.32620698510412 -a=dog,b=pan,i=1613,x=0.997245171384539,y=0.8188847898688517,x2=0.9944979318497786,xy=0.8166289026169553,y2=0.6705722990785534 - -mlr top -a -n 4 -f x -g a ./reg_test/input/abixy-wide -a=cat,b=dog,i=901,x=0.9963591865818606,y=0.9423606044154952,x2=0.9927316286860669,xy=0.9389296452822133,y2=0.8880435087543374 
-a=cat,b=wye,i=135,x=0.9905881531288986,y=0.7789092765450115,x2=0.9812648891193222,xy=0.7715783017076895,y2=0.6066996610878732 -a=cat,b=hat,i=975,x=0.9901595235021967,y=0.8592860989306361,x2=0.9804158819820973,xy=0.8508303142692202,y2=0.738372599815431 -a=cat,b=dog,i=748,x=0.986548192318023,y=0.22327955924073406,x2=0.973277335765959,xy=0.2202760455505111,y2=0.04985376157473647 -a=pan,b=cat,i=1943,x=0.9997301774748071,y=0.8408385083267972,x2=0.9994604277538093,xy=0.8406116311572011,y2=0.7070093970852334 -a=pan,b=dog,i=552,x=0.9993077485337599,y=0.10019859749555626,x2=0.9986159762796124,xy=0.10012923486952477,y2=0.010039758940076493 -a=pan,b=pan,i=756,x=0.9965826588776338,y=0.8864833103309704,x2=0.9931769959756142,xy=0.883453894460285,y2=0.7858526594953555 -a=pan,b=wye,i=1876,x=0.9908531016291003,y=0.318873165751998,x2=0.9817898690080082,xy=0.3159564653116574,y2=0.10168009583670118 -a=wye,b=pan,i=568,x=0.9973387930844961,y=0.5711453274816489,x2=0.9946846681912394,xy=0.569625391586397,y2=0.32620698510412 -a=wye,b=dog,i=1772,x=0.9965249307670624,y=0.8078766237661098,x2=0.9930619376402985,xy=0.8050691965668507,y2=0.6526646392277285 -a=wye,b=pan,i=450,x=0.9954136776717515,y=0.8564839236024028,x2=0.9908483896960015,xy=0.8525558122597992,y2=0.7335647113893665 -a=wye,b=dog,i=882,x=0.9942189496990836,y=0.3298671968153588,x2=0.988471319940749,xy=0.32796021795794694,y2=0.10881236753482267 -a=dog,b=pan,i=1613,x=0.997245171384539,y=0.8188847898688517,x2=0.9944979318497786,xy=0.8166289026169553,y2=0.6705722990785534 -a=dog,b=hat,i=1680,x=0.9885812365291148,y=0.048768165346359305,x2=0.9772928612174336,xy=0.04821129320136021,y2=0.0023783339512498405 -a=dog,b=wye,i=1331,x=0.9802656048578349,y=0.9431768957041479,x2=0.9609206560672968,xy=0.9245638701553616,y2=0.889582656590113 -a=dog,b=pan,i=978,x=0.9772797759762182,y=0.5328687567382687,x2=0.9550757605321272,xy=0.5207618592099011,y2=0.2839491119077882 
-a=hat,b=dog,i=1894,x=0.9970136511026203,y=0.2054109087831395,x2=0.9940362204849775,xy=0.2047974801421852,y2=0.04219364144711526 -a=hat,b=hat,i=818,x=0.9957052214494239,y=0.026702922142899044,x2=0.9914288880216462,xy=0.026588239005642018,y2=0.0007130460509697281 -a=hat,b=hat,i=1513,x=0.9928788688650781,y=0.1805357299725343,x2=0.9858084482387971,xy=0.17925011136486105,y2=0.03259314979671582 -a=hat,b=dog,i=1768,x=0.9896393441122658,y=0.5323182982465756,x2=0.9793860314149557,xy=0.5268031315356986,y2=0.2833627706481302 - - ================================================================ JOIN diff --git a/c/reg_test/run b/c/reg_test/run index f9efeb94a2..f6bebacb49 100755 --- a/c/reg_test/run +++ b/c/reg_test/run @@ -156,6 +156,16 @@ run_mlr top -f x,y -n 2 $indir/abixy-het run_mlr top -f x,y -n 2 -g a $indir/abixy-het run_mlr top -f x,y -n 2 -g a,b $indir/abixy-het +run_mlr top -f x,y -n 2 $indir/ints.dkvp +run_mlr top -f x,y -n 2 -F $indir/ints.dkvp + +run_mlr top -n 4 -f x $indir/abixy-wide +run_mlr top -n 1 -f x,y $indir/abixy-wide +run_mlr top -n 4 -f x -g a $indir/abixy-wide +run_mlr top -n 1 -f x,y -g a $indir/abixy-wide +run_mlr top -a -n 4 -f x $indir/abixy-wide +run_mlr top -a -n 4 -f x -g a $indir/abixy-wide + run_mlr --seed 12345 sample -k 2 $indir/abixy-het run_mlr --seed 12345 sample -k 2 -g a $indir/abixy-het run_mlr --seed 12345 sample -k 2 -g a,b $indir/abixy-het @@ -209,16 +219,6 @@ run_mlr sort -f a -r b -nf x -nr y $indir/abixy run_mlr sort -f x $indir/sort-het.dkvp run_mlr sort -r x $indir/sort-het.dkvp -# ---------------------------------------------------------------- -announce TOP - -run_mlr top -n 4 -f x $indir/abixy-wide -run_mlr top -n 1 -f x,y $indir/abixy-wide -run_mlr top -n 4 -f x -g a $indir/abixy-wide -run_mlr top -n 1 -f x,y -g a $indir/abixy-wide -run_mlr top -a -n 4 -f x $indir/abixy-wide -run_mlr top -a -n 4 -f x -g a $indir/abixy-wide - # ---------------------------------------------------------------- announce JOIN diff --git 
a/c/todo.txt b/c/todo.txt index eebd4e6c55..8b652856e0 100644 --- a/c/todo.txt +++ b/c/todo.txt @@ -8,8 +8,6 @@ TOP OF LIST * make a 3.1.0 w/ recent --auto etc & debian-port findings -* top int-to-int: & use -F - * faqents/cookbook * csv read perf diff --git a/c/unit_test/test_multiple_containers.c b/c/unit_test/test_multiple_containers.c index d63a767d2c..3d1fd0a20b 100644 --- a/c/unit_test/test_multiple_containers.c +++ b/c/unit_test/test_multiple_containers.c @@ -547,37 +547,49 @@ static char* test_top_keeper() { top_keeper_t* ptop_keeper = top_keeper_alloc(capacity); mu_assert_lf(ptop_keeper->size == 0); - top_keeper_add(ptop_keeper, 5.0, NULL); + top_keeper_add(ptop_keeper, mv_from_float(5.0), NULL); top_keeper_print(ptop_keeper); mu_assert_lf(ptop_keeper->size == 1); - mu_assert_lf(ptop_keeper->top_values[0] == 5.0); + mu_assert_lf(ptop_keeper->top_values[0].type == MT_FLOAT); + mu_assert_lf(ptop_keeper->top_values[0].u.fltv == 5.0); - top_keeper_add(ptop_keeper, 6.0, NULL); + top_keeper_add(ptop_keeper, mv_from_float(6.0), NULL); top_keeper_print(ptop_keeper); mu_assert_lf(ptop_keeper->size == 2); - mu_assert_lf(ptop_keeper->top_values[0] == 6.0); - mu_assert_lf(ptop_keeper->top_values[1] == 5.0); + mu_assert_lf(ptop_keeper->top_values[0].type == MT_FLOAT); + mu_assert_lf(ptop_keeper->top_values[0].u.fltv == 6.0); + mu_assert_lf(ptop_keeper->top_values[1].type == MT_FLOAT); + mu_assert_lf(ptop_keeper->top_values[1].u.fltv == 5.0); - top_keeper_add(ptop_keeper, 4.0, NULL); + top_keeper_add(ptop_keeper, mv_from_int(4), NULL); top_keeper_print(ptop_keeper); mu_assert_lf(ptop_keeper->size == 3); - mu_assert_lf(ptop_keeper->top_values[0] == 6.0); - mu_assert_lf(ptop_keeper->top_values[1] == 5.0); - mu_assert_lf(ptop_keeper->top_values[2] == 4.0); - - top_keeper_add(ptop_keeper, 2.0, NULL); + mu_assert_lf(ptop_keeper->top_values[0].type == MT_FLOAT); + mu_assert_lf(ptop_keeper->top_values[0].u.fltv == 6.0); + mu_assert_lf(ptop_keeper->top_values[1].type == 
MT_FLOAT); + mu_assert_lf(ptop_keeper->top_values[1].u.fltv == 5.0); + mu_assert_lf(ptop_keeper->top_values[2].type == MT_INT); + mu_assert_lf(ptop_keeper->top_values[2].u.intv == 4.0); + + top_keeper_add(ptop_keeper, mv_from_int(2), NULL); top_keeper_print(ptop_keeper); mu_assert_lf(ptop_keeper->size == 3); - mu_assert_lf(ptop_keeper->top_values[0] == 6.0); - mu_assert_lf(ptop_keeper->top_values[1] == 5.0); - mu_assert_lf(ptop_keeper->top_values[2] == 4.0); - - top_keeper_add(ptop_keeper, 7.0, NULL); + mu_assert_lf(ptop_keeper->top_values[0].type == MT_FLOAT); + mu_assert_lf(ptop_keeper->top_values[0].u.fltv == 6.0); + mu_assert_lf(ptop_keeper->top_values[1].type == MT_FLOAT); + mu_assert_lf(ptop_keeper->top_values[1].u.fltv == 5.0); + mu_assert_lf(ptop_keeper->top_values[2].type == MT_INT); + mu_assert_lf(ptop_keeper->top_values[2].u.intv == 4.0); + + top_keeper_add(ptop_keeper, mv_from_int(7), NULL); top_keeper_print(ptop_keeper); mu_assert_lf(ptop_keeper->size == 3); - mu_assert_lf(ptop_keeper->top_values[0] == 7.0); - mu_assert_lf(ptop_keeper->top_values[1] == 6.0); - mu_assert_lf(ptop_keeper->top_values[2] == 5.0); + mu_assert_lf(ptop_keeper->top_values[0].type == MT_INT); + mu_assert_lf(ptop_keeper->top_values[0].u.intv == 7); + mu_assert_lf(ptop_keeper->top_values[1].type == MT_FLOAT); + mu_assert_lf(ptop_keeper->top_values[1].u.fltv == 6.0); + mu_assert_lf(ptop_keeper->top_values[2].type == MT_FLOAT); + mu_assert_lf(ptop_keeper->top_values[2].u.fltv == 5.0); top_keeper_free(ptop_keeper); return NULL; diff --git a/doc/mlr.1.premade b/doc/mlr.1.premade index 42ebe58526..cbf157adb7 100644 --- a/doc/mlr.1.premade +++ b/doc/mlr.1.premade @@ -819,6 +819,7 @@ Usage: mlr top [options] to print only value and group-by fields. Requires a single value-field name only. --min Print top smallest values; default is top largest values. +-F Keep top values as floats even if they look like integers. 
Prints the n records with smallest/largest values at specified fields, optionally by category. .fi diff --git a/doc/reference.html b/doc/reference.html index d4deb2a4ee..7ad79e9954 100644 --- a/doc/reference.html +++ b/doc/reference.html @@ -2806,6 +2806,7 @@ to print only value and group-by fields. Requires a single value-field name only. --min Print top smallest values; default is top largest values. +-F Keep top values as floats even if they look like integers. Prints the n records with smallest/largest values at specified fields, optionally by category.