Skip to content

Commit

Permalink
refactor(functions): make histogram() use arrow arrays (#609)
Browse files Browse the repository at this point in the history
Fixes #500.
  • Loading branch information
Christopher M. Wolff authored Jan 2, 2019
1 parent ab71bb1 commit 74c4a51
Show file tree
Hide file tree
Showing 7 changed files with 44 additions and 4 deletions.
2 changes: 1 addition & 1 deletion functions/tests/testdata/histogram.flux
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
t_histogram = (table=<-) =>
table
- |> histogram(bins:[0.0,1.0,2.0])
+ |> histogram(bins:[-1.0,0.0,1.0,2.0])

testingTest(name: "histogram", load: testLoadData, infile: "histogram.in.csv", outfile: "histogram.out.csv", test: t_histogram)
3 changes: 3 additions & 0 deletions functions/tests/testdata/histogram.out.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@
#group,false,false,true,true,true,true,false,false
#default,_result,,,,,,,
,result,table,_start,_stop,_time,_field,le,_value
,,0,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,x_duration_seconds,-1,0
,,0,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,x_duration_seconds,0,1
,,0,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,x_duration_seconds,1,2
,,0,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,x_duration_seconds,2,2
,,1,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,y_duration_seconds,-1,0
,,1,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,y_duration_seconds,0,2
,,1,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,y_duration_seconds,1,2
,,1,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,y_duration_seconds,2,3

9 changes: 9 additions & 0 deletions functions/tests/testdata/histogram_normalize.flux
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// t_histogram pipes the incoming table through histogram() with four bins,
// normalize enabled, and explicit names for the value column ("theValue"),
// the count column ("theCount") and the upper-bound column ("ub").
t_histogram = (table=<-) =>
table
|> histogram(bins:[-1.0,0.0,1.0,2.0],
normalize:true,
column: "theValue",
countColumn: "theCount",
upperBoundColumn: "ub")

// Invokes testingTest with t_histogram and the histogram_normalize.in.csv /
// histogram_normalize.out.csv fixtures.
// NOTE(review): name is "histogram", the same name used by the test in
// histogram.flux — confirm the duplicate is intended.
testingTest(name: "histogram", load: testLoadData, infile: "histogram_normalize.in.csv", outfile: "histogram_normalize.out.csv", test: t_histogram)
9 changes: 9 additions & 0 deletions functions/tests/testdata/histogram_normalize.in.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#datatype,string,long,dateTime:RFC3339,dateTime:RFC3339,dateTime:RFC3339,string,double
#group,false,false,true,true,true,true,false
#default,_result,,,,,,
,result,table,_start,_stop,_time,_field,theValue
,,1,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,x_duration_seconds,0
,,1,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,x_duration_seconds,1
,,2,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,y_duration_seconds,0
,,2,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,y_duration_seconds,0
,,2,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,y_duration_seconds,1.5
13 changes: 13 additions & 0 deletions functions/tests/testdata/histogram_normalize.out.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#datatype,string,long,dateTime:RFC3339,dateTime:RFC3339,dateTime:RFC3339,string,double,double
#group,false,false,true,true,true,true,false,false
#default,_result,,,,,,,
,result,table,_start,_stop,_time,_field,ub,theCount
,,0,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,x_duration_seconds,-1,0
,,0,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,x_duration_seconds,0,0.5
,,0,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,x_duration_seconds,1,1
,,0,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,x_duration_seconds,2,1
,,1,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,y_duration_seconds,-1,0
,,1,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,y_duration_seconds,0,0.6666666666666666
,,1,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,y_duration_seconds,1,0.6666666666666666
,,1,2018-05-22T19:53:00Z,2018-05-22T19:54:00Z,2018-05-22T19:53:00Z,y_duration_seconds,2,1

7 changes: 6 additions & 1 deletion functions/tests/testdata/prepcsvtests/prepcsvtests.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ func main() {
if err != nil {
return
}

if len(fnames) == 0 {
fmt.Printf("could not find any .flux files in directory \"%s\"", path)
return
}
} else {
printUsage()
return
Expand All @@ -69,7 +74,7 @@ func main() {
incsv := testName + ".in.csv"
indata, err := ioutil.ReadFile(incsv)
if err != nil {
- fmt.Printf("could not open file %s", fname)
+ fmt.Printf("could not open file %s", incsv)
return
}

Expand Down
5 changes: 3 additions & 2 deletions functions/transformations/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ func init() {
map[string]semantic.PolyType{
"column": semantic.String,
"upperBoundColumn": semantic.String,
"countColumn": semantic.String,
"bins": semantic.NewArrayPolyType(semantic.Float),
"normalize": semantic.Bool,
},
Expand Down Expand Up @@ -190,9 +191,9 @@ func (t *histogramTransformation) Process(id execute.DatasetID, tbl flux.Table)
}
totalRows := 0.0
counts := make([]float64, len(t.spec.Bins))
- err = tbl.Do(func(cr flux.ColReader) error {
+ err = tbl.DoArrow(func(cr flux.ArrowColReader) error {
totalRows += float64(cr.Len())
- for _, v := range cr.Floats(valueIdx) {
+ for _, v := range cr.Floats(valueIdx).Float64Values() {
idx := sort.Search(len(t.spec.Bins), func(i int) bool {
return v <= t.spec.Bins[i]
})
Expand Down

0 comments on commit 74c4a51

Please sign in to comment.