Skip to content

Commit c95a4b4

Browse files
authored
0.3.2 (#34)
* Fix #28 * Fix #26, #29, #31 * Fix #38 * Add `str_dtype` argument to `as_character()` to partially fix #36 * 0.3.2 * Delete grouped2.py
1 parent 0b68a31 commit c95a4b4

19 files changed

+243
-159
lines changed

datar/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@
44
from .core import _frame_format_patch
55
from .core.defaults import f
66

7-
__version__ = "0.3.1"
7+
__version__ = "0.3.2"

datar/base/string.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from pipda import register_func
88

99
from ..core.contexts import Context
10-
from ..core.types import IntOrIter, StringOrIter, is_scalar, is_null
10+
from ..core.types import Dtype, IntOrIter, StringOrIter, is_scalar, is_null
1111
from ..core.utils import (
1212
arg_match,
1313
get_option,
@@ -30,13 +30,14 @@
3030

3131

3232
@register_func(None, context=Context.EVAL)
33-
def as_character(x: Any, _na: Any = NA) -> StringOrIter:
33+
def as_character(x: Any, str_dtype: Dtype = str, _na: Any = NA) -> StringOrIter:
3434
"""Convert an object or elements of an iterable into string
3535
3636
Aliases `as_str` and `as_string`
3737
3838
Args:
3939
x: The object
40+
str_dtype: The string dtype to convert to
4041
_na: How NAs should be casted. Specify NA will keep them unchanged.
4142
But the dtype will be object then.
4243
@@ -45,8 +46,7 @@ def as_character(x: Any, _na: Any = NA) -> StringOrIter:
4546
When x is iterable, convert elements of it into strings
4647
Otherwise, convert x to string.
4748
"""
48-
return _as_type(x, str, na=_na)
49-
49+
return _as_type(x, str_dtype, na=_na)
5050

5151
as_str = as_string = as_character
5252

datar/core/_frame_format_patch.py

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,36 @@
1+
# BSD 3-Clause License
2+
3+
# Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc.
4+
# and PyData Development Team
5+
# All rights reserved.
6+
7+
# Copyright (c) 2011-2021, Open source contributors.
8+
9+
# Redistribution and use in source and binary forms, with or without
10+
# modification, are permitted provided that the following conditions are met:
11+
12+
# * Redistributions of source code must retain the above copyright notice, this
13+
# list of conditions and the following disclaimer.
14+
15+
# * Redistributions in binary form must reproduce the above copyright notice,
16+
# this list of conditions and the following disclaimer in the documentation
17+
# and/or other materials provided with the distribution.
18+
19+
# * Neither the name of the copyright holder nor the names of its
20+
# contributors may be used to endorse or promote products derived from
21+
# this software without specific prior written permission.
22+
23+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
24+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
26+
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
27+
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28+
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
29+
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
30+
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31+
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33+
134
"""Monkey-patch data frame format to
235
1. add dtypes next to column names when printing
336
2. collapse data frames when they are elements of a parent data frame.
@@ -37,12 +70,7 @@
3770

3871
from .options import add_option
3972

40-
# pylint: disable=c-extension-no-member
41-
# pylint: disable=invalid-name
42-
# pylint: disable=too-many-branches
43-
# pylint: disable=too-many-statements
44-
# pylint: disable=consider-using-enumerate
45-
# pylint: disable=too-many-nested-blocks
73+
# pylint: skip-file
4674

4775
# TODO: patch more formatters
4876

datar/core/operator.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,15 @@
1212
from .exceptions import DataUnrecyclable
1313
from .types import BoolOrIter
1414

15+
class DatarOperatorMeta(type):
16+
"""Allow attributes with '_op_' to pass for operator functions"""
17+
def __getattr__(cls, name: str) -> Any:
18+
"""If name starts with '_op_', let it go self for the real function
19+
Otherwise, do regular getattr.
20+
"""
21+
if name.startswith('_op_'):
22+
return True
23+
return super().__getattr__(name)
1524

1625
@register_operator
1726
class DatarOperator(Operator):
@@ -30,19 +39,19 @@ def _arithmetize2(self, left: Any, right: Any, op: str) -> Any:
3039
left, right = _recycle_left_right(left, right)
3140
return op_func(left, right)
3241

33-
def invert(self, operand: Any) -> Any:
42+
def _op_invert(self, operand: Any) -> Any:
3443
"""Interpretation for ~x"""
35-
if isinstance(operand, (slice, str, list, tuple, Collection)):
44+
if isinstance(operand, (slice, str, list, tuple)):
3645
return Inverted(operand)
3746
return self._arithmetize1(operand, "invert")
3847

39-
def neg(self, operand: Any) -> Any:
48+
def _op_neg(self, operand: Any) -> Any:
4049
"""Interpretation for -x"""
4150
if isinstance(operand, (slice, list)):
4251
return Negated(operand)
4352
return self._arithmetize1(operand, "neg")
4453

45-
def and_(self, left: Any, right: Any) -> Any:
54+
def _op_and_(self, left: Any, right: Any) -> Any:
4655
"""Mimic the & operator in R.
4756
4857
This has to have Expression objects to be involved to work
@@ -63,7 +72,7 @@ def and_(self, left: Any, right: Any) -> Any:
6372
right = Series(right).fillna(False)
6473
return left & right
6574

66-
def or_(self, left: Any, right: Any) -> Any:
75+
def _op_or_(self, left: Any, right: Any) -> Any:
6776
"""Mimic the & operator in R.
6877
6978
This has to have Expression objects to be involved to work
@@ -84,9 +93,9 @@ def or_(self, left: Any, right: Any) -> Any:
8493
return left | right
8594

8695
# pylint: disable=invalid-name
87-
def ne(self, left: Any, right: Any) -> BoolOrIter:
96+
def _op_ne(self, left: Any, right: Any) -> BoolOrIter:
8897
"""Interpret for left != right"""
89-
out = self.eq(left, right)
98+
out = self._op_eq(left, right)
9099
if isinstance(out, (numpy.ndarray, Series)):
91100
neout = ~out
92101
# neout[pandas.isna(out)] = numpy.nan
@@ -96,11 +105,11 @@ def ne(self, left: Any, right: Any) -> BoolOrIter:
96105

97106
def __getattr__(self, name: str) -> Any:
98107
"""Other operators"""
99-
if not hasattr(operator, name):
100-
raise AttributeError
101-
attr = partial(self._arithmetize2, op=name)
102-
attr.__qualname__ = self._arithmetize2.__qualname__
103-
return attr
108+
if name.startswith('_op_'):
109+
attr = partial(self._arithmetize2, op=name[4:])
110+
attr.__qualname__ = self._arithmetize2.__qualname__
111+
return attr
112+
return super().__getattr__(name)
104113

105114

106115
def _recycle_left_right(left: Any, right: Any) -> Tuple:

docs/CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 0.3.2
2+
- Adopt `pipda` v0.4.1 to fix `getattr()` failure for operator-connected expressions (#38)
3+
- Add `str_dtype` argument to `as_character()` to partially fix #36
4+
- Update license in `core._frame_format_patch` (#28)
5+
16
## 0.3.1
27
- Adopt `pipda` v0.4.0
38
- Change argument `_dtypes` to `dtypes_` for tibble-families

docs/caveats/NAs.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
2+
- dtype
3+
4+
`NA` in datar is set to `numpy.nan`, which is a float. This causes problems for data of other dtypes, because setting a value to NA (a float) in an array of another dtype is not compatible. Unlike R, python does not have a missing value type for other dtypes.
5+
6+
pandas has introduced its own `NA` and some `NA`-compatible dtypes. However, `numpy` is still not aware of it, which causes problems for internal computations.
7+
8+
- string
9+
10+
When initializing a string array intentionally: `numpy.array(['a', NA])`, the `NA` will be converted to the string `'nan'`. That may not be what we want sometimes. To avoid that, use `None` or `NULL` instead:
11+
12+
```python
13+
>>> numpy.array(['a', None])
14+
array(['a', None], dtype=object)
15+
```
16+
17+
Just pay attention that the dtype falls back to object.
18+
19+
20+
- `NaN`
21+
22+
Since `NA` is already a float, `NaN` here is equivalent to `NA`.

docs/caveats/df_index_colname.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
2+
Most APIs from tidyverse packages ignore/reset the index (row names) of data frames, so do the APIs from `datar`. So when selecting rows, row indices are always used. With most APIs, the indices of the data frames are dropped, so they are actually ranging from 0 to `nrow(df) - 1`.
3+
4+
!!! Note
5+
6+
when using 1-based indexing (default), 1 selects the first row. Even though the first row shows index 0 when it's printed.
7+
8+
No `MultiIndex` indices/column names are supported for the APIs to select or manipulate data frames and the data frames generated by the APIs will not have `MultiIndex` indices/column names. However, since it's still pandas DataFrame, you can always do it in pandas way:
9+
10+
```python
11+
df = tibble(x=1, y=2)
12+
df2 = df >> mutate(z=f.x+f.y)
13+
# pandas way to select
14+
df2.iloc[0, z] # 3
15+
# add multiindex to it:
16+
df.columns = pd.MultiIndex.from_product([df.columns, ['C']])
17+
```

docs/caveats/grouped.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
2+
`datar` doesn't use `pandas`' `DataFrameGroupBy`/`SeriesGroupBy` classes. Instead, we have our own `DataFrameGroupBy` class, which is actually a subclass of `DataFrame`, with 3 extra properties: `_group_data`, `_group_vars` and `_group_drop`, carrying the grouping data, the grouping variables/columns, and whether to drop non-observable values. This is very similar to `grouped_df` from `dplyr`.
3+
4+
The reasons that we implement this are:
5+
6+
1. Pandas DataFrameGroupBy cannot handle multiple categorical columns as
7+
groupby variables with non-observable values
8+
2. It is very hard to retrieve group indices and data when doing apply
9+
3. NAs unmatched in grouping variables

docs/caveats/in.md

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
`%in%` in R is a shortcut for `is.element()` to test if the elements are in a container.
2+
3+
```r
4+
r$> c(1,3,5) %in% 1:4
5+
[1] TRUE TRUE FALSE
6+
7+
r$> is.element(c(1,3,5), 1:4)
8+
[1] TRUE TRUE FALSE
9+
```
10+
11+
However, `in` in python acts differently:
12+
13+
```python
14+
>>> import numpy as np
15+
>>>
16+
>>> arr = np.array([1,2,3,4])
17+
>>> elts = np.array([1,3,5])
18+
>>>
19+
>>> elts in arr
20+
/.../bin/bpython:1: DeprecationWarning: elementwise comparison failed; this will raise an error in the future.
21+
#!/.../bin/python
22+
False
23+
>>> [1,2] in [1,2,3]
24+
False
25+
```
26+
27+
It simply tests if the element on the left side of `in` is equal to any of the elements in the right side. Regardless of whether the element on the left side is scalar or not.
28+
29+
Yes, we can redefine this behavior by writing our own `__contains__()` method on the right-hand object. For example:
30+
31+
```python
32+
>>> class MyList(list):
33+
... def __contains__(self, key):
34+
... # Just an example to let it return the reversed result
35+
... return not super().__contains__(key)
36+
...
37+
>>> 1 in MyList([1,2,3])
38+
False
39+
>>> 4 in MyList([1,2,3])
40+
True
41+
```
42+
43+
But the problem is that the result of `__contains__()` is forced to be a scalar bool by python. In this sense, we cannot let `x in y` be evaluated as a bool array or even a pipda `Expression` object.
44+
```python
45+
>>> class MyList(list):
46+
... def __contains__(self, key):
47+
... # Just an example
48+
... return [True, False, True] # logically True in python
49+
...
50+
>>> 1 in MyList([1,2,3])
51+
True
52+
>>> 4 in MyList([1,2,3])
53+
True
54+
```
55+
56+
So instead, we ported `is.element()` from R:
57+
58+
```python
59+
>>> import numpy as np
60+
>>> from datar.base import is_element
61+
>>>
62+
>>> arr = np.array([1,2,3,4])
63+
>>> elts = np.array([1,3,5])
64+
>>>
65+
>>> is_element(elts, arr)
67+
array([ True, True, False])
68+
```
69+
70+
So, as @rleyvasal pointed out in https://github.com/pwwang/datar/issues/31#issuecomment-877499212,
71+
72+
if the left element is a pandas `Series`:
73+
```python
74+
>>> import pandas as pd
75+
>>> pd.Series(elts).isin(arr)
76+
0 True
77+
1 True
78+
2 False
79+
dtype: bool
80+
```
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ In `R`, negative indexes mean removal. However, here negative indexes are still
2020
selection, as `-1` for the last column, `-2` for the second last, etc. It is
2121
the same for both 0-based and 1-based indexing.
2222

23+
If you want to do negative selection, use tilde `~` instead of `-`.
24+
2325
## Temporary index base change
2426

2527
For example:

0 commit comments

Comments
 (0)