From 140623283a3f5a734b7a76be216f140411304be3 Mon Sep 17 00:00:00 2001
From: Seth Michael Larson
Date: Tue, 14 Jul 2020 11:39:52 -0500
Subject: [PATCH] Support Series/collections in Series.isin(), add type hints

---
 docs/source/reference/supported_apis.rst   |  4 +-
 eland/arithmetics.py                       |  2 +-
 eland/dataframe.py                         | 38 ++++++---
 eland/field_mappings.py                    | 32 ++++---
 eland/ndframe.py                           | 41 ++++-----
 eland/series.py                            | 99 +++++++++++++---
 eland/tests/dataframe/test_query_pytest.py |  9 +-
 eland/utils.py                             | 29 +++++--
 noxfile.py                                 |  2 +
 9 files changed, 157 insertions(+), 99 deletions(-)

diff --git a/docs/source/reference/supported_apis.rst b/docs/source/reference/supported_apis.rst
index 8a1f3f2..94d9778 100644
--- a/docs/source/reference/supported_apis.rst
+++ b/docs/source/reference/supported_apis.rst
@@ -1040,7 +1040,7 @@ script instead of being modified manually.
 +---------------------------------------+------------+
 | ``ed.Series.__dir__()``               | **Yes**    |
 +---------------------------------------+------------+
-| ``ed.Series.__div__()``               | No         |
+| ``ed.Series.__div__()``               | **Yes**    |
 +---------------------------------------+------------+
 | ``ed.Series.__divmod__()``            | No         |
 +---------------------------------------+------------+
@@ -1134,7 +1134,7 @@ script instead of being modified manually.
 +---------------------------------------+------------+
 | ``ed.Series.__rand__()``              | No         |
 +---------------------------------------+------------+
-| ``ed.Series.__rdiv__()``              | No         |
+| ``ed.Series.__rdiv__()``              | **Yes**    |
 +---------------------------------------+------------+
 | ``ed.Series.__rdivmod__()``           | No         |
 +---------------------------------------+------------+
diff --git a/eland/arithmetics.py b/eland/arithmetics.py
index 927392c..584fb0b 100644
--- a/eland/arithmetics.py
+++ b/eland/arithmetics.py
@@ -138,7 +138,7 @@ class ArithmeticSeries(ArithmeticObject):
         for task in self._tasks:
             if task.op_name == "__add__":
                 value = f"({value} + {task.object.resolve()})"
-            elif task.op_name == "__truediv__":
+            elif task.op_name in ("__truediv__", "__div__"):
                 value = f"({value} / {task.object.resolve()})"
             elif task.op_name == "__floordiv__":
                 value = f"Math.floor({value} / {task.object.resolve()})"
diff --git a/eland/dataframe.py b/eland/dataframe.py
index ed79ed2..875cdb1 100644
--- a/eland/dataframe.py
+++ b/eland/dataframe.py
@@ -19,7 +19,7 @@ import sys
 import warnings
 from io import StringIO
 import re
-from typing import Optional, Sequence, Union
+from typing import Optional, Sequence, Union, Tuple
 
 import numpy as np
 import pandas as pd
@@ -631,7 +631,7 @@ class DataFrame(NDFrame):
     def info_es(self):
         return self.es_info()
 
-    def es_query(self, query):
+    def es_query(self, query) -> "DataFrame":
         """Applies an Elasticsearch DSL query to the current DataFrame.
 
         Parameters
@@ -705,7 +705,7 @@ class DataFrame(NDFrame):
 
     def info(
         self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None
-    ):
+    ) -> None:
         """
         Print a concise summary of a DataFrame.
 
@@ -822,7 +822,7 @@ class DataFrame(NDFrame):
             dtype = dtypes.iloc[i]
             col = pprint_thing(col)
 
-            line_no = _put_str(" {num}".format(num=i), space_num)
+            line_no = _put_str(f" {i}", space_num)
 
             count = ""
             if show_counts:
@@ -1223,7 +1223,7 @@ class DataFrame(NDFrame):
         }
         return self._query_compiler.to_csv(**kwargs)
 
-    def to_pandas(self, show_progress: bool = False) -> "DataFrame":
+    def to_pandas(self, show_progress: bool = False) -> pd.DataFrame:
         """
         Utility method to convert eland.Dataframe to pandas.Dataframe
 
@@ -1233,10 +1233,10 @@ class DataFrame(NDFrame):
         """
         return self._query_compiler.to_pandas(show_progress=show_progress)
 
-    def _empty_pd_df(self):
+    def _empty_pd_df(self) -> pd.DataFrame:
         return self._query_compiler._empty_pd_ef()
 
-    def select_dtypes(self, include=None, exclude=None):
+    def select_dtypes(self, include=None, exclude=None) -> "DataFrame":
         """
         Return a subset of the DataFrame's columns based on the column dtypes.
 
@@ -1272,7 +1272,7 @@ class DataFrame(NDFrame):
         return self._getitem_array(empty_df.columns)
 
     @property
-    def shape(self):
+    def shape(self) -> Tuple[int, int]:
         """
         Return a tuple representing the dimensionality of the DataFrame.
 
@@ -1299,7 +1299,23 @@ class DataFrame(NDFrame):
 
         return num_rows, num_columns
 
-    def keys(self):
+    @property
+    def ndim(self) -> int:
+        """
+        Returns 2 by definition of a DataFrame
+
+        Returns
+        -------
+        int
+            By definition 2
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.DataFrame.ndim`
+        """
+        return 2
+
+    def keys(self) -> pd.Index:
         """
         Return columns
 
@@ -1381,7 +1397,7 @@ class DataFrame(NDFrame):
 
     hist = gfx.ed_hist_frame
 
-    def query(self, expr):
+    def query(self, expr) -> "DataFrame":
         """
         Query the columns of a DataFrame with a boolean expression.
 
@@ -1474,7 +1490,7 @@ class DataFrame(NDFrame):
         like: Optional[str] = None,
         regex: Optional[str] = None,
         axis: Optional[Union[int, str]] = None,
-    ):
+    ) -> "DataFrame":
         """
         Subset the dataframe rows or columns according to the specified index labels.
         Note that this routine does not filter a dataframe on its
diff --git a/eland/field_mappings.py b/eland/field_mappings.py
index 33dbf1c..aa58bac 100644
--- a/eland/field_mappings.py
+++ b/eland/field_mappings.py
@@ -27,15 +27,16 @@ from pandas.core.dtypes.common import (
     is_string_dtype,
 )
 from pandas.core.dtypes.inference import is_list_like
-from typing import NamedTuple, Optional, Mapping, Dict, Any, TYPE_CHECKING
+from typing import NamedTuple, Optional, Mapping, Dict, Any, TYPE_CHECKING, List, Set
 
 if TYPE_CHECKING:
+    from elasticsearch import Elasticsearch
     from eland import DataFrame
 
 
-ES_FLOAT_TYPES = {"double", "float", "half_float", "scaled_float"}
-ES_INTEGER_TYPES = {"long", "integer", "short", "byte"}
-ES_COMPATIBLE_TYPES = {
+ES_FLOAT_TYPES: Set[str] = {"double", "float", "half_float", "scaled_float"}
+ES_INTEGER_TYPES: Set[str] = {"long", "integer", "short", "byte"}
+ES_COMPATIBLE_TYPES: Dict[str, Set[str]] = {
     "double": ES_FLOAT_TYPES,
     "scaled_float": ES_FLOAT_TYPES,
     "float": ES_FLOAT_TYPES,
@@ -80,7 +81,7 @@ class Field(NamedTuple):
     def np_dtype(self):
         return np.dtype(self.pd_dtype)
 
-    def is_es_agg_compatible(self, es_agg):
+    def is_es_agg_compatible(self, es_agg) -> bool:
         # Cardinality works for all types
         # Numerics and bools work for all aggs
         if es_agg == "cardinality" or self.is_numeric or self.is_bool:
@@ -115,7 +116,7 @@ class FieldMappings:
         or es_field_name.keyword (if exists) or None
     """
 
-    ES_DTYPE_TO_PD_DTYPE = {
+    ES_DTYPE_TO_PD_DTYPE: Dict[str, str] = {
         "text": "object",
         "keyword": "object",
         "long": "int64",
@@ -133,7 +134,7 @@ class FieldMappings:
     }
 
     # the labels for each column (display_name is index)
-    column_labels = [
+    column_labels: List[str] = [
         "es_field_name",
         "is_source",
         "es_dtype",
@@ -145,7 +146,12 @@ class FieldMappings:
         "aggregatable_es_field_name",
     ]
 
-    def __init__(self, client=None, index_pattern=None, display_names=None):
+    def __init__(
+        self,
+        client: "Elasticsearch",
+        index_pattern: str,
+        display_names: Optional[List[str]] = None,
+    ):
         """
         Parameters
         ----------
@@ -184,7 +190,9 @@ class FieldMappings:
         self.display_names = display_names
 
     @staticmethod
-    def _extract_fields_from_mapping(mappings, source_only=False, date_format=None):
+    def _extract_fields_from_mapping(
+        mappings: Dict[str, Any], source_only: bool = False
+    ) -> Dict[str, str]:
         """
         Extract all field names and types from a mapping.
         ```
@@ -256,10 +264,10 @@ class FieldMappings:
 
         # Recurse until we get a 'type: xxx'
         def flatten(x, name=""):
-            if type(x) is dict:
+            if isinstance(x, dict):
                 for a in x:
-                    if (
-                        a == "type" and type(x[a]) is str
+                    if a == "type" and isinstance(
+                        x[a], str
                     ):  # 'type' can be a name of a field
                         field_name = name[:-1]
                         field_type = x[a]
diff --git a/eland/ndframe.py b/eland/ndframe.py
index b3c2a96..16a01df 100644
--- a/eland/ndframe.py
+++ b/eland/ndframe.py
@@ -17,10 +17,14 @@ import sys
 from abc import ABC, abstractmethod
-from typing import Tuple
-
+from typing import TYPE_CHECKING, Tuple
+import pandas as pd
 from eland.query_compiler import QueryCompiler
+
+if TYPE_CHECKING:
+    from eland.index import Index
+
 """
 NDFrame
 ---------
@@ -73,7 +77,8 @@ class NDFrame(ABC):
             )
         self._query_compiler = _query_compiler
 
-    def _get_index(self):
+    @property
+    def index(self) -> "Index":
         """
         Return eland index referencing Elasticsearch field to index a DataFrame/Series
 
@@ -100,10 +105,8 @@ class NDFrame(ABC):
         """
         return self._query_compiler.index
 
-    index = property(_get_index)
-
     @property
-    def dtypes(self):
+    def dtypes(self) -> pd.Series:
         """
         Return the pandas dtypes in the DataFrame. Elasticsearch types are mapped
         to pandas dtypes via Mappings._es_dtype_to_pd_dtype.__doc__
@@ -129,7 +132,7 @@ class NDFrame(ABC):
         """
         return self._query_compiler.dtypes
 
-    def _build_repr(self, num_rows):
+    def _build_repr(self, num_rows) -> pd.DataFrame:
         # self could be Series or DataFrame
         if len(self.index) <= num_rows:
             return self.to_pandas()
@@ -144,11 +147,11 @@ class NDFrame(ABC):
 
         return head.append(tail)
 
-    def __sizeof__(self):
+    def __sizeof__(self) -> int:
         # Don't default to pandas, just return approximation TODO - make this more accurate
         return sys.getsizeof(self._query_compiler)
 
-    def __len__(self):
+    def __len__(self) -> int:
         """Gets the length of the DataFrame.
 
         Returns:
@@ -159,7 +162,7 @@ class NDFrame(ABC):
     def _es_info(self, buf):
         self._query_compiler.es_info(buf)
 
-    def mean(self, numeric_only=True):
+    def mean(self, numeric_only: bool = True) -> pd.Series:
         """
         Return mean value for each numeric column
 
@@ -191,7 +194,7 @@ class NDFrame(ABC):
         """
         return self._query_compiler.mean(numeric_only=numeric_only)
 
-    def sum(self, numeric_only=True):
+    def sum(self, numeric_only: bool = True) -> pd.Series:
         """
         Return sum for each numeric column
 
@@ -223,7 +226,7 @@ class NDFrame(ABC):
         """
         return self._query_compiler.sum(numeric_only=numeric_only)
 
-    def min(self, numeric_only=True):
+    def min(self, numeric_only: bool = True) -> pd.Series:
         """
         Return the minimum value for each numeric column
 
@@ -255,7 +258,7 @@ class NDFrame(ABC):
         """
         return self._query_compiler.min(numeric_only=numeric_only)
 
-    def var(self, numeric_only=True):
+    def var(self, numeric_only: bool = True) -> pd.Series:
         """
         Return variance for each numeric column
 
@@ -285,7 +288,7 @@ class NDFrame(ABC):
         """
         return self._query_compiler.var(numeric_only=numeric_only)
 
-    def std(self, numeric_only=True):
+    def std(self, numeric_only: bool = True) -> pd.Series:
         """
         Return standard deviation for each numeric column
 
@@ -315,7 +318,7 @@ class NDFrame(ABC):
         """
         return self._query_compiler.std(numeric_only=numeric_only)
 
-    def median(self, numeric_only=True):
+    def median(self, numeric_only: bool = True) -> pd.Series:
         """
         Return the median value for each numeric column
 
@@ -345,7 +348,7 @@ class NDFrame(ABC):
         """
         return self._query_compiler.median(numeric_only=numeric_only)
 
-    def max(self, numeric_only=True):
+    def max(self, numeric_only: bool = True) -> pd.Series:
         """
         Return the maximum value for each numeric column
 
@@ -377,7 +380,7 @@ class NDFrame(ABC):
         """
         return self._query_compiler.max(numeric_only=numeric_only)
 
-    def nunique(self):
+    def nunique(self) -> pd.Series:
         """
         Return cardinality of each field.
 
@@ -423,7 +426,7 @@ class NDFrame(ABC):
         """
         return self._query_compiler.nunique()
 
-    def mad(self, numeric_only=True):
+    def mad(self, numeric_only: bool = True) -> pd.Series:
         """
         Return standard deviation for each numeric column
 
@@ -456,7 +459,7 @@ class NDFrame(ABC):
     def _hist(self, num_bins):
         return self._query_compiler._hist(num_bins)
 
-    def describe(self):
+    def describe(self) -> pd.DataFrame:
         """
         Generate descriptive statistics that summarize the central tendency, dispersion and shape of a
         dataset’s distribution, excluding NaN values.
diff --git a/eland/series.py b/eland/series.py
index b52345c..011bd0c 100644
--- a/eland/series.py
+++ b/eland/series.py
@@ -33,8 +33,9 @@ Based on NDFrame which underpins eland.DataFrame
 
 import sys
 import warnings
+from collections.abc import Collection
 from io import StringIO
-from typing import Optional, Union, Sequence
+from typing import Optional, Union, Sequence, Any, Tuple, TYPE_CHECKING
 
 import numpy as np
 import pandas as pd
@@ -45,6 +46,7 @@ from eland import NDFrame
 from eland.arithmetics import ArithmeticSeries, ArithmeticString, ArithmeticNumber
 from eland.common import DEFAULT_NUM_ROWS_DISPLAYED, docstring_parameter
 from eland.filter import (
+    BooleanFilter,
     NotFilter,
     Equal,
     Greater,
@@ -56,10 +58,14 @@ from eland.filter import (
     IsNull,
     NotNull,
 )
-from eland.utils import deprecated_api
+from eland.utils import deprecated_api, to_list
+
+if TYPE_CHECKING:  # type: ignore
+    from elasticsearch import Elasticsearch  # noqa: F401
+    from eland.query_compiler import QueryCompiler  # noqa: F401
 
 
-def _get_method_name():
+def _get_method_name() -> str:
     return sys._getframe(1).f_code.co_name
 
 
@@ -106,12 +112,12 @@ class Series(NDFrame):
 
     def __init__(
         self,
-        es_client=None,
-        es_index_pattern=None,
-        name=None,
-        es_index_field=None,
-        _query_compiler=None,
-    ):
+        es_client: Optional["Elasticsearch"] = None,
+        es_index_pattern: Optional[str] = None,
+        name: Optional[str] = None,
+        es_index_field: Optional[str] = None,
+        _query_compiler: Optional["QueryCompiler"] = None,
+    ) -> None:
         # Series has 1 column
         if name is None:
             columns = None
@@ -129,7 +135,7 @@ class Series(NDFrame):
     hist = eland.plotting.ed_hist_series
 
     @property
-    def empty(self):
+    def empty(self) -> bool:
         """Determines if the Series is empty.
 
         Returns:
@@ -139,7 +145,7 @@ class Series(NDFrame):
         return len(self.index) == 0
 
     @property
-    def shape(self):
+    def shape(self) -> Tuple[int, int]:
         """
         Return a tuple representing the dimensionality of the Series.
 
@@ -167,7 +173,7 @@ class Series(NDFrame):
         return num_rows, num_columns
 
     @property
-    def es_field_name(self):
+    def es_field_name(self) -> str:
         """
         Returns
         -------
@@ -176,15 +182,15 @@ class Series(NDFrame):
         """
         return self._query_compiler.get_field_names(include_scripted_fields=True)[0]
 
-    def _get_name(self):
+    @property
+    def name(self) -> str:
         return self._query_compiler.columns[0]
 
-    def _set_name(self, name):
+    @name.setter
+    def name(self, name: str) -> None:
         self._query_compiler.rename({self.name: name}, inplace=True)
 
-    name = property(_get_name, _set_name)
-
-    def rename(self, new_name):
+    def rename(self, new_name: str) -> "Series":
         """
         Rename name of series. Only column rename is supported. This does not change the underlying
         Elasticsearch index, but adds a symbolic link from the new name (column) to the Elasticsearch field name.
@@ -238,18 +244,23 @@ class Series(NDFrame):
             _query_compiler=self._query_compiler.rename({self.name: new_name})
         )
 
-    def head(self, n=5):
+    def head(self, n: int = 5) -> "Series":
         return Series(_query_compiler=self._query_compiler.head(n))
 
-    def tail(self, n=5):
+    def tail(self, n: int = 5) -> "Series":
         return Series(_query_compiler=self._query_compiler.tail(n))
 
-    def sample(self, n: int = None, frac: float = None, random_state: int = None):
+    def sample(
+        self,
+        n: Optional[int] = None,
+        frac: Optional[float] = None,
+        random_state: Optional[int] = None,
+    ) -> "Series":
         return Series(
             _query_compiler=self._query_compiler.sample(n, frac, random_state)
         )
 
-    def value_counts(self, es_size=10):
+    def value_counts(self, es_size: int = 10) -> pd.Series:
         """
         Return the value counts for the specified field.
 
@@ -287,9 +298,8 @@ class Series(NDFrame):
         """
         if not isinstance(es_size, int):
             raise TypeError("es_size must be a positive integer.")
-        if not es_size > 0:
+        elif es_size <= 0:
             raise ValueError("es_size must be a positive integer.")
-
         return self._query_compiler.value_counts(es_size)
 
     # dtype not implemented for Series as causes query to fail
@@ -297,7 +307,7 @@ class Series(NDFrame):
 
     # ----------------------------------------------------------------------
     # Rendering Methods
-    def __repr__(self):
+    def __repr__(self) -> str:
         """
         Return a string representation for a particular Series.
         """
@@ -339,7 +349,7 @@ class Series(NDFrame):
         name=False,
         max_rows=None,
         min_rows=None,
-    ):
+    ) -> Optional[str]:
         """
         Render a string representation of the Series.
 
@@ -411,15 +421,15 @@ class Series(NDFrame):
             result = _buf.getvalue()
             return result
 
-    def to_pandas(self, show_progress=False):
+    def to_pandas(self, show_progress: bool = False) -> pd.Series:
         return self._query_compiler.to_pandas(show_progress=show_progress)[self.name]
 
     @property
-    def _dtype(self):
+    def _dtype(self) -> np.dtype:
         # DO NOT MAKE PUBLIC (i.e. def dtype) as this breaks query eval implementation
         return self._query_compiler.dtypes[0]
 
-    def __gt__(self, other):
+    def __gt__(self, other: Union[int, float, "Series"]) -> BooleanFilter:
         if isinstance(other, Series):
             # Need to use scripted query to compare to values
             painless = f"doc['{self.name}'].value > doc['{other.name}'].value"
@@ -429,7 +439,7 @@ class Series(NDFrame):
         else:
             raise NotImplementedError(other, type(other))
 
-    def __lt__(self, other):
+    def __lt__(self, other: Union[int, float, "Series"]) -> BooleanFilter:
         if isinstance(other, Series):
             # Need to use scripted query to compare to values
             painless = f"doc['{self.name}'].value < doc['{other.name}'].value"
@@ -439,7 +449,7 @@ class Series(NDFrame):
         else:
             raise NotImplementedError(other, type(other))
 
-    def __ge__(self, other):
+    def __ge__(self, other: Union[int, float, "Series"]) -> BooleanFilter:
         if isinstance(other, Series):
             # Need to use scripted query to compare to values
             painless = f"doc['{self.name}'].value >= doc['{other.name}'].value"
@@ -449,7 +459,7 @@ class Series(NDFrame):
         else:
             raise NotImplementedError(other, type(other))
 
-    def __le__(self, other):
+    def __le__(self, other: Union[int, float, "Series"]) -> BooleanFilter:
         if isinstance(other, Series):
             # Need to use scripted query to compare to values
             painless = f"doc['{self.name}'].value <= doc['{other.name}'].value"
@@ -459,7 +469,7 @@ class Series(NDFrame):
         else:
             raise NotImplementedError(other, type(other))
 
-    def __eq__(self, other):
+    def __eq__(self, other: Union[int, float, str, "Series"]) -> BooleanFilter:
         if isinstance(other, Series):
             # Need to use scripted query to compare to values
             painless = f"doc['{self.name}'].value == doc['{other.name}'].value"
@@ -471,7 +481,7 @@ class Series(NDFrame):
         else:
             raise NotImplementedError(other, type(other))
 
-    def __ne__(self, other):
+    def __ne__(self, other: Union[int, float, str, "Series"]) -> BooleanFilter:
         if isinstance(other, Series):
             # Need to use scripted query to compare to values
             painless = f"doc['{self.name}'].value != doc['{other.name}'].value"
@@ -483,13 +493,13 @@ class Series(NDFrame):
         else:
             raise NotImplementedError(other, type(other))
 
-    def isin(self, other):
-        if isinstance(other, list):
-            return IsIn(field=self.name, value=other)
+    def isin(self, other: Union[Collection, pd.Series]) -> BooleanFilter:
+        if isinstance(other, (Collection, pd.Series)):
+            return IsIn(field=self.name, value=to_list(other))
         else:
             raise NotImplementedError(other, type(other))
 
-    def isna(self):
+    def isna(self) -> BooleanFilter:
         """
         Detect missing values.
 
@@ -506,7 +516,7 @@ class Series(NDFrame):
 
     isnull = isna
 
-    def notna(self):
+    def notna(self) -> BooleanFilter:
         """
         Detect existing (non-missing) values.
 
@@ -525,7 +535,7 @@ class Series(NDFrame):
     notnull = notna
 
     @property
-    def ndim(self):
+    def ndim(self) -> int:
         """
         Returns 1 by definition of a Series
 
@@ -596,7 +606,7 @@ class Series(NDFrame):
         )
         return Series(_query_compiler=new_query_compiler)
 
-    def es_info(self):
+    def es_info(self) -> str:
         buf = StringIO()
 
         super()._es_info(buf)
@@ -604,7 +614,7 @@ class Series(NDFrame):
         return buf.getvalue()
 
     @deprecated_api("eland.Series.es_info()")
-    def info_es(self):
+    def info_es(self) -> str:
         return self.es_info()
 
     def __add__(self, right):
@@ -1149,7 +1159,12 @@ class Series(NDFrame):
     rsubtract = __rsub__
     rtruediv = __rtruediv__
 
-    def _numeric_op(self, right, method_name):
+    # __div__ is technically Python 2.x only
+    # but pandas has it so we do too.
+    __div__ = __truediv__
+    __rdiv__ = __rtruediv__
+
+    def _numeric_op(self, right: Any, method_name: str) -> "Series":
         """
         return a op b
 
diff --git a/eland/tests/dataframe/test_query_pytest.py b/eland/tests/dataframe/test_query_pytest.py
index 20a35e6..97a9eeb 100644
--- a/eland/tests/dataframe/test_query_pytest.py
+++ b/eland/tests/dataframe/test_query_pytest.py
@@ -86,10 +86,11 @@ class TestDataFrameQuery(TestData):
         ed_flights = self.ed_flights()
         pd_flights = self.pd_flights()
 
-        assert (
-            pd_flights[pd_flights.OriginAirportID.isin(["LHR", "SYD"])].shape
-            == ed_flights[ed_flights.OriginAirportID.isin(["LHR", "SYD"])].shape
-        )
+        for obj in (["LHR", "SYD"], ("LHR", "SYD"), pd.Series(data=["LHR", "SYD"])):
+            assert (
+                pd_flights[pd_flights.OriginAirportID.isin(obj)].shape
+                == ed_flights[ed_flights.OriginAirportID.isin(obj)].shape
+            )
 
     def test_multiitem_query(self):
         # Examples from:
diff --git a/eland/utils.py b/eland/utils.py
index 8e302e1..4530fcb 100644
--- a/eland/utils.py
+++ b/eland/utils.py
@@ -18,16 +18,21 @@ import re
 import functools
 import warnings
-from typing import Callable, TypeVar
+from typing import Callable, TypeVar, Any, Union, List, cast, Collection
+from collections.abc import Collection as ABCCollection
+import pandas as pd  # type: ignore
 
-F = TypeVar("F")
+RT = TypeVar("RT")
+Item = TypeVar("Item")
 
 
-def deprecated_api(replace_with: str) -> Callable[[F], F]:
-    def wrapper(f: F) -> F:
+def deprecated_api(
+    replace_with: str,
+) -> Callable[[Callable[..., RT]], Callable[..., RT]]:
+    def wrapper(f: Callable[..., RT]) -> Callable[..., RT]:
         @functools.wraps(f)
-        def wrapped(*args, **kwargs):
+        def wrapped(*args: Any, **kwargs: Any) -> RT:
             warnings.warn(
                 f"{f.__name__} is deprecated, use {replace_with} instead",
                 DeprecationWarning,
@@ -39,10 +44,18 @@
     return wrapper
 
 
-def is_valid_attr_name(s):
+def is_valid_attr_name(s: str) -> bool:
     """
     Ensure the given string can be used as attribute on an object instance.
     """
-    return isinstance(s, str) and re.search(
-        string=s, pattern=r"^[a-zA-Z_][a-zA-Z0-9_]*$"
+    return bool(
+        isinstance(s, str) and re.search(string=s, pattern=r"^[a-zA-Z_][a-zA-Z0-9_]*$")
     )
+
+
+def to_list(x: Union[Collection[Any], pd.Series]) -> List[Any]:
+    if isinstance(x, ABCCollection):
+        return list(x)
+    elif isinstance(x, pd.Series):
+        return cast(List[Any], x.to_list())
+    raise NotImplementedError(f"Could not convert {type(x).__name__} into a list")
diff --git a/noxfile.py b/noxfile.py
index db6c28e..a8c4731 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -43,6 +43,7 @@ TYPED_FILES = {
     "eland/index.py",
     "eland/query.py",
     "eland/tasks.py",
+    "eland/utils.py",
     "eland/ml/__init__.py",
     "eland/ml/_model_serializer.py",
     "eland/ml/imported_ml_model.py",
@@ -75,6 +76,7 @@ def lint(session):
             session.error(f"The file {typed_file!r} couldn't be found")
         popen = subprocess.Popen(
             f"mypy --strict {typed_file}",
+            env=session.env,
             shell=True,
             stdout=subprocess.PIPE,
             stderr=subprocess.STDOUT,
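
A minimal usage sketch of the behaviour this patch adds (illustrative only, not part of the
patch itself). It assumes a local Elasticsearch cluster containing the example "flights"
index used by the eland test suite:

    import eland as ed
    import pandas as pd

    # Connect an eland DataFrame to the (assumed) local "flights" index.
    ed_flights = ed.DataFrame("localhost", "flights")

    # Series.isin() now accepts any collection or a pandas.Series, not just a
    # Python list; each iteration below builds the same Elasticsearch terms filter.
    for values in (["LHR", "SYD"], ("LHR", "SYD"), pd.Series(["LHR", "SYD"])):
        print(ed_flights[ed_flights.OriginAirportID.isin(values)].shape)

Internally the new eland.utils.to_list() helper normalizes the accepted collection or
pandas.Series into a plain Python list before it is passed to the IsIn filter.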