Update supported Pandas to v1.0

Daniel Mesejo-León 2020-03-27 18:21:15 +01:00 committed by GitHub
parent 0c1d7222fe
commit e27a508c59
11 changed files with 249 additions and 155 deletions

.gitignore
View File

@@ -43,6 +43,7 @@ ipython_config.py
# Environments
.env
.venv
.nox
env/
venv/
ENV/

View File

@@ -1,5 +1,5 @@
elasticsearch>=7.0.5
pandas==0.25.3
pandas>=1
matplotlib
pytest>=5.2.1
git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master

File diff suppressed because one or more lines are too long

View File

@@ -22,7 +22,7 @@ from pandas.core.common import apply_if_callable, is_bool_indexer
from pandas.core.computation.eval import eval
from pandas.core.dtypes.common import is_list_like
from pandas.core.indexing import check_bool_indexer
from pandas.io.common import _expand_user, _stringify_path
from pandas.io.common import _expand_user, stringify_path
from pandas.io.formats import console
from pandas.io.formats import format as fmt
from pandas.io.formats.printing import pprint_thing
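
pandas 1.0 promotes the path helper to a public name: `_stringify_path` becomes `stringify_path` in `pandas.io.common`, which is all the import change above (and the matching call-site hunks further down) does. As a point of reference only, and not something this commit adds, a fallback import that tolerates both versions could look like:

# Illustrative fallback import (not in this commit): accept both
# pandas 0.25.x (_stringify_path) and pandas 1.0+ (stringify_path).
try:
    from pandas.io.common import stringify_path  # pandas >= 1.0
except ImportError:  # pragma: no cover - older pandas
    from pandas.io.common import _stringify_path as stringify_path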
@@ -249,12 +249,19 @@ class DataFrame(NDFrame):
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=['Origin', 'Dest'])
>>> df.tail()
Origin Dest
13054 Pisa International Airport Xi'an Xianyang International Airport
13055 Winnipeg / James Armstrong Richardson Internat... Zurich Airport
13056 Licenciado Benito Juarez International Airport Ukrainka Air Base
13057 Itami Airport Ministro Pistarini International Airport
13058 Adelaide International Airport Washington Dulles International Airport
Origin \\
13054 Pisa International Airport
13055 Winnipeg / James Armstrong Richardson International Airport
13056 Licenciado Benito Juarez International Airport
13057 Itami Airport
13058 Adelaide International Airport
<BLANKLINE>
Dest
13054 Xi'an Xianyang International Airport
13055 Zurich Airport
13056 Ukrainka Air Base
13057 Ministro Pistarini International Airport
13058 Washington Dulles International Airport
<BLANKLINE>
[5 rows x 2 columns]
"""
@@ -602,8 +609,10 @@ class DataFrame(NDFrame):
<class 'eland.dataframe.DataFrame'>
Index: 4675 entries, 0 to 4674
Data columns (total 2 columns):
customer_first_name 4675 non-null object
geoip.city_name 4094 non-null object
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 customer_first_name 4675 non-null object
1 geoip.city_name 4094 non-null object
dtypes: object(2)
memory usage: ...
"""
@@ -618,6 +627,7 @@ class DataFrame(NDFrame):
return
cols = self.columns
col_count = len(self.columns)
# hack
if max_cols is None:
@@ -637,30 +647,74 @@ class DataFrame(NDFrame):
def _verbose_repr():
lines.append(f"Data columns (total {len(self.columns)} columns):")
space = max(len(pprint_thing(k)) for k in self.columns) + 4
id_head = " # "
column_head = "Column"
col_space = 2
max_col = max(len(pprint_thing(k)) for k in cols)
len_column = len(pprint_thing(column_head))
space = max(max_col, len_column) + col_space
max_id = len(pprint_thing(col_count))
len_id = len(pprint_thing(id_head))
space_num = max(max_id, len_id) + col_space
counts = None
tmpl = "{count}{dtype}"
header = _put_str(id_head, space_num) + _put_str(column_head, space)
if show_counts:
counts = self.count()
if len(cols) != len(counts): # pragma: no cover
raise AssertionError(
f"Columns must equal counts "
f"({len(cols):d} != {len(counts):d})"
"Columns must equal counts "
"({cols:d} != {counts:d})".format(
cols=len(cols), counts=len(counts)
)
)
count_header = "Non-Null Count"
len_count = len(count_header)
non_null = " non-null"
max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null)
space_count = max(len_count, max_count) + col_space
count_temp = "{count}" + non_null
else:
count_header = ""
space_count = len(count_header)
len_count = space_count
count_temp = "{count}"
dtype_header = "Dtype"
len_dtype = len(dtype_header)
max_dtypes = max(len(pprint_thing(k)) for k in self.dtypes)
space_dtype = max(len_dtype, max_dtypes)
header += _put_str(count_header, space_count) + _put_str(
dtype_header, space_dtype
)
lines.append(header)
lines.append(
_put_str("-" * len_id, space_num)
+ _put_str("-" * len_column, space)
+ _put_str("-" * len_count, space_count)
+ _put_str("-" * len_dtype, space_dtype)
)
tmpl = "{count} non-null {dtype}"
dtypes = self.dtypes
for i, col in enumerate(self.columns):
dtype = dtypes.iloc[i]
col = pprint_thing(col)
line_no = _put_str(" {num}".format(num=i), space_num)
count = ""
if show_counts:
count = counts.iloc[i]
lines.append(
_put_str(col, space) + tmpl.format(count=count, dtype=dtype)
line_no
+ _put_str(col, space)
+ _put_str(count_temp.format(count=count), space_count)
+ _put_str(dtype, space_dtype)
)
def _non_verbose_repr():
@@ -769,7 +823,7 @@ class DataFrame(NDFrame):
df = self._build_repr(max_rows + 1)
if buf is not None:
_buf = _expand_user(_stringify_path(buf))
_buf = _expand_user(stringify_path(buf))
else:
_buf = StringIO()
@@ -866,7 +920,7 @@ class DataFrame(NDFrame):
df = self._build_repr(max_rows + 1)
if buf is not None:
_buf = _expand_user(_stringify_path(buf))
_buf = _expand_user(stringify_path(buf))
else:
_buf = StringIO()
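
The rewritten `_verbose_repr` above reproduces pandas 1.0's four-column `info()` layout (`#`, `Column`, `Non-Null Count`, `Dtype`) and leans on a `_put_str` padding helper for every cell. For context only, a sketch of that helper, assuming it behaves like the one in pandas 1.0:

def _put_str(s, space):
    # Truncate to the column width, then left-justify, e.g.
    # _put_str("Column", 8) -> "Column  ".
    return str(s)[:space].ljust(space)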

View File

@@ -238,16 +238,16 @@ class NDFrame(ABC):
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.min()
AvgTicketPrice 100.020531
Cancelled 0.000000
DistanceKilometers 0.000000
DistanceMiles 0.000000
FlightDelay 0.000000
FlightDelayMin 0.000000
FlightTimeHour 0.000000
FlightTimeMin 0.000000
dayOfWeek 0.000000
dtype: float64
AvgTicketPrice 100.021
Cancelled False
DistanceKilometers 0
DistanceMiles 0
FlightDelay False
FlightDelayMin 0
FlightTimeHour 0
FlightTimeMin 0
dayOfWeek 0
dtype: object
"""
return self._query_compiler.min(numeric_only=numeric_only)
@@ -270,16 +270,16 @@ class NDFrame(ABC):
--------
>>> df = ed.DataFrame('localhost', 'flights')
>>> df.max()
AvgTicketPrice 1199.729004
Cancelled 1.000000
DistanceKilometers 19881.482422
DistanceMiles 12353.780273
FlightDelay 1.000000
FlightDelayMin 360.000000
FlightTimeHour 31.715034
FlightTimeMin 1902.901978
dayOfWeek 6.000000
dtype: float64
AvgTicketPrice 1199.73
Cancelled True
DistanceKilometers 19881.5
DistanceMiles 12353.8
FlightDelay True
FlightDelayMin 360
FlightTimeHour 31.715
FlightTimeMin 1902.9
dayOfWeek 6
dtype: object
"""
return self._query_compiler.max(numeric_only=numeric_only)
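
The new doctest output above reflects a behavioural change rather than just pandas 1.0 formatting: with `keep_original_dtype=True` (see the `Operations` hunks below), boolean and integer fields come back in their original domain, so the result mixes dtypes and pandas stores it in an object Series instead of float64. A small illustration, not taken from the commit:

import numpy as np
import pandas as pd

# Mixing booleans, integers and floats prevents a common numeric dtype,
# so pandas falls back to an object-dtype Series.
results = {
    "AvgTicketPrice": np.float64(1199.73),
    "Cancelled": np.bool_(True),
    "dayOfWeek": np.int64(6),
}
print(pd.Series(results).dtype)  # object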

View File

@@ -126,10 +126,14 @@ class Operations:
return self._metric_aggs(query_compiler, "sum", numeric_only=numeric_only)
def max(self, query_compiler, numeric_only=True):
return self._metric_aggs(query_compiler, "max", numeric_only=numeric_only)
return self._metric_aggs(
query_compiler, "max", numeric_only=numeric_only, keep_original_dtype=True
)
def min(self, query_compiler, numeric_only=True):
return self._metric_aggs(query_compiler, "min", numeric_only=numeric_only)
return self._metric_aggs(
query_compiler, "min", numeric_only=numeric_only, keep_original_dtype=True
)
def nunique(self, query_compiler):
return self._metric_aggs(
@@ -142,13 +146,22 @@ class Operations:
def hist(self, query_compiler, bins):
return self._hist_aggs(query_compiler, bins)
def _metric_aggs(self, query_compiler, func, field_types=None, numeric_only=None):
def _metric_aggs(
self,
query_compiler,
func,
field_types=None,
numeric_only=None,
keep_original_dtype=False,
):
"""
Parameters
----------
field_types: str, default None
If `aggregatable`, use only field_names whose fields in Elasticsearch are aggregatable.
If `None`, use only numeric fields.
keep_original_dtype : bool, default False
If `True`, the output values keep the same domain as the input values, e.g. booleans stay booleans.
Returns
-------
@@ -235,6 +248,10 @@ class Operations:
results[field] = elasticsearch_date_to_pandas_date(
response["aggregations"][field]["value_as_string"], date_format
)
elif keep_original_dtype:
results[field] = pd_dtype.type(
response["aggregations"][field]["value"]
)
else:
results[field] = response["aggregations"][field]["value"]
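
Elasticsearch reports the min/max of a boolean field as a number (0.0/1.0), so the new `keep_original_dtype` branch routes the raw aggregation value through the column's numpy dtype to restore its original domain. A quick illustration of what `pd_dtype.type(...)` does, not part of the commit:

import numpy as np

# A numpy dtype's `type` attribute is the scalar constructor for that dtype,
# so it converts the raw float returned by Elasticsearch back into the
# field's domain.
print(np.dtype("bool").type(1.0))     # True (numpy.bool_)
print(np.dtype("int64").type(360.0))  # 360 (numpy.int64)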

View File

@@ -35,7 +35,7 @@ from io import StringIO
import numpy as np
import pandas as pd
from pandas.io.common import _expand_user, _stringify_path
from pandas.io.common import _expand_user, stringify_path
import eland.plotting
from eland import NDFrame
@@ -365,7 +365,7 @@ class Series(NDFrame):
temp_series = self._build_repr(max_rows + 1)
if buf is not None:
_buf = _expand_user(_stringify_path(buf))
_buf = _expand_user(stringify_path(buf))
else:
_buf = StringIO()

View File

@@ -20,7 +20,7 @@ def blacken(session):
def lint(session):
session.install("black", "flake8")
session.run("black", "--check", "--target-version=py36", *SOURCE_FILES)
session.run("flake8", "--ignore=E501,W503,E402,E712", *SOURCE_FILES)
session.run("flake8", "--ignore=W291,E501,W503,E402,E712", *SOURCE_FILES)
@nox.session(python=["3.6", "3.7", "3.8"])

View File

@@ -1,5 +1,5 @@
elasticsearch>=7.0.5
pandas==0.25.3
pandas>=1
matplotlib
pytest>=5.2.1
nbval

View File

@@ -1,3 +1,3 @@
elasticsearch>=7.0.5
pandas==0.25.3
pandas>=1
matplotlib

View File

@@ -187,6 +187,6 @@ setup(
classifiers=CLASSIFIERS,
keywords="elastic eland pandas python",
packages=find_packages(include=["eland", "eland.*"]),
install_requires=["elasticsearch>=7.0.5, <8", "pandas==0.25.3", "matplotlib"],
install_requires=["elasticsearch>=7.0.5, <8", "pandas>=1", "matplotlib"],
python_requires=">=3.6",
)
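
The dependency pin moves from the exact `pandas==0.25.3` to the open-ended `pandas>=1`, kept consistent across `setup.py` and the requirements files above. As an aside, and not something the commit ships, one way to sanity-check that an environment satisfies the relaxed constraints:

import pkg_resources

# Raises DistributionNotFound or VersionConflict if the installed packages
# do not satisfy the updated constraints.
for requirement in ("elasticsearch>=7.0.5,<8", "pandas>=1", "matplotlib"):
    pkg_resources.require(requirement)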