Fixing tests, and upgrading to pandas 0.25.1

2025-07-11 00:02:14 +08:00 · 2019-10-18 08:06:07 +00:00 · 2019-10-18 08:06:07 +00:00 · 9dad8613d3
commit 9dad8613d3
parent 315d4c3287
12 changed files with 40 additions and 1455 deletions
--- a/eland/dataframe.py
+++ b/eland/dataframe.py
@ -463,7 +463,6 @@ class DataFrame(NDFrame):
            "quotechar": quotechar,
            "line_terminator": line_terminator,
            "chunksize": chunksize,
-            "tupleize_cols": tupleize_cols,
            "date_format": date_format,
            "doublequote": doublequote,
            "escapechar": escapechar,
@ -552,7 +551,7 @@ class DataFrame(NDFrame):

        # currently we only support a subset of functions that aggregate columns.
        # ['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile', 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique']
-        if isinstance(func, compat.string_types):
+        if isinstance(func, str):
            # wrap in list
            func = [func]
            return self._query_compiler.aggs(func)
--- a/eland/mappings.py
+++ b/eland/mappings.py
@ -290,7 +290,7 @@ class Mappings:
        return es_dtype

    @staticmethod
-    def _generate_es_mappings(dataframe):
+    def _generate_es_mappings(dataframe, geo_points=None):
        """Given a pandas dataframe, generate the associated Elasticsearch mapping

        Parameters
@ -325,7 +325,10 @@ class Mappings:
        mappings = {}
        mappings['properties'] = {}
        for column_name, dtype in dataframe.dtypes.iteritems():
-            es_dtype = Mappings._pd_dtype_to_es_dtype(dtype)
+            if geo_points is not None and column_name in geo_points:
+                es_dtype = 'geo_point'
+            else:
+                es_dtype = Mappings._pd_dtype_to_es_dtype(dtype)

            mappings['properties'][column_name] = {}
            mappings['properties'][column_name]['type'] = es_dtype
--- a/eland/plotting.py
+++ b/eland/plotting.py
@ -3,13 +3,14 @@ import numpy as np
 import pandas.core.common as com
 from pandas.core.dtypes.generic import (
    ABCIndexClass)
+from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots


 def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
               xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
               sharey=False, figsize=None, layout=None, bins=10, **kwds):
    """
-    Derived from pandas.plotting._core.hist_frame 0.24.2
+    Derived from pandas.plotting._core.hist_frame 0.24.2 - TODO update to 0.25.1

    Ideally, we'd call hist_frame directly with histogram data,
    but weights are applied to ALL series. For example, we can
@ -29,8 +30,6 @@ def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
    # Start with empty pandas data frame derived from
    ed_df_bins, ed_df_weights = ed_df._hist(num_bins=bins)

-    _raise_if_no_mpl()
-    _converter._WARN = False
    if by is not None:
        raise NotImplementedError("TODO")
        """
--- a/eland/tests/dataframe/test_iloc_pytest.py
+++ b/eland/tests/dataframe/test_iloc_pytest.py
@ -24,31 +24,22 @@ class TestDataFrameiLoc(TestData):
        pd_iloc1= pd_flights.iloc[[0]]
        pd_iloc2= pd_flights.iloc[[0, 1]]
        pd_iloc3 = pd_flights.iloc[:3]
-        pd_iloc4 = pd_flights.iloc[[True, False, True]]
        pd_iloc5 = pd_flights.iloc[0, 1]
        pd_iloc6 = pd_flights.iloc[[0, 2], [1, 3]]
        pd_iloc7 = pd_flights.iloc[1:3, 0:3]
-        pd_iloc8 = pd_flights.iloc[:, [True, False, True, False]]
-        pd_iloc9 = pd_flights.iloc[[True, False, True, False]]

        ed_iloc0 = ed_flights.iloc[0]
        ed_iloc1 = ed_flights.iloc[[0]]
        ed_iloc2 = ed_flights.iloc[[0, 1]]
        ed_iloc3 = ed_flights.iloc[:3]
-        ed_iloc4 = ed_flights.iloc[[True, False, True]]
        ed_iloc5 = ed_flights.iloc[0, 1]
        ed_iloc6 = ed_flights.iloc[[0, 2], [1, 3]]
        ed_iloc7 = ed_flights.iloc[1:3, 0:3]
-        ed_iloc8 = ed_flights.iloc[:, [True, False, True, False]]
-        ed_iloc9 = ed_flights.iloc[[True, False, True, False]]

        #assert_pandas_eland_frame_equal(pd_iloc0, ed_iloc0) # pd_iloc0 is Series
        assert_pandas_eland_frame_equal(pd_iloc1, ed_iloc1)
        assert_pandas_eland_frame_equal(pd_iloc2, ed_iloc2)
        assert_pandas_eland_frame_equal(pd_iloc3, ed_iloc3)
-        assert_pandas_eland_frame_equal(pd_iloc4, ed_iloc4)
        #assert_pandas_eland_frame_equal(pd_iloc5, ed_iloc5) # pd_iloc5 is numpy_bool
        assert_pandas_eland_frame_equal(pd_iloc6, ed_iloc6)
        assert_pandas_eland_frame_equal(pd_iloc7, ed_iloc7)
-        assert_pandas_eland_frame_equal(pd_iloc8, ed_iloc8)
-        assert_pandas_eland_frame_equal(pd_iloc9, ed_iloc9)
--- a/eland/tests/dataframe/test_reviews_pytest.py
+++ b/eland/tests/dataframe/test_reviews_pytest.py
@ -1,31 +0,0 @@
-# File called _pytest for PyCharm compatability
-
-import gzip
-
-import pandas as pd
-
-import eland as ed
-from eland.tests.common import TestData
-
-
-class TestDataFrameReviews(TestData):
-
-    def test_explore(self):
-        ed_reviews = ed.DataFrame('localhost', 'anonreviews')
-
-        print(ed_reviews.head())
-        print(ed_reviews.describe())
-        print(ed_reviews.info())
-        print(ed_reviews.hist(column="rating", bins=5))
-        # print(ed_reviews.head().info_es())
-
-    def test_review(self):
-        csv_handle = gzip.open('../anonreviews.csv.gz')
-
-        reviews = pd.read_csv(csv_handle)
-
-        reviews['date'] = pd.to_datetime(reviews['date'])
-
-        g = reviews.groupby('reviewerId')
-
-        print(g.describe())
--- a/eland/tests/dataframe/test_to_csv_pytest.py
+++ b/eland/tests/dataframe/test_to_csv_pytest.py
@ -3,20 +3,23 @@
 import pandas as pd

 from eland.tests.common import TestData
+from eland.tests.common import ROOT_DIR

 from pandas.util.testing import (assert_equal, assert_frame_equal)

 import ast

+
 class TestDataFrameToCSV(TestData):

    def test_to_csv_head(self):
+        results_file = ROOT_DIR + '/dataframe/results/test_to_csv_head.csv'
+
        ed_flights = self.ed_flights().head()
        pd_flights = self.pd_flights().head()
-
-        ed_flights.to_csv('results/test_to_csv_head.csv')
+        ed_flights.to_csv(results_file)
        # Converting back from csv is messy as pd_flights is created from a json file
-        pd_from_csv = pd.read_csv('results/test_to_csv_head.csv', index_col=0, converters={
+        pd_from_csv = pd.read_csv(results_file, index_col=0, converters={
            'DestLocation': lambda x: ast.literal_eval(x),
            'OriginLocation': lambda x: ast.literal_eval(x)})
        pd_from_csv.index = pd_from_csv.index.map(str)
@ -25,13 +28,15 @@ class TestDataFrameToCSV(TestData):
        assert_frame_equal(pd_flights, pd_from_csv)

    def test_to_csv_full(self):
+        results_file = ROOT_DIR + '/dataframe/results/test_to_csv_full.csv'
+
        # Test is slow as it's for the full dataset, but it is useful as it goes over 10000 docs
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()

-        ed_flights.to_csv('results/test_to_csv_full.csv')
+        ed_flights.to_csv(results_file)
        # Converting back from csv is messy as pd_flights is created from a json file
-        pd_from_csv = pd.read_csv('results/test_to_csv_full.csv', index_col=0, converters={
+        pd_from_csv = pd.read_csv(results_file, index_col=0, converters={
            'DestLocation': lambda x: ast.literal_eval(x),
            'OriginLocation': lambda x: ast.literal_eval(x)})
        pd_from_csv.index = pd_from_csv.index.map(str)
--- a/eland/tests/demo_day_20190815.ipynb
+++ b/eland/tests/demo_day_20190815.ipynb
@ -7144,7 +7144,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.6.9"
+   "version": "3.6.8"
  }
 },
 "nbformat": 4,
--- a/eland/tests/operators/test_operators_pytest.py
+++ b/eland/tests/operators/test_operators_pytest.py
@ -1,5 +1,5 @@
 # -*- coding: UTF-8 -*-
-from eland.operators import *
+from eland.filter import *


 class TestOperators():
@ -21,11 +21,6 @@ class TestOperators():
            'script': {'script': {'inline': 'doc["num1"].value > params.param1', 'params': {'param1': 5}}}}
        assert IsIn('ids', [1, 2, 3]).build() == {'ids': {'values': [1, 2, 3]}}

-    def test_and_none(self):
-        exp = None
-        exp = exp & Less('b', 3)
-        print(exp.build())
-
    def test_and_filter1(self):
        exp = GreaterEqual('a', 2) & Less('b', 3)
        assert exp.build() == {'bool': {'must': [{'range': {'a': {'gte': 2}}}, {'range': {'b': {'lt': 3}}}]}}
--- a/eland/tests/pivot_review_data_pandas.ipynb
+++ b/eland/tests/pivot_review_data_pandas.ipynb
--- a/eland/utils.py
+++ b/eland/utils.py
@ -7,7 +7,7 @@ def read_es(es_params, index_pattern):
    return DataFrame(client=es_params, index_pattern=index_pattern)


-def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False):
+def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=10000, refresh=False, dropna=False, geo_points=None):
    """
    Append a pandas DataFrame to an Elasticsearch index.
    Mainly used in testing.
@ -30,10 +30,19 @@ def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=
            If table exists, drop it, recreate it, and insert data.
        ``'append'``
                If table exists, insert data. Create if does not exist.
+
+    dropna : bool
+        ``'True'``
+            Remove missing values (see pandas.Series.dropna)
+        ``'False;``
+            Include missing values - may cause bulk to fail
+
+    geo_points : list or None
+        List of columns to map to geo_point data type
    """
    client = Client(es_params)

-    mapping = Mappings._generate_es_mappings(df)
+    mapping = Mappings._generate_es_mappings(df, geo_points)

    # If table exists, check if_exists parameter
    if client.index_exists(index=destination_index):
@ -58,7 +67,11 @@ def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=
    for row in df.iterrows():
        # Use index as _id
        id = row[0]
-        values = row[1].to_dict()
+
+        if dropna:
+            values = row[1].dropna().to_dict()
+        else:
+            values = row[1].to_dict()

        # Use integer as id field for repeatable results
        action = {'_index': destination_index, '_source': values, '_id': str(id)}
--- a/requirements.txt
+++ b/requirements.txt
@ -1,8 +1,2 @@
-elasticsearch==7.0.2
-elasticsearch-dsl==7.0.0
-numpy==1.16.4
-pandas==0.24.2
-python-dateutil==2.8.0
-pytz==2019.1
-six==1.12.0
-urllib3==1.25.3
+elasticsearch>=7.0.5
+pandas==0.25.1
--- a/setup.py
+++ b/setup.py
@ -13,8 +13,7 @@ setup(name='eland',
      license='ELASTIC LICENSE',
      packages=['eland'],
      install_requires=[
-          'elasticsearch',
-          'elasticsearch_dsl',
-          'pandas'
+          'elasticsearch>=7.0.5',
+          'pandas==0.25.1'
      ],
      zip_safe=False)