Merge pull request #18 from stevedodson/master

Adding DataFrame.hist tests and DataFrame.select_dtypes
2025-07-24 00:00:39 +08:00 · 2019-08-01 12:56:39 +00:00 · 2019-08-01 12:56:39 +00:00 · d34a8365eb
commit d34a8365eb
parent 62d244ff8a 67b7aee9c9
10 changed files with 145 additions and 72 deletions
--- a/eland/dataframe.py
+++ b/eland/dataframe.py
@ -1,9 +1,9 @@
 import sys
 import warnings
 from distutils.version import LooseVersion
 import numpy as np
 import pandas as pd
 from distutils.version import LooseVersion
 from pandas.compat import StringIO
 from pandas.core.common import apply_if_callable, is_bool_indexer
 from pandas.io.common import _expand_user, _stringify_path
@ -11,10 +11,10 @@ from pandas.io.formats import console
 from pandas.io.formats import format as fmt
 from pandas.io.formats.printing import pprint_thing
 import eland.plotting as gfx
 from eland import NDFrame
 from eland import Series
 import eland.plotting as gfx
 class DataFrame(NDFrame):
    # This is effectively 2 constructors
@ -138,6 +138,9 @@ class DataFrame(NDFrame):
        return buf.getvalue()
    def _index_summary(self):
        # Print index summary e.g.
        # Index: 103 entries, 0 to 102
        # Do this by getting head and tail of dataframe
        head = self.head(1)._to_pandas().index[0]
        tail = self.tail(1)._to_pandas().index[0]
        index_summary = ', %s to %s' % (pprint_thing(head),
@ -286,11 +289,11 @@ class DataFrame(NDFrame):
            _buf = StringIO()
        df.to_html(buf=_buf, columns=columns, col_space=col_space, header=header,
-                index=index, na_rep=na_rep, formatters=formatters, float_format=float_format,
+                   index=index, na_rep=na_rep, formatters=formatters, float_format=float_format,
-                sparsify=sparsify, index_names=index_names, justify=justify, max_rows=max_rows,
+                   sparsify=sparsify, index_names=index_names, justify=justify, max_rows=max_rows,
-                max_cols=max_cols, show_dimensions=False, decimal=decimal,
+                   max_cols=max_cols, show_dimensions=False, decimal=decimal,
-                bold_rows=bold_rows, classes=classes, escape=escape, notebook=notebook,
+                   bold_rows=bold_rows, classes=classes, escape=escape, notebook=notebook,
-                border=border, table_id=table_id, render_links=render_links)
+                   border=border, table_id=table_id, render_links=render_links)
        # Our fake dataframe has incorrect number of rows (max_rows*2+1) - write out
        # the correct number of rows
@ -439,6 +442,14 @@ class DataFrame(NDFrame):
            query_compiler=self._query_compiler.squeeze(axis)
        )
    def select_dtypes(self, include=None, exclude=None):
        """
        Return a subset of this frame's columns chosen by dtype.

        The dtype filtering is not done here: it is delegated to the
        ``select_dtypes`` method of the frame returned by
        ``self._empty_pd_df()``, and only the column labels that survive
        that call are used to index this frame.

        Parameters
        ----------
        include, exclude
            Passed through unchanged to ``select_dtypes`` on the frame
            returned by ``self._empty_pd_df()``.

        Returns
        -------
        Whatever ``self._getitem_array`` returns for the selected column
        labels.
        """
        # NOTE(review): presumably _empty_pd_df() is a row-less pandas
        # DataFrame carrying this frame's columns and dtypes - confirm.
        selected_columns = self._empty_pd_df().select_dtypes(
            include=include, exclude=exclude).columns
        return self._getitem_array(selected_columns)
    @property
    def shape(self):
        """
--- a/eland/query_compiler.py
+++ b/eland/query_compiler.py
@ -315,7 +315,7 @@ class ElandQueryCompiler(BaseQueryCompiler):
        if numeric:
            raise NotImplementedError("Not implemented yet...")
-        result._operations.set_columns(key)
+        result._operations.set_columns(list(key))
        return result
--- a/eland/tests/dataframe/test_aggs_pytest.py
+++ b/eland/tests/dataframe/test_aggs_pytest.py
@ -0,0 +1,23 @@
 # File called _pytest for PyCharm compatibility
 import numpy as np
 import pandas as pd
 from eland.tests.common import TestData
 class TestDataFrameAggs(TestData):
    def test_to_aggs1(self):
        """
        Print the ['sum', 'min'] aggregation of the np.number columns.

        The same steps are run first on the frame from ``self.pd_flights()``
        and then on the frame from ``self.ed_flights()``. Nothing is
        asserted; the test only fails if one of the calls raises.
        """
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()
        # Identical pipeline for both frames, in this order, so the two
        # printed results can be compared by eye.
        for flights in (pd_flights, ed_flights):
            sum_min = flights.select_dtypes(include=[np.number]).agg(['sum', 'min'])
            print(type(sum_min))
            # Lift the display limits so the whole result is printed
            with pd.option_context('display.max_rows', None, 'display.max_columns', None):
                print(sum_min)
--- a/eland/tests/dataframe/test_describe_pytest.py
+++ b/eland/tests/dataframe/test_describe_pytest.py
@ -36,6 +36,7 @@ class TestDataFrameDescribe(TestData):
        ed_flights = self.ed_flights().head()
        pd_describe = pd_flights.describe()
        # This fails as we cannot run 'describe' on a truncated ed dataframe
        ed_describe = ed_flights.describe()
        print(pd_describe)
--- a/eland/tests/dataframe/test_hist_pytest.py
+++ b/eland/tests/dataframe/test_hist_pytest.py
@ -1,5 +1,6 @@
 # File called _pytest for PyCharm compatibility
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 from pandas.util.testing import assert_almost_equal
@ -9,7 +10,7 @@ from eland.tests.common import TestData
 class TestDataFrameHist(TestData):
-    def test_to_hist1(self):
+    def test_hist1(self):
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()
--- a/eland/tests/dataframe/test_info_pytest.py
+++ b/eland/tests/dataframe/test_info_pytest.py
@ -22,4 +22,6 @@ class TestDataFrameInfo(TestData):
        assert pd_buf_lines[1:] == ed_buf_lines[1:]
        # NOTE: info does not work on truncated data frames (e.g. head/tail) TODO
        print(self.ed_ecommerce().info())
--- a/eland/tests/dataframe/test_select_dtypes_pytest.py
+++ b/eland/tests/dataframe/test_select_dtypes_pytest.py
@ -0,0 +1,31 @@
 # File called _pytest for PyCharm compatibility
 import pandas as pd
 import numpy as np
 from eland.tests.common import TestData
 from eland.tests.common import (
    assert_pandas_eland_frame_equal
 )
 class TestDataFrameSelectDTypes(TestData):
    def test_select_dtypes1(self):
        """
        ``select_dtypes(include=[np.number])`` must give matching frames.

        Compares the first 103 rows of the result for ``self.pd_flights()``
        against the first 103 rows of the result for ``self.ed_flights()``.
        """
        ed_source = self.ed_flights()
        pd_source = self.pd_flights()
        ed_selected = ed_source.select_dtypes(include=[np.number])
        pd_selected = pd_source.select_dtypes(include=[np.number])
        pd_head = pd_selected.head(103)
        ed_head = ed_selected.head(103)
        assert_pandas_eland_frame_equal(pd_head, ed_head)
    def test_select_dtypes2(self):
        """
        ``select_dtypes(exclude=[np.number])`` must give matching frames.

        Compares the first 103 rows of the result for ``self.pd_flights()``
        against the first 103 rows of the result for ``self.ed_flights()``.
        """
        ed_source = self.ed_flights()
        pd_source = self.pd_flights()
        ed_selected = ed_source.select_dtypes(exclude=[np.number])
        pd_selected = pd_source.select_dtypes(exclude=[np.number])
        pd_head = pd_selected.head(103)
        ed_head = ed_selected.head(103)
        assert_pandas_eland_frame_equal(pd_head, ed_head)
--- a/eland/tests/plotting/test_dataframe_hist_pytest.ipynb
+++ b/eland/tests/plotting/test_dataframe_hist_pytest.ipynb
--- a/eland/tests/plotting/test_dataframe_hist_pytest.py
+++ b/eland/tests/plotting/test_dataframe_hist_pytest.py
@ -2,67 +2,18 @@
 from eland.tests.common import TestData
-from pandas.util.testing import assert_series_equal
+from matplotlib.testing.decorators import check_figures_equal
-import numpy as np
+@check_figures_equal(extensions=['png'])
-import pandas as pd
+def test_plot(fig_test, fig_ref):
-
+    test_data = TestData()
 class TestDataFrameHist(TestData):
    def test_dataframe_hist1(self):
        test_data = TestData()
        pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
        ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
        """
        pd_flights.hist(figsize=[10, 10])
        #ed_flights.hist(figsize=[10, 10])
        pd_min = pd_flights['DistanceKilometers'].min()
        pd_max = pd_flights['DistanceKilometers'].max()
        #ed_min = ed_flights['DistanceKilometers'].min()
        #ed_max = ed_flights['DistanceKilometers'].max()
        #num_bins = 10.0
        #bins = np.linspace(ed_min, ed_max, num=num_bins+1)
        #print(bins)
        #print(np.diff(bins).mean())
        #hist = ed_flights['DistanceKilometers'].hist(np.diff(bins).mean())
        x = [2956.,  768.,  719., 2662., 2934., 1320.,  641.,  529.,  426.,  104.]
        bins = [0., 1988.14823146, 3976.29646292, 5964.44469437, 7952.59292583, 9940.74115729, 11928.88938875, 13917.03762021, 15905.18585166,17893.33408312,19881.48231458]
        print(len(x))
        print(len(bins))
        a = bins[0:10]
        print(np.histogram(a, weights=x, bins=bins))
        #counts, bins = np.histogram(data)
        #plt.hist(bins[:-1], bins, weights=counts)
        """
        h1 = np.histogram(pd_flights['DistanceKilometers'], 10)
        h2 = np.histogram(pd_flights['FlightDelayMin'], 10)
        l1 = list(h1[0])
        l2 = list(h2[0])
        l1.append(0)
        l2.append(0)
        d = {'DistanceKilometers': h1[1],
             'FlightDelayMin': h2[1]}
        df = pd.DataFrame(data=d)
        df.hist(weights=[l1, l2])
    pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
    ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
    pd_ax = fig_ref.subplots()
    pd_flights.hist(ax=pd_ax)
    ed_ax = fig_test.subplots()
    ed_flights.hist(ax=ed_ax)
--- a/eland/tests/query/test_count_pytest.py
+++ b/eland/tests/query/test_count_pytest.py
@ -13,15 +13,15 @@ class TestQueryCopy(TestData):
        q.exists('field_a')
        q.exists('field_b', must=False)
-        print(q.to_query())
+        print(q.to_search_body())
        q1 = Query(q)
        q.exists('field_c', must=False)
        q1.exists('field_c1', must=False)
-        print(q.to_query())
+        print(q.to_search_body())
-        print(q1.to_query())
+        print(q1.to_search_body())