Merge pull request #18 from stevedodson/master

Adding DataFrame.hist tests and DataFrame.select_dtypes
This commit is contained in:
stevedodson 2019-08-01 12:56:39 +00:00 committed by GitHub
commit d34a8365eb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 145 additions and 72 deletions

View File

@ -1,9 +1,9 @@
import sys import sys
import warnings import warnings
from distutils.version import LooseVersion
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from distutils.version import LooseVersion
from pandas.compat import StringIO from pandas.compat import StringIO
from pandas.core.common import apply_if_callable, is_bool_indexer from pandas.core.common import apply_if_callable, is_bool_indexer
from pandas.io.common import _expand_user, _stringify_path from pandas.io.common import _expand_user, _stringify_path
@ -11,10 +11,10 @@ from pandas.io.formats import console
from pandas.io.formats import format as fmt from pandas.io.formats import format as fmt
from pandas.io.formats.printing import pprint_thing from pandas.io.formats.printing import pprint_thing
import eland.plotting as gfx
from eland import NDFrame from eland import NDFrame
from eland import Series from eland import Series
import eland.plotting as gfx
class DataFrame(NDFrame): class DataFrame(NDFrame):
# This is effectively 2 constructors # This is effectively 2 constructors
@ -138,6 +138,9 @@ class DataFrame(NDFrame):
return buf.getvalue() return buf.getvalue()
def _index_summary(self): def _index_summary(self):
# Print index summary e.g.
# Index: 103 entries, 0 to 102
# Do this by getting head and tail of dataframe
head = self.head(1)._to_pandas().index[0] head = self.head(1)._to_pandas().index[0]
tail = self.tail(1)._to_pandas().index[0] tail = self.tail(1)._to_pandas().index[0]
index_summary = ', %s to %s' % (pprint_thing(head), index_summary = ', %s to %s' % (pprint_thing(head),
@ -286,11 +289,11 @@ class DataFrame(NDFrame):
_buf = StringIO() _buf = StringIO()
df.to_html(buf=_buf, columns=columns, col_space=col_space, header=header, df.to_html(buf=_buf, columns=columns, col_space=col_space, header=header,
index=index, na_rep=na_rep, formatters=formatters, float_format=float_format, index=index, na_rep=na_rep, formatters=formatters, float_format=float_format,
sparsify=sparsify, index_names=index_names, justify=justify, max_rows=max_rows, sparsify=sparsify, index_names=index_names, justify=justify, max_rows=max_rows,
max_cols=max_cols, show_dimensions=False, decimal=decimal, max_cols=max_cols, show_dimensions=False, decimal=decimal,
bold_rows=bold_rows, classes=classes, escape=escape, notebook=notebook, bold_rows=bold_rows, classes=classes, escape=escape, notebook=notebook,
border=border, table_id=table_id, render_links=render_links) border=border, table_id=table_id, render_links=render_links)
# Our fake dataframe has incorrect number of rows (max_rows*2+1) - write out # Our fake dataframe has incorrect number of rows (max_rows*2+1) - write out
# the correct number of rows # the correct number of rows
@ -439,6 +442,14 @@ class DataFrame(NDFrame):
query_compiler=self._query_compiler.squeeze(axis) query_compiler=self._query_compiler.squeeze(axis)
) )
def select_dtypes(self, include=None, exclude=None):
    """
    Return a subset of this DataFrame's columns chosen by dtype.

    The column selection itself is delegated to pandas: it is computed on
    the frame returned by ``self._empty_pd_df()`` and the surviving column
    labels are then handed to ``self._getitem_array``.

    Parameters
    ----------
    include, exclude
        Forwarded unchanged to ``pandas.DataFrame.select_dtypes``.

    Returns
    -------
    Whatever ``self._getitem_array`` returns for the selected columns.
    """
    # Let pandas decide which columns match, using an empty frame
    # (presumably one that mirrors this frame's columns - confirm in _empty_pd_df)
    matching_columns = self._empty_pd_df().select_dtypes(include=include, exclude=exclude).columns
    return self._getitem_array(matching_columns)
@property @property
def shape(self): def shape(self):
""" """

View File

@ -315,7 +315,7 @@ class ElandQueryCompiler(BaseQueryCompiler):
if numeric: if numeric:
raise NotImplementedError("Not implemented yet...") raise NotImplementedError("Not implemented yet...")
result._operations.set_columns(key) result._operations.set_columns(list(key))
return result return result

View File

@ -0,0 +1,23 @@
# File called _pytest for PyCharm compatibility
import numpy as np
import pandas as pd
from eland.tests.common import TestData
class TestDataFrameAggs(TestData):
    """Prints 'sum' and 'min' aggregations of the numeric flights columns."""

    def test_to_aggs1(self):
        # No assertions here: the pandas result and then the eland result
        # are printed in full so they can be compared by eye.
        for flights in (self.pd_flights(), self.ed_flights()):
            sum_min = flights.select_dtypes(include=[np.number]).agg(['sum', 'min'])

            print(type(sum_min))

            # Lift the display limits so no rows or columns are elided
            with pd.option_context('display.max_rows', None, 'display.max_columns', None):
                print(sum_min)

View File

@ -36,6 +36,7 @@ class TestDataFrameDescribe(TestData):
ed_flights = self.ed_flights().head() ed_flights = self.ed_flights().head()
pd_describe = pd_flights.describe() pd_describe = pd_flights.describe()
# This fails as we cannot run 'describe' on a truncated ed dataframe
ed_describe = ed_flights.describe() ed_describe = ed_flights.describe()
print(pd_describe) print(pd_describe)

View File

@ -1,5 +1,6 @@
# File called _pytest for PyCharm compatibility # File called _pytest for PyCharm compatibility
import matplotlib.pyplot as plt
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from pandas.util.testing import assert_almost_equal from pandas.util.testing import assert_almost_equal
@ -9,7 +10,7 @@ from eland.tests.common import TestData
class TestDataFrameHist(TestData): class TestDataFrameHist(TestData):
def test_to_hist1(self): def test_hist1(self):
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
ed_flights = self.ed_flights() ed_flights = self.ed_flights()

View File

@ -22,4 +22,6 @@ class TestDataFrameInfo(TestData):
assert pd_buf_lines[1:] == ed_buf_lines[1:] assert pd_buf_lines[1:] == ed_buf_lines[1:]
# NOTE: info does not work on truncated data frames (e.g. head/tail) TODO
print(self.ed_ecommerce().info()) print(self.ed_ecommerce().info())

View File

@ -0,0 +1,31 @@
# File called _pytest for PyCharm compatibility
import pandas as pd
import numpy as np
from eland.tests.common import TestData
from eland.tests.common import (
assert_pandas_eland_frame_equal
)
class TestDataFrameSelectDTypes(TestData):
    """Checks that select_dtypes on the eland flights frame matches pandas."""

    def _check_select_dtypes(self, **selection):
        # Apply the same dtype selection to both frames and compare the
        # first 103 rows of each result.
        ed_frame = self.ed_flights()
        pd_frame = self.pd_flights()

        ed_subset = ed_frame.select_dtypes(**selection)
        pd_subset = pd_frame.select_dtypes(**selection)

        assert_pandas_eland_frame_equal(pd_subset.head(103), ed_subset.head(103))

    def test_select_dtypes1(self):
        # Keep only the numeric columns
        self._check_select_dtypes(include=[np.number])

    def test_select_dtypes2(self):
        # Drop the numeric columns
        self._check_select_dtypes(exclude=[np.number])

File diff suppressed because one or more lines are too long

View File

@ -2,67 +2,18 @@
from eland.tests.common import TestData from eland.tests.common import TestData
from pandas.util.testing import assert_series_equal from matplotlib.testing.decorators import check_figures_equal
import numpy as np @check_figures_equal(extensions=['png'])
import pandas as pd def test_plot(fig_test, fig_ref):
test_data = TestData()
class TestDataFrameHist(TestData):
def test_dataframe_hist1(self):
test_data = TestData()
pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
"""
pd_flights.hist(figsize=[10, 10])
#ed_flights.hist(figsize=[10, 10])
pd_min = pd_flights['DistanceKilometers'].min()
pd_max = pd_flights['DistanceKilometers'].max()
#ed_min = ed_flights['DistanceKilometers'].min()
#ed_max = ed_flights['DistanceKilometers'].max()
#num_bins = 10.0
#bins = np.linspace(ed_min, ed_max, num=num_bins+1)
#print(bins)
#print(np.diff(bins).mean())
#hist = ed_flights['DistanceKilometers'].hist(np.diff(bins).mean())
x = [2956., 768., 719., 2662., 2934., 1320., 641., 529., 426., 104.]
bins = [0., 1988.14823146, 3976.29646292, 5964.44469437, 7952.59292583, 9940.74115729, 11928.88938875, 13917.03762021, 15905.18585166,17893.33408312,19881.48231458]
print(len(x))
print(len(bins))
a = bins[0:10]
print(np.histogram(a, weights=x, bins=bins))
#counts, bins = np.histogram(data)
#plt.hist(bins[:-1], bins, weights=counts)
"""
h1 = np.histogram(pd_flights['DistanceKilometers'], 10)
h2 = np.histogram(pd_flights['FlightDelayMin'], 10)
l1 = list(h1[0])
l2 = list(h2[0])
l1.append(0)
l2.append(0)
d = {'DistanceKilometers': h1[1],
'FlightDelayMin': h2[1]}
df = pd.DataFrame(data=d)
df.hist(weights=[l1, l2])
pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
pd_ax = fig_ref.subplots()
pd_flights.hist(ax=pd_ax)
ed_ax = fig_test.subplots()
ed_flights.hist(ax=ed_ax)

View File

@ -13,15 +13,15 @@ class TestQueryCopy(TestData):
q.exists('field_a') q.exists('field_a')
q.exists('field_b', must=False) q.exists('field_b', must=False)
print(q.to_query()) print(q.to_search_body())
q1 = Query(q) q1 = Query(q)
q.exists('field_c', must=False) q.exists('field_c', must=False)
q1.exists('field_c1', must=False) q1.exists('field_c1', must=False)
print(q.to_query()) print(q.to_search_body())
print(q1.to_query()) print(q1.to_search_body())