Merge pull request #18 from stevedodson/master

Adding DataFrame.hist tests and DataFrame.select_dtypes
This commit is contained in:
stevedodson 2019-08-01 12:56:39 +00:00 committed by GitHub
commit d34a8365eb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 145 additions and 72 deletions

View File

@ -1,9 +1,9 @@
import sys
import warnings
from distutils.version import LooseVersion
import numpy as np
import pandas as pd
from distutils.version import LooseVersion
from pandas.compat import StringIO
from pandas.core.common import apply_if_callable, is_bool_indexer
from pandas.io.common import _expand_user, _stringify_path
@ -11,10 +11,10 @@ from pandas.io.formats import console
from pandas.io.formats import format as fmt
from pandas.io.formats.printing import pprint_thing
import eland.plotting as gfx
from eland import NDFrame
from eland import Series
import eland.plotting as gfx
class DataFrame(NDFrame):
# This is effectively 2 constructors
@ -138,6 +138,9 @@ class DataFrame(NDFrame):
return buf.getvalue()
def _index_summary(self):
# Print index summary e.g.
# Index: 103 entries, 0 to 102
# Do this by getting head and tail of dataframe
head = self.head(1)._to_pandas().index[0]
tail = self.tail(1)._to_pandas().index[0]
index_summary = ', %s to %s' % (pprint_thing(head),
@ -286,11 +289,11 @@ class DataFrame(NDFrame):
_buf = StringIO()
df.to_html(buf=_buf, columns=columns, col_space=col_space, header=header,
index=index, na_rep=na_rep, formatters=formatters, float_format=float_format,
sparsify=sparsify, index_names=index_names, justify=justify, max_rows=max_rows,
max_cols=max_cols, show_dimensions=False, decimal=decimal,
bold_rows=bold_rows, classes=classes, escape=escape, notebook=notebook,
border=border, table_id=table_id, render_links=render_links)
index=index, na_rep=na_rep, formatters=formatters, float_format=float_format,
sparsify=sparsify, index_names=index_names, justify=justify, max_rows=max_rows,
max_cols=max_cols, show_dimensions=False, decimal=decimal,
bold_rows=bold_rows, classes=classes, escape=escape, notebook=notebook,
border=border, table_id=table_id, render_links=render_links)
# Our fake dataframe has incorrect number of rows (max_rows*2+1) - write out
# the correct number of rows
@ -439,6 +442,14 @@ class DataFrame(NDFrame):
query_compiler=self._query_compiler.squeeze(axis)
)
def select_dtypes(self, include=None, exclude=None):
    """
    Return the subset of this frame's columns selected by dtype.

    The dtype filtering itself is delegated to
    ``select_dtypes`` on the object returned by ``self._empty_pd_df()``;
    the column labels that survive are then used to index this frame
    through ``self._getitem_array``.

    Parameters
    ----------
    include, exclude : optional
        Forwarded unchanged to the underlying ``select_dtypes`` call.

    Returns
    -------
    Whatever ``self._getitem_array`` returns for the selected columns.
    """
    # Only the column labels of the filtered frame are needed, so the
    # intermediate frame is not kept around.
    selected_columns = self._empty_pd_df().select_dtypes(
        include=include, exclude=exclude
    ).columns
    return self._getitem_array(selected_columns)
@property
def shape(self):
"""

View File

@ -315,7 +315,7 @@ class ElandQueryCompiler(BaseQueryCompiler):
if numeric:
raise NotImplementedError("Not implemented yet...")
result._operations.set_columns(key)
result._operations.set_columns(list(key))
return result

View File

@ -0,0 +1,23 @@
# File called _pytest for PyCharm compatibility
import numpy as np
import pandas as pd
from eland.tests.common import TestData
class TestDataFrameAggs(TestData):
    """Print 'sum'/'min' aggregations of the numeric flights columns."""

    def test_to_aggs1(self):
        """Aggregate with pandas, then with eland, printing type and value of each.

        No assertions are made; the results are only printed.
        """
        numeric_dtypes = [np.number]
        agg_funcs = ['sum', 'min']
        # Lift the row/column display limits so the full result is printed
        show_everything = ('display.max_rows', None, 'display.max_columns', None)

        pandas_frame = self.pd_flights()
        eland_frame = self.ed_flights()

        pandas_result = pandas_frame.select_dtypes(include=numeric_dtypes).agg(agg_funcs)
        print(type(pandas_result))
        with pd.option_context(*show_everything):
            print(pandas_result)

        eland_result = eland_frame.select_dtypes(include=numeric_dtypes).agg(agg_funcs)
        print(type(eland_result))
        with pd.option_context(*show_everything):
            print(eland_result)

View File

@ -36,6 +36,7 @@ class TestDataFrameDescribe(TestData):
ed_flights = self.ed_flights().head()
pd_describe = pd_flights.describe()
# This fails as we cannot run 'describe' on a truncated ed dataframe
ed_describe = ed_flights.describe()
print(pd_describe)

View File

@ -1,5 +1,6 @@
# File called _pytest for PyCharm compatibility
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.util.testing import assert_almost_equal
@ -9,7 +10,7 @@ from eland.tests.common import TestData
class TestDataFrameHist(TestData):
def test_to_hist1(self):
def test_hist1(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()

View File

@ -22,4 +22,6 @@ class TestDataFrameInfo(TestData):
assert pd_buf_lines[1:] == ed_buf_lines[1:]
# NOTE: info does not work on truncated data frames (e.g. head/tail) TODO
print(self.ed_ecommerce().info())

View File

@ -0,0 +1,31 @@
# File called _pytest for PyCharm compatibility
import pandas as pd
import numpy as np
from eland.tests.common import TestData
from eland.tests.common import (
assert_pandas_eland_frame_equal
)
class TestDataFrameSelectDTypes(TestData):
    """Check ``select_dtypes`` on the eland flights frame against pandas."""

    def test_select_dtypes1(self):
        """Compare the frames obtained with ``include=[np.number]``."""
        eland_frame = self.ed_flights()
        pandas_frame = self.pd_flights()

        wanted = [np.number]
        eland_numeric = eland_frame.select_dtypes(include=wanted)
        pandas_numeric = pandas_frame.select_dtypes(include=wanted)

        # Only the first 103 rows of each frame are compared
        expected = pandas_numeric.head(103)
        actual = eland_numeric.head(103)
        assert_pandas_eland_frame_equal(expected, actual)

    def test_select_dtypes2(self):
        """Compare the frames obtained with ``exclude=[np.number]``."""
        eland_frame = self.ed_flights()
        pandas_frame = self.pd_flights()

        unwanted = [np.number]
        eland_non_numeric = eland_frame.select_dtypes(exclude=unwanted)
        pandas_non_numeric = pandas_frame.select_dtypes(exclude=unwanted)

        # Only the first 103 rows of each frame are compared
        expected = pandas_non_numeric.head(103)
        actual = eland_non_numeric.head(103)
        assert_pandas_eland_frame_equal(expected, actual)

File diff suppressed because one or more lines are too long

View File

@ -2,67 +2,18 @@
from eland.tests.common import TestData
from pandas.util.testing import assert_series_equal
from matplotlib.testing.decorators import check_figures_equal
import numpy as np
import pandas as pd
class TestDataFrameHist(TestData):
def test_dataframe_hist1(self):
test_data = TestData()
pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
"""
pd_flights.hist(figsize=[10, 10])
#ed_flights.hist(figsize=[10, 10])
pd_min = pd_flights['DistanceKilometers'].min()
pd_max = pd_flights['DistanceKilometers'].max()
#ed_min = ed_flights['DistanceKilometers'].min()
#ed_max = ed_flights['DistanceKilometers'].max()
#num_bins = 10.0
#bins = np.linspace(ed_min, ed_max, num=num_bins+1)
#print(bins)
#print(np.diff(bins).mean())
#hist = ed_flights['DistanceKilometers'].hist(np.diff(bins).mean())
x = [2956., 768., 719., 2662., 2934., 1320., 641., 529., 426., 104.]
bins = [0., 1988.14823146, 3976.29646292, 5964.44469437, 7952.59292583, 9940.74115729, 11928.88938875, 13917.03762021, 15905.18585166,17893.33408312,19881.48231458]
print(len(x))
print(len(bins))
a = bins[0:10]
print(np.histogram(a, weights=x, bins=bins))
#counts, bins = np.histogram(data)
#plt.hist(bins[:-1], bins, weights=counts)
"""
h1 = np.histogram(pd_flights['DistanceKilometers'], 10)
h2 = np.histogram(pd_flights['FlightDelayMin'], 10)
l1 = list(h1[0])
l2 = list(h2[0])
l1.append(0)
l2.append(0)
d = {'DistanceKilometers': h1[1],
'FlightDelayMin': h2[1]}
df = pd.DataFrame(data=d)
df.hist(weights=[l1, l2])
@check_figures_equal(extensions=['png'])
def test_plot(fig_test, fig_ref):
test_data = TestData()
pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
pd_ax = fig_ref.subplots()
pd_flights.hist(ax=pd_ax)
ed_ax = fig_test.subplots()
ed_flights.hist(ax=ed_ax)

View File

@ -13,15 +13,15 @@ class TestQueryCopy(TestData):
q.exists('field_a')
q.exists('field_b', must=False)
print(q.to_query())
print(q.to_search_body())
q1 = Query(q)
q.exists('field_c', must=False)
q1.exists('field_c1', must=False)
print(q.to_query())
print(q1.to_query())
print(q.to_search_body())
print(q1.to_search_body())