mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Merge pull request #18 from stevedodson/master
Adding DataFrame.hist tests and DataFrame.select_dtypes
This commit is contained in:
commit
d34a8365eb
@ -1,9 +1,9 @@
|
|||||||
import sys
|
import sys
|
||||||
import warnings
|
import warnings
|
||||||
|
from distutils.version import LooseVersion
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from distutils.version import LooseVersion
|
|
||||||
from pandas.compat import StringIO
|
from pandas.compat import StringIO
|
||||||
from pandas.core.common import apply_if_callable, is_bool_indexer
|
from pandas.core.common import apply_if_callable, is_bool_indexer
|
||||||
from pandas.io.common import _expand_user, _stringify_path
|
from pandas.io.common import _expand_user, _stringify_path
|
||||||
@ -11,10 +11,10 @@ from pandas.io.formats import console
|
|||||||
from pandas.io.formats import format as fmt
|
from pandas.io.formats import format as fmt
|
||||||
from pandas.io.formats.printing import pprint_thing
|
from pandas.io.formats.printing import pprint_thing
|
||||||
|
|
||||||
|
import eland.plotting as gfx
|
||||||
from eland import NDFrame
|
from eland import NDFrame
|
||||||
from eland import Series
|
from eland import Series
|
||||||
|
|
||||||
import eland.plotting as gfx
|
|
||||||
|
|
||||||
class DataFrame(NDFrame):
|
class DataFrame(NDFrame):
|
||||||
# This is effectively 2 constructors
|
# This is effectively 2 constructors
|
||||||
@ -138,6 +138,9 @@ class DataFrame(NDFrame):
|
|||||||
return buf.getvalue()
|
return buf.getvalue()
|
||||||
|
|
||||||
def _index_summary(self):
|
def _index_summary(self):
|
||||||
|
# Print index summary e.g.
|
||||||
|
# Index: 103 entries, 0 to 102
|
||||||
|
# Do this by getting head and tail of dataframe
|
||||||
head = self.head(1)._to_pandas().index[0]
|
head = self.head(1)._to_pandas().index[0]
|
||||||
tail = self.tail(1)._to_pandas().index[0]
|
tail = self.tail(1)._to_pandas().index[0]
|
||||||
index_summary = ', %s to %s' % (pprint_thing(head),
|
index_summary = ', %s to %s' % (pprint_thing(head),
|
||||||
@ -286,11 +289,11 @@ class DataFrame(NDFrame):
|
|||||||
_buf = StringIO()
|
_buf = StringIO()
|
||||||
|
|
||||||
df.to_html(buf=_buf, columns=columns, col_space=col_space, header=header,
|
df.to_html(buf=_buf, columns=columns, col_space=col_space, header=header,
|
||||||
index=index, na_rep=na_rep, formatters=formatters, float_format=float_format,
|
index=index, na_rep=na_rep, formatters=formatters, float_format=float_format,
|
||||||
sparsify=sparsify, index_names=index_names, justify=justify, max_rows=max_rows,
|
sparsify=sparsify, index_names=index_names, justify=justify, max_rows=max_rows,
|
||||||
max_cols=max_cols, show_dimensions=False, decimal=decimal,
|
max_cols=max_cols, show_dimensions=False, decimal=decimal,
|
||||||
bold_rows=bold_rows, classes=classes, escape=escape, notebook=notebook,
|
bold_rows=bold_rows, classes=classes, escape=escape, notebook=notebook,
|
||||||
border=border, table_id=table_id, render_links=render_links)
|
border=border, table_id=table_id, render_links=render_links)
|
||||||
|
|
||||||
# Our fake dataframe has incorrect number of rows (max_rows*2+1) - write out
|
# Our fake dataframe has incorrect number of rows (max_rows*2+1) - write out
|
||||||
# the correct number of rows
|
# the correct number of rows
|
||||||
@ -439,6 +442,14 @@ class DataFrame(NDFrame):
|
|||||||
query_compiler=self._query_compiler.squeeze(axis)
|
query_compiler=self._query_compiler.squeeze(axis)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def select_dtypes(self, include=None, exclude=None):
|
||||||
|
# get empty df
|
||||||
|
empty_df = self._empty_pd_df()
|
||||||
|
|
||||||
|
empty_df = empty_df.select_dtypes(include=include, exclude=exclude)
|
||||||
|
|
||||||
|
return self._getitem_array(empty_df.columns)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def shape(self):
|
def shape(self):
|
||||||
"""
|
"""
|
||||||
|
@ -315,7 +315,7 @@ class ElandQueryCompiler(BaseQueryCompiler):
|
|||||||
if numeric:
|
if numeric:
|
||||||
raise NotImplementedError("Not implemented yet...")
|
raise NotImplementedError("Not implemented yet...")
|
||||||
|
|
||||||
result._operations.set_columns(key)
|
result._operations.set_columns(list(key))
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
23
eland/tests/dataframe/test_aggs_pytest.py
Normal file
23
eland/tests/dataframe/test_aggs_pytest.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
# File called _pytest for PyCharm compatability
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from eland.tests.common import TestData
|
||||||
|
|
||||||
|
|
||||||
|
class TestDataFrameAggs(TestData):
|
||||||
|
|
||||||
|
def test_to_aggs1(self):
|
||||||
|
pd_flights = self.pd_flights()
|
||||||
|
ed_flights = self.ed_flights()
|
||||||
|
|
||||||
|
pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(['sum', 'min'])
|
||||||
|
print(type(pd_sum_min))
|
||||||
|
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
|
||||||
|
print(pd_sum_min)
|
||||||
|
|
||||||
|
ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(['sum', 'min'])
|
||||||
|
print(type(ed_sum_min))
|
||||||
|
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
|
||||||
|
print(ed_sum_min)
|
@ -36,6 +36,7 @@ class TestDataFrameDescribe(TestData):
|
|||||||
ed_flights = self.ed_flights().head()
|
ed_flights = self.ed_flights().head()
|
||||||
|
|
||||||
pd_describe = pd_flights.describe()
|
pd_describe = pd_flights.describe()
|
||||||
|
# This fails as we can not run 'describe' on a truncate ed dataframe
|
||||||
ed_describe = ed_flights.describe()
|
ed_describe = ed_flights.describe()
|
||||||
|
|
||||||
print(pd_describe)
|
print(pd_describe)
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
# File called _pytest for PyCharm compatability
|
# File called _pytest for PyCharm compatability
|
||||||
|
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pandas.util.testing import assert_almost_equal
|
from pandas.util.testing import assert_almost_equal
|
||||||
@ -9,7 +10,7 @@ from eland.tests.common import TestData
|
|||||||
|
|
||||||
class TestDataFrameHist(TestData):
|
class TestDataFrameHist(TestData):
|
||||||
|
|
||||||
def test_to_hist1(self):
|
def test_hist1(self):
|
||||||
pd_flights = self.pd_flights()
|
pd_flights = self.pd_flights()
|
||||||
ed_flights = self.ed_flights()
|
ed_flights = self.ed_flights()
|
||||||
|
|
||||||
|
@ -22,4 +22,6 @@ class TestDataFrameInfo(TestData):
|
|||||||
|
|
||||||
assert pd_buf_lines[1:] == ed_buf_lines[1:]
|
assert pd_buf_lines[1:] == ed_buf_lines[1:]
|
||||||
|
|
||||||
|
# NOTE: info does not work on truncated data frames (e.g. head/tail) TODO
|
||||||
|
|
||||||
print(self.ed_ecommerce().info())
|
print(self.ed_ecommerce().info())
|
||||||
|
31
eland/tests/dataframe/test_select_dtypes_pytest.py
Normal file
31
eland/tests/dataframe/test_select_dtypes_pytest.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
# File called _pytest for PyCharm compatability
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from eland.tests.common import TestData
|
||||||
|
from eland.tests.common import (
|
||||||
|
assert_pandas_eland_frame_equal
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class TestDataFrameSelectDTypes(TestData):
|
||||||
|
|
||||||
|
def test_select_dtypes1(self):
|
||||||
|
ed_flights = self.ed_flights()
|
||||||
|
pd_flights = self.pd_flights()
|
||||||
|
|
||||||
|
ed_flights_numeric = ed_flights.select_dtypes(include=[np.number])
|
||||||
|
pd_flights_numeric = pd_flights.select_dtypes(include=[np.number])
|
||||||
|
|
||||||
|
assert_pandas_eland_frame_equal(pd_flights_numeric.head(103), ed_flights_numeric.head(103))
|
||||||
|
|
||||||
|
def test_select_dtypes2(self):
|
||||||
|
ed_flights = self.ed_flights()
|
||||||
|
pd_flights = self.pd_flights()
|
||||||
|
|
||||||
|
ed_flights_non_numeric = ed_flights.select_dtypes(exclude=[np.number])
|
||||||
|
pd_flights_non_numeric = pd_flights.select_dtypes(exclude=[np.number])
|
||||||
|
|
||||||
|
assert_pandas_eland_frame_equal(pd_flights_non_numeric.head(103), ed_flights_non_numeric.head(103))
|
||||||
|
|
File diff suppressed because one or more lines are too long
@ -2,67 +2,18 @@
|
|||||||
|
|
||||||
from eland.tests.common import TestData
|
from eland.tests.common import TestData
|
||||||
|
|
||||||
from pandas.util.testing import assert_series_equal
|
from matplotlib.testing.decorators import check_figures_equal
|
||||||
|
|
||||||
import numpy as np
|
@check_figures_equal(extensions=['png'])
|
||||||
import pandas as pd
|
def test_plot(fig_test, fig_ref):
|
||||||
|
test_data = TestData()
|
||||||
class TestDataFrameHist(TestData):
|
|
||||||
|
|
||||||
def test_dataframe_hist1(self):
|
|
||||||
test_data = TestData()
|
|
||||||
|
|
||||||
pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
|
|
||||||
ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
|
|
||||||
|
|
||||||
"""
|
|
||||||
pd_flights.hist(figsize=[10, 10])
|
|
||||||
#ed_flights.hist(figsize=[10, 10])
|
|
||||||
|
|
||||||
pd_min = pd_flights['DistanceKilometers'].min()
|
|
||||||
pd_max = pd_flights['DistanceKilometers'].max()
|
|
||||||
|
|
||||||
#ed_min = ed_flights['DistanceKilometers'].min()
|
|
||||||
#ed_max = ed_flights['DistanceKilometers'].max()
|
|
||||||
|
|
||||||
#num_bins = 10.0
|
|
||||||
|
|
||||||
#bins = np.linspace(ed_min, ed_max, num=num_bins+1)
|
|
||||||
|
|
||||||
#print(bins)
|
|
||||||
|
|
||||||
#print(np.diff(bins).mean())
|
|
||||||
|
|
||||||
#hist = ed_flights['DistanceKilometers'].hist(np.diff(bins).mean())
|
|
||||||
|
|
||||||
|
|
||||||
x = [2956., 768., 719., 2662., 2934., 1320., 641., 529., 426., 104.]
|
|
||||||
bins = [0., 1988.14823146, 3976.29646292, 5964.44469437, 7952.59292583, 9940.74115729, 11928.88938875, 13917.03762021, 15905.18585166,17893.33408312,19881.48231458]
|
|
||||||
|
|
||||||
print(len(x))
|
|
||||||
print(len(bins))
|
|
||||||
|
|
||||||
a = bins[0:10]
|
|
||||||
|
|
||||||
print(np.histogram(a, weights=x, bins=bins))
|
|
||||||
#counts, bins = np.histogram(data)
|
|
||||||
#plt.hist(bins[:-1], bins, weights=counts)
|
|
||||||
"""
|
|
||||||
|
|
||||||
h1 = np.histogram(pd_flights['DistanceKilometers'], 10)
|
|
||||||
h2 = np.histogram(pd_flights['FlightDelayMin'], 10)
|
|
||||||
l1 = list(h1[0])
|
|
||||||
l2 = list(h2[0])
|
|
||||||
l1.append(0)
|
|
||||||
l2.append(0)
|
|
||||||
|
|
||||||
d = {'DistanceKilometers': h1[1],
|
|
||||||
'FlightDelayMin': h2[1]}
|
|
||||||
|
|
||||||
df = pd.DataFrame(data=d)
|
|
||||||
|
|
||||||
df.hist(weights=[l1, l2])
|
|
||||||
|
|
||||||
|
pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
|
||||||
|
ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
|
||||||
|
|
||||||
|
pd_ax = fig_ref.subplots()
|
||||||
|
pd_flights.hist(ax=pd_ax)
|
||||||
|
|
||||||
|
ed_ax = fig_test.subplots()
|
||||||
|
ed_flights.hist(ax=ed_ax)
|
||||||
|
|
||||||
|
@ -13,15 +13,15 @@ class TestQueryCopy(TestData):
|
|||||||
q.exists('field_a')
|
q.exists('field_a')
|
||||||
q.exists('field_b', must=False)
|
q.exists('field_b', must=False)
|
||||||
|
|
||||||
print(q.to_query())
|
print(q.to_search_body())
|
||||||
|
|
||||||
q1 = Query(q)
|
q1 = Query(q)
|
||||||
|
|
||||||
q.exists('field_c', must=False)
|
q.exists('field_c', must=False)
|
||||||
q1.exists('field_c1', must=False)
|
q1.exists('field_c1', must=False)
|
||||||
|
|
||||||
print(q.to_query())
|
print(q.to_search_body())
|
||||||
print(q1.to_query())
|
print(q1.to_search_body())
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user