mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Demo day notebook + minor updates added
This commit is contained in:
parent
ef289bfe78
commit
337bef1c5d
@ -227,6 +227,9 @@ class NDFrame(BasePandasDataset):
|
||||
raise NotImplementedError("Only sum of numeric fields is implemented")
|
||||
return self._query_compiler.max()
|
||||
|
||||
def nunique(self):
|
||||
return self._query_compiler.nunique()
|
||||
|
||||
def _hist(self, num_bins):
|
||||
return self._query_compiler._hist(num_bins)
|
||||
|
||||
|
@ -185,12 +185,12 @@ class Operations:
|
||||
raise NotImplementedError("Can not count field matches if size is set {}".format(size))
|
||||
|
||||
columns = self.get_columns()
|
||||
|
||||
numeric_source_fields = query_compiler._mappings.numeric_source_fields(columns)
|
||||
if columns is None:
|
||||
columns = query_compiler._mappings.source_fields()
|
||||
|
||||
body = Query(query_params['query'])
|
||||
|
||||
for field in numeric_source_fields:
|
||||
for field in columns:
|
||||
body.metric_aggs(field, func, field)
|
||||
|
||||
response = query_compiler._client.search(
|
||||
@ -200,10 +200,10 @@ class Operations:
|
||||
|
||||
results = {}
|
||||
|
||||
for field in numeric_source_fields:
|
||||
for field in columns:
|
||||
results[field] = response['aggregations'][field]['value']
|
||||
|
||||
s = pd.Series(data=results, index=numeric_source_fields)
|
||||
s = pd.Series(data=results, index=columns)
|
||||
|
||||
return s
|
||||
|
||||
|
@ -33,6 +33,9 @@ class BooleanFilter(object):
|
||||
return True
|
||||
return False
|
||||
|
||||
def __repr__(self):
|
||||
return str(self._filter)
|
||||
|
||||
@property
|
||||
def subtree(self):
|
||||
if 'bool' in self._filter:
|
||||
|
@ -63,7 +63,7 @@ def ed_hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
|
||||
# pandas / plotting / _core.py: 2410
|
||||
# ax.hist(data[col].dropna().values, bins=bins, **kwds)
|
||||
|
||||
ax.hist(ed_df_bins[col][:-1], bins=ed_df_bins[col], weights=ed_df_weights[col])
|
||||
ax.hist(ed_df_bins[col][:-1], bins=ed_df_bins[col], weights=ed_df_weights[col], **kwds)
|
||||
ax.set_title(col)
|
||||
ax.grid(grid)
|
||||
|
||||
|
@ -471,4 +471,6 @@ class ElandQueryCompiler(BaseQueryCompiler):
|
||||
|
||||
return result
|
||||
|
||||
#def isna(self):
|
||||
|
||||
|
||||
|
@ -20,7 +20,7 @@ import warnings
|
||||
import pandas as pd
|
||||
|
||||
from eland import NDFrame
|
||||
from eland.operators import Equal, Greater, ScriptFilter
|
||||
from eland.operators import NotFilter, Equal, Greater, Less, GreaterEqual, LessEqual, ScriptFilter
|
||||
|
||||
|
||||
class Series(NDFrame):
|
||||
@ -164,6 +164,37 @@ class Series(NDFrame):
|
||||
else:
|
||||
raise NotImplementedError(other, type(other))
|
||||
|
||||
def __lt__(self, other):
|
||||
if isinstance(other, Series):
|
||||
# Need to use scripted query to compare to values
|
||||
painless = "doc['{}'].value < doc['{}'].value".format(self.name, other.name)
|
||||
return ScriptFilter(painless)
|
||||
elif isinstance(other, (int, float)):
|
||||
return Less(field=self.name, value=other)
|
||||
else:
|
||||
raise NotImplementedError(other, type(other))
|
||||
|
||||
def __ge__(self, other):
|
||||
if isinstance(other, Series):
|
||||
# Need to use scripted query to compare to values
|
||||
painless = "doc['{}'].value >= doc['{}'].value".format(self.name, other.name)
|
||||
return ScriptFilter(painless)
|
||||
elif isinstance(other, (int, float)):
|
||||
return GreaterEqual(field=self.name, value=other)
|
||||
else:
|
||||
raise NotImplementedError(other, type(other))
|
||||
|
||||
def __le__(self, other):
|
||||
if isinstance(other, Series):
|
||||
# Need to use scripted query to compare to values
|
||||
painless = "doc['{}'].value <= doc['{}'].value".format(self.name, other.name)
|
||||
return ScriptFilter(painless)
|
||||
elif isinstance(other, (int, float)):
|
||||
return LessEqual(field=self.name, value=other)
|
||||
else:
|
||||
raise NotImplementedError(other, type(other))
|
||||
|
||||
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, Series):
|
||||
# Need to use scripted query to compare to values
|
||||
@ -171,5 +202,19 @@ class Series(NDFrame):
|
||||
return ScriptFilter(painless)
|
||||
elif isinstance(other, (int, float)):
|
||||
return Equal(field=self.name, value=other)
|
||||
elif isinstance(other, str):
|
||||
return Equal(field=self.name, value=other)
|
||||
else:
|
||||
raise NotImplementedError(other, type(other))
|
||||
|
||||
def __ne__(self, other):
|
||||
if isinstance(other, Series):
|
||||
# Need to use scripted query to compare to values
|
||||
painless = "doc['{}'].value != doc['{}'].value".format(self.name, other.name)
|
||||
return ScriptFilter(painless)
|
||||
elif isinstance(other, (int, float)):
|
||||
return NotFilter(Equal(field=self.name, value=other))
|
||||
elif isinstance(other, str):
|
||||
return NotFilter(Equal(field=self.name, value=other))
|
||||
else:
|
||||
raise NotImplementedError(other, type(other))
|
||||
|
23
eland/tests/DEMO.md
Normal file
23
eland/tests/DEMO.md
Normal file
@ -0,0 +1,23 @@
|
||||
https://docs.google.com/presentation/d/1A3S5aIJC8SuEbi80PhEzyxTUNMjWJ7-_Om92yU9p3yo/edit#slide=id.g5f8a4bcb09_0_3
|
||||
https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python
|
||||
https://nbviewer.jupyter.org/github/parente/nbestimate/blob/master/estimate.ipynb
|
||||
https://stackoverflow.blog/2017/09/14/python-growing-quickly/
|
||||
https://github.com/elastic/eland
|
||||
http://localhost:8889/notebooks/eland/tests/demo_day_20190815.ipynb
|
||||
http://localhost:5601/app/kibana#/dev_tools/console?_g=()
|
||||
|
||||
|
||||
devtool console:
|
||||
```
|
||||
GET _cat/indices
|
||||
|
||||
# Clean demo
|
||||
DELETE ed_jetbeats_routes
|
||||
|
||||
# Demo day schema
|
||||
GET flights
|
||||
GET flights/_search
|
||||
|
||||
GET ed_jetbeats_routes
|
||||
GET ed_jetbeats_routes/_search
|
||||
```
|
@ -13,13 +13,10 @@ import numpy as np
|
||||
class TestDataFrameNUnique(TestData):
|
||||
|
||||
def test_nunique1(self):
|
||||
ed_ecommerce = self.ed_ecommerce()
|
||||
pd_ecommerce = self.pd_ecommerce()
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
|
||||
print(pd_ecommerce.dtypes)
|
||||
print(ed_ecommerce.dtypes)
|
||||
#ed_nunique = ed_ecommerce.nunique()
|
||||
pd_selection = pd_ecommerce.drop(columns=['category'])
|
||||
pd_nunique = pd_selection.nunique(axis=1)
|
||||
print(pd_flights.dtypes)
|
||||
print(ed_flights.dtypes)
|
||||
print(ed_flights.nunique())
|
||||
|
||||
print(pd_nunique, type(pd_nunique))
|
||||
|
@ -39,10 +39,8 @@ class TestDataFrameQuery(TestData):
|
||||
assert_pandas_eland_frame_equal(pd_q2, ed_q2)
|
||||
assert_pandas_eland_frame_equal(pd_q3, ed_q3)
|
||||
|
||||
def test_query2(self):
|
||||
ed_flights = self.ed_flights()
|
||||
pd_flights = self.pd_flights()
|
||||
pd_q4 = pd_df[(pd_df.A > 2) & (pd_df.B > 3)]
|
||||
ed_q4 = ed_df[(ed_df.A > 2) & (ed_df.B > 3)]
|
||||
|
||||
cancelled = pd_flights[pd_flights.Cancelled == True]
|
||||
assert_pandas_eland_frame_equal(pd_q4, ed_q4)
|
||||
|
||||
print(cancelled.groupby(['DestWeather']).count())
|
||||
|
7159
eland/tests/demo_day_20190815.ipynb
Normal file
7159
eland/tests/demo_day_20190815.ipynb
Normal file
File diff suppressed because one or more lines are too long
Loading…
x
Reference in New Issue
Block a user