Mirror of https://github.com/elastic/eland.git
Partial implementation of hist - does not work

Backup push

parent 9bf3505b7e
commit 1fa4d3fbe7
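For context, the feature this diff sketches is a pandas-style hist for eland DataFrames: bucket counts are computed server-side by an Elasticsearch histogram aggregation instead of by downloading every row. A minimal usage sketch, assuming a local cluster with a flights demo index (host and index name are illustrative):

    import eland as ed

    # Connect a DataFrame-like object to an Elasticsearch index
    df = ed.DataFrame('localhost', 'flights')

    # Intended API, mirroring pandas.DataFrame.hist: bucket counts come
    # from a histogram aggregation, so no raw documents are fetched
    df.hist(column='DistanceKilometers', bins=10)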
@@ -10,6 +10,7 @@ from .mappings import *
from .query import *
from .operations import *
from .query_compiler import *
+from .plotting import *
from .ndframe import *
from .series import *
from .dataframe import *
@@ -1,9 +1,9 @@
import warnings
import sys
import warnings

import pandas as pd
import numpy as np

import pandas as pd
from distutils.version import LooseVersion
from pandas.compat import StringIO
from pandas.core.common import apply_if_callable, is_bool_indexer
from pandas.io.common import _expand_user, _stringify_path
@@ -13,7 +13,7 @@ from pandas.io.formats.printing import pprint_thing

from eland import NDFrame
from eland import Series

from eland import hist_frame

class DataFrame(NDFrame):
    # This is effectively 2 constructors
@@ -74,6 +74,46 @@ class DataFrame(NDFrame):

        return buf.getvalue()

    def _info_repr(self):
        """
        True if the repr should show the info view.
        """
        info_repr_option = (pd.get_option("display.large_repr") == "info")
        return info_repr_option and not (self._repr_fits_horizontal_() and
                                         self._repr_fits_vertical_())

    def _repr_html_(self):
        """
        From pandas
        """
        try:
            import IPython
        except ImportError:
            pass
        else:
            if LooseVersion(IPython.__version__) < LooseVersion('3.0'):
                if console.in_qtconsole():
                    # 'HTML output is disabled in QtConsole'
                    return None

        if self._info_repr():
            buf = StringIO("")
            self.info(buf=buf)
            # need to escape the <class>, should be the first line.
            val = buf.getvalue().replace('<', r'&lt;', 1)
            val = val.replace('>', r'&gt;', 1)
            return '<pre>' + val + '</pre>'

        if pd.get_option("display.notebook_repr_html"):
            max_rows = pd.get_option("display.max_rows")
            max_cols = pd.get_option("display.max_columns")
            show_dimensions = pd.get_option("display.show_dimensions")

            return self.to_html(max_rows=max_rows, max_cols=max_cols,
                                show_dimensions=show_dimensions, notebook=True)
        else:
            return None

    def count(self):
        """
        Count non-NA cells for each column (TODO row)
@@ -89,7 +129,6 @@ class DataFrame(NDFrame):
        """
        return self._query_compiler.count()

    def info_es(self):
        buf = StringIO()
@@ -222,6 +261,45 @@ class DataFrame(NDFrame):

        fmt.buffer_put_lines(buf, lines)

    def to_html(self, buf=None, columns=None, col_space=None, header=True,
                index=True, na_rep='NaN', formatters=None, float_format=None,
                sparsify=None, index_names=True, justify=None, max_rows=None,
                max_cols=None, show_dimensions=False, decimal='.',
                bold_rows=True, classes=None, escape=True, notebook=False,
                border=None, table_id=None, render_links=False):
        """
        From pandas - except we set max_rows default to avoid careless extraction of entire index
        """
        if max_rows is None:
            warnings.warn("DataFrame.to_html called without max_rows set "
                          "- this will return entire index results. "
                          "Setting max_rows=60, overwrite if different behaviour is required.")
            max_rows = 60

        # Create a slightly bigger dataframe than display
        df = self._build_repr_df(max_rows + 1, max_cols)

        if buf is not None:
            _buf = _expand_user(_stringify_path(buf))
        else:
            _buf = StringIO()

        df.to_html(buf=_buf, columns=columns, col_space=col_space, header=header,
                   index=index, na_rep=na_rep, formatters=formatters, float_format=float_format,
                   sparsify=sparsify, index_names=index_names, justify=justify, max_rows=max_rows,
                   max_cols=max_cols, show_dimensions=False, decimal=decimal,
                   bold_rows=bold_rows, classes=classes, escape=escape, notebook=notebook,
                   border=border, table_id=table_id, render_links=render_links)

        # Our fake dataframe has incorrect number of rows (max_rows*2+1) - write out
        # the correct number of rows
        if show_dimensions:
            _buf.write("\n<p>{nrows} rows x {ncols} columns</p>"
                       .format(nrows=len(self.index), ncols=len(self.columns)))

        if buf is None:
            result = _buf.getvalue()
            return result

    def to_string(self, buf=None, columns=None, col_space=None, header=True,
                  index=True, na_rep='NaN', formatters=None, float_format=None,
@@ -238,7 +316,7 @@ class DataFrame(NDFrame):
            max_rows = 60

        # Create a slightly bigger dataframe than display
-        df = self._build_repr_df(max_rows+1, max_cols)
+        df = self._build_repr_df(max_rows + 1, max_cols)

        if buf is not None:
            _buf = _expand_user(_stringify_path(buf))
@@ -295,7 +373,6 @@ class DataFrame(NDFrame):
        if key not in self.columns:
            raise KeyError("Requested column is not in the DataFrame {}".format(key))
        s = self._reduce_dimension(self._query_compiler.getitem_column_array([key]))
        s._parent = self
        return s

    def _getitem_array(self, key):
@@ -345,7 +422,7 @@ class DataFrame(NDFrame):
        if not inplace:
            return DataFrame(query_compiler=new_query_compiler)
        else:
-            self._query_compiler=new_query_compiler
+            self._query_compiler = new_query_compiler

    def _reduce_dimension(self, query_compiler):
        return Series(query_compiler=query_compiler)
@@ -353,7 +430,31 @@ class DataFrame(NDFrame):
    def _to_pandas(self):
        return self._query_compiler.to_pandas()

    def _empty_pd_df(self):
        return self._query_compiler._empty_pd_ef()

    def squeeze(self, axis=None):
        return DataFrame(
            query_compiler=self._query_compiler.squeeze(axis)
        )

    @property
    def shape(self):
        """
        Return a tuple representing the dimensionality of the DataFrame.

        Returns
        -------
        shape: tuple
            0 - number of rows
            1 - number of columns
        """
        num_rows = len(self)
        num_columns = len(self.columns)

        return num_rows, num_columns

    def keys(self):
        return self.columns

    hist = hist_frame
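The closing hist = hist_frame works because assigning a plain function to a class attribute makes it behave like a method: df.hist(...) calls hist_frame(df, ...) with the DataFrame as the first argument. A standalone illustration of the pattern (all names are made up):

    def greet(obj, name):
        # 'obj' receives the instance, exactly like 'self'
        return "{} greets {}".format(type(obj).__name__, name)

    class Widget:
        greet = greet  # now an instance method: Widget().greet('x')

    assert Widget().greet('x') == 'Widget greets x'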
@@ -53,7 +53,6 @@ class Index:
    # Make iterable
    def __next__(self):
        # TODO resolve this hack to make this 'iterable'
-        print("In Index.__next__")
        raise StopIteration()

    def __iter__(self):
@@ -403,7 +403,7 @@ class Mappings:

        return is_source_field

-    def numeric_source_fields(self, columns):
+    def numeric_source_fields(self, columns, include_bool=True):
        """
        Returns
        -------
@@ -94,7 +94,6 @@ class NDFrame(BasePandasDataset):
        Returns:
            The value of the attribute.
        """
-        print(key)
        try:
            return object.__getattribute__(self, key)
        except AttributeError as e:
@@ -228,5 +227,26 @@ class NDFrame(BasePandasDataset):
            raise NotImplementedError("Only max of numeric fields is implemented")
        return self._query_compiler.max()

    def _hist(self, interval):
        return self._query_compiler._hist(interval)

    def describe(self):
        return self._query_compiler.describe()

    def get(self, key, default=None):
        """Get item from object for given key (DataFrame column, Panel
        slice, etc.). Returns default value if not found.

        Args:
            key (DataFrame column, Panel slice) : the key for which value
                to get

        Returns:
            value (type of items contained in object) : A value that is
                stored at the key
        """
        if key in self.keys():
            return self.__getitem__(key)
        else:
            return default
@@ -2,6 +2,7 @@ import copy
from enum import Enum

import pandas as pd
+import numpy as np

from eland import Index
from eland import Query
@@ -126,6 +127,12 @@ class Operations:
    def min(self, query_compiler):
        return self._metric_aggs(query_compiler, 'min')

+    def nunique(self, query_compiler):
+        return self._terms_aggs(query_compiler, 'cardinality')

+    def hist(self, query_compiler, bins):
+        return self._hist_aggs(query_compiler, bins)

    def _metric_aggs(self, query_compiler, func):
        query_params, post_processing = self._resolve_tasks()
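For reference, nunique maps to Elasticsearch's cardinality aggregation, whose counts are approximate. The search body that _terms_aggs below would build for a single numeric field looks roughly like this (field name is illustrative):

    body = {
        "query": {"match_all": {}},
        "aggs": {
            "AvgTicketPrice": {"cardinality": {"field": "AvgTicketPrice"}}
        }
    }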
@@ -155,6 +162,73 @@ class Operations:
        # }
        results = {}

        for field in numeric_source_fields:
            results[field] = response['aggregations'][field]['value']

        # Return single value if this is a series
        if len(numeric_source_fields) == 1:
            return np.float64(results[numeric_source_fields[0]])

        s = pd.Series(data=results, index=numeric_source_fields)

        return s
    def _terms_aggs(self, query_compiler, func):
        query_params, post_processing = self._resolve_tasks()

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError("Cannot count field matches if size is set {}".format(size))

        columns = self.get_columns()

        numeric_source_fields = query_compiler._mappings.numeric_source_fields(columns)

        body = Query(query_params['query'])

        for field in numeric_source_fields:
            body.metric_aggs(field, func, field)

        response = query_compiler._client.search(
            index=query_compiler._index_pattern,
            size=0,
            body=body.to_search_body())

        results = {}

        for field in numeric_source_fields:
            results[field] = response['aggregations'][field]['value']

        s = pd.Series(data=results, index=numeric_source_fields)

        return s

    def _hist_aggs(self, query_compiler, bins):
        query_params, post_processing = self._resolve_tasks()

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError("Cannot count field matches if size is set {}".format(size))

        columns = self.get_columns()

        numeric_source_fields = query_compiler._mappings.numeric_source_fields(columns)

        body = Query(query_params['query'])

        min_aggs = self._metric_aggs(query_compiler, 'min')
        max_aggs = self._metric_aggs(query_compiler, 'max')

        for field in numeric_source_fields:
            # hist_aggs takes (name, field, ...); reuse the field name as the agg name
            body.hist_aggs(field, field, min_aggs, max_aggs, bins)

        response = query_compiler._client.search(
            index=query_compiler._index_pattern,
            size=0,
            body=body.to_search_body())

        results = {}

        for field in numeric_source_fields:
            results[field] = response['aggregations'][field]['value']
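As the commit message says, this is partial: a histogram aggregation returns a buckets list rather than a single value, so the extraction loop above cannot work as written. A sketch of the response shape and a plausible extraction (field name and numbers are illustrative):

    # response['aggregations'][field] looks like:
    #   {"buckets": [{"key": 0.0, "doc_count": 2956},
    #                {"key": 1988.1, "doc_count": 768}, ...]}
    buckets = response['aggregations'][field]['buckets']
    bin_edges = [b['key'] for b in buckets]       # left edge of each bucket
    weights = [b['doc_count'] for b in buckets]   # server-side counts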
@@ -181,8 +255,6 @@ class Operations:
            body.metric_aggs('extended_stats_' + field, 'extended_stats', field)
            body.metric_aggs('percentiles_' + field, 'percentiles', field)

-        print(body.to_search_body())
-
        response = query_compiler._client.search(
            index=query_compiler._index_pattern,
            size=0,
@@ -219,15 +291,18 @@ class Operations:
        # Only return requested columns
        columns = self.get_columns()

+        es_results = None
+
        # If size=None use scan not search - then post sort results when in df
        # If size>10000 use scan
        if size is not None and size <= 10000:
-            es_results = query_compiler._client.search(
-                index=query_compiler._index_pattern,
-                size=size,
-                sort=sort_params,
-                body=body.to_search_body(),
-                _source=columns)
+            if size > 0:
+                es_results = query_compiler._client.search(
+                    index=query_compiler._index_pattern,
+                    size=size,
+                    sort=sort_params,
+                    body=body.to_search_body(),
+                    _source=columns)
        else:
            es_results = query_compiler._client.scan(
                index=query_compiler._index_pattern,
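The 10,000 cutoff matches Elasticsearch's default index.max_result_window: a single search cannot page past it, so larger extracts fall back to the scroll-based scan helper. Roughly the equivalent raw elasticsearch-py calls (index name and query are illustrative):

    from elasticsearch import Elasticsearch, helpers

    es = Elasticsearch('localhost')

    # Small result sets: one search request (window defaults to 10,000 docs)
    small = es.search(index='flights', size=500, body={'query': {'match_all': {}}})

    # Larger result sets: scan() pages through results with the scroll API
    large = helpers.scan(es, index='flights', query={'query': {'match_all': {}}})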
eland/plotting.py (new file, 51 lines)
@@ -0,0 +1,51 @@
import numpy as np

import pandas.core.common as com
from pandas.core.dtypes.generic import (
    ABCIndexClass)
from pandas.plotting._core import (
    _raise_if_no_mpl, _converter, grouped_hist, _subplots, _flatten, _set_ticks_props)


def hist_frame(ed_df, column=None, by=None, grid=True, xlabelsize=None,
               xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
               sharey=False, figsize=None, layout=None, bins=10, **kwds):
    """
    Derived from pandas.plotting._core.hist_frame 0.24.2
    """
    # Start with an empty pandas dataframe that has the same columns and
    # dtypes as the eland DataFrame
    empty_pd_df = ed_df._empty_pd_df()

    _raise_if_no_mpl()
    _converter._WARN = False
    if by is not None:
        axes = grouped_hist(empty_pd_df, column=column, by=by, ax=ax, grid=grid,
                            figsize=figsize, sharex=sharex, sharey=sharey,
                            layout=layout, bins=bins, xlabelsize=xlabelsize,
                            xrot=xrot, ylabelsize=ylabelsize,
                            yrot=yrot, **kwds)
        return axes

    if column is not None:
        if not isinstance(column, (list, np.ndarray, ABCIndexClass)):
            column = [column]
        empty_pd_df = empty_pd_df[column]
    data = empty_pd_df._get_numeric_data()
    naxes = len(empty_pd_df.columns)

    fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False,
                          sharex=sharex, sharey=sharey, figsize=figsize,
                          layout=layout)
    _axes = _flatten(axes)

    for i, col in enumerate(com.try_sort(data.columns)):
        ax = _axes[i]
        ax.hist(empty_pd_df[col].dropna().values, bins=bins, **kwds)
        ax.set_title(col)
        ax.grid(grid)

    _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot,
                     ylabelsize=ylabelsize, yrot=yrot)
    fig.subplots_adjust(wspace=0.3, hspace=0.3)

    return axes
@@ -68,6 +68,19 @@ class Query:
            }
        self._aggs[name] = agg

    def hist_aggs(self, name, field, min_aggs, max_aggs, bins):
        """
        Add histogram agg e.g.

        "aggs": {
            "name": {
                "histogram": {
                    "field": "AvgTicketPrice",
                    "interval": (max_aggs[field] - min_aggs[field]) / bins
                }
            }
        }
        """
        # Build the histogram agg described in the docstring: the interval
        # splits [min, max] of the field into 'bins' equal-width buckets
        agg = {
            "histogram": {
                "field": field,
                "interval": (max_aggs[field] - min_aggs[field]) / bins
            }
        }
        self._aggs[name] = agg

    def to_search_body(self):
        body = {"query": self._query, "aggs": self._aggs}
        return body
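For a sense of the intended output, a Query built from a match_all with one hist_aggs call would serialize roughly like this (field name, a min/max of 0.0/1000.0, and 10 bins are illustrative):

    # q = Query({'match_all': {}})
    # q.hist_aggs('AvgTicketPrice', 'AvgTicketPrice', min_aggs, max_aggs, bins=10)
    # q.to_search_body() ->
    # {'query': {'match_all': {}},
    #  'aggs': {'AvgTicketPrice': {'histogram': {'field': 'AvgTicketPrice',
    #                                            'interval': 100.0}}}}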
@@ -152,6 +152,8 @@ class ElandQueryCompiler(BaseQueryCompiler):
        TODO - an option here is to use Elasticsearch's multi-field matching instead of pandas treatment of lists (which isn't great)
        NOTE - using lists like this is generally not a good way to use this API
        """
+        if results is None:
+            return self._empty_pd_ef()

        def flatten_dict(y):
            out = {}
@@ -257,6 +259,13 @@ class ElandQueryCompiler(BaseQueryCompiler):
        """
        return self._operations.index_matches(self, self.index.index_field, items)

+    def _empty_pd_ef(self):
+        # Return an empty dataframe with correct columns and dtypes
+        df = pd.DataFrame()
+        for c, d in zip(self.columns, self.dtypes):
+            df[c] = pd.Series(dtype=d)
+        return df
+
    def copy(self):
        return self.__constructor__(
            client=self._client,
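This empty-but-correctly-typed frame is what lets hist_frame reuse pandas' plotting scaffolding without fetching documents. The idiom in isolation (column names and dtypes are illustrative):

    import pandas as pd

    columns = ['AvgTicketPrice', 'Cancelled']
    dtypes = ['float64', 'bool']

    df = pd.DataFrame()
    for c, d in zip(columns, dtypes):
        df[c] = pd.Series(dtype=d)  # zero rows, but correct dtype per column

    print(df.dtypes)  # AvgTicketPrice: float64, Cancelled: bool
    print(len(df))    # 0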
@@ -348,6 +357,8 @@ class ElandQueryCompiler(BaseQueryCompiler):
        return self._operations.min(self)
    def max(self):
        return self._operations.max(self)
+    def nunique(self):
+        return self._operations.nunique(self)

    def info_es(self, buf):
        buf.write("index_pattern: {index_pattern}\n".format(index_pattern=self._index_pattern))
@@ -358,3 +369,6 @@ class ElandQueryCompiler(BaseQueryCompiler):

    def describe(self):
        return self._operations.describe(self)
+
+    def _hist(self, interval):
+        return self._operations.hist(self, interval)
eland/tests/Eland Demo Notebook.ipynb (new file, 17734 lines; diff too large, suppressed)
eland/tests/anonreviews.csv.gz (new binary file, not shown)
@@ -13,6 +13,9 @@ class TestDataFrameInfo(TestData):
        pd_describe = pd_flights.describe()
        ed_describe = ed_flights.describe()

+        print(pd_describe)
+        print(ed_describe)
+
        # TODO - this fails now as ES aggregations are approximate
        # if ES percentile agg uses
        # "hdr": {
eland/tests/dataframe/test_get_pytest.py (new file, 23 lines)
@@ -0,0 +1,23 @@
# File called _pytest for PyCharm compatibility
import pandas as pd
import eland as ed

from eland.tests.common import TestData
from eland.tests.common import (
    assert_pandas_eland_frame_equal,
    assert_pandas_eland_series_equal
)

import numpy as np

class TestDataFrameGet(TestData):

    def test_get1(self):
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()

        ed_get0 = ed_flights.get('Carrier')
        pd_get0 = pd_flights.get('Carrier')

        print(ed_get0, type(ed_get0))
        print(pd_get0, type(pd_get0))
@@ -80,3 +80,10 @@ class TestDataFrameHeadTail(TestData):
        pd_head_4 = pd_tail_5.head(4)
        assert_pandas_eland_frame_equal(pd_head_4, ed_head_4)

+    def test_head_0(self):
+        ed_flights = self.ed_flights()
+        pd_flights = self.pd_flights()
+
+        ed_head_0 = ed_flights.head(0)
+        pd_head_0 = pd_flights.head(0)
+        assert_pandas_eland_frame_equal(pd_head_0, ed_head_0)
eland/tests/dataframe/test_nunique_pytest.py (new file, 25 lines)
@@ -0,0 +1,25 @@
# File called _pytest for PyCharm compatibility
import pandas as pd
import eland as ed

from eland.tests.common import TestData
from eland.tests.common import (
    assert_pandas_eland_frame_equal,
    assert_pandas_eland_series_equal
)

import numpy as np

class TestDataFrameNUnique(TestData):

    def test_nunique1(self):
        ed_ecommerce = self.ed_ecommerce()
        pd_ecommerce = self.pd_ecommerce()

        print(pd_ecommerce.dtypes)
        print(ed_ecommerce.dtypes)
        #ed_nunique = ed_ecommerce.nunique()
        pd_selection = pd_ecommerce.drop(columns=['category'])
        pd_nunique = pd_selection.nunique(axis=1)

        print(pd_nunique, type(pd_nunique))
eland/tests/dataframe/test_reviews_pytest.py (new file, 17 lines)
@@ -0,0 +1,17 @@
# File called _pytest for PyCharm compatibility

from eland.tests.common import TestData

import eland as ed


class TestDataFrameReviews(TestData):

    def test_explore(self):
        ed_reviews = ed.DataFrame('localhost', 'anonreviews')

        print(ed_reviews.head())
        print(ed_reviews.describe())
        print(ed_reviews.info())
        print(ed_reviews.hist(column="rating", bins=5))
        #print(ed_reviews.head().info_es())
eland/tests/dataframe/test_shape_pytest.py (new file, 26 lines)
@@ -0,0 +1,26 @@
# File called _pytest for PyCharm compatibility

from eland.tests.common import TestData


class TestDataFrameShape(TestData):

    def test_to_shape1(self):
        pd_ecommerce = self.pd_ecommerce()
        ed_ecommerce = self.ed_ecommerce()

        pd_shape = pd_ecommerce.shape
        ed_shape = ed_ecommerce.shape

        assert pd_shape == ed_shape

    def test_to_shape2(self):
        pd_flights = self.pd_flights()
        ed_flights = self.ed_flights()

        pd_shape = pd_flights.shape
        ed_shape = ed_flights.shape

        assert pd_shape == ed_shape
eland/tests/dataframe/test_utils_pytest.py (new file, 44 lines)
@@ -0,0 +1,44 @@
# File called _pytest for PyCharm compatibility

import numpy as np
import pandas as pd

import eland as ed
from eland.tests.common import ELASTICSEARCH_HOST
from eland.tests.common import TestData


class TestDataFrameUtils(TestData):

    def test_generate_es_mappings(self):
        df = pd.DataFrame(data={'A': np.random.rand(3),
                                'B': 1,
                                'C': 'foo',
                                'D': pd.Timestamp('20190102'),
                                'E': [1.0, 2.0, 3.0],
                                'F': False,
                                'G': [1, 2, 3]},
                          index=['0', '1', '2'])

        expected_mappings = {'mappings': {
            'properties': {'A': {'type': 'double'},
                           'B': {'type': 'long'},
                           'C': {'type': 'keyword'},
                           'D': {'type': 'date'},
                           'E': {'type': 'double'},
                           'F': {'type': 'boolean'},
                           'G': {'type': 'long'}}}}

        mappings = ed.Mappings._generate_es_mappings(df)

        assert expected_mappings == mappings

        # Now create index
        index_name = 'eland_test_generate_es_mappings'

        ed.pandas_to_es(df, ELASTICSEARCH_HOST, index_name, if_exists="replace", refresh=True)

        ed_df = ed.DataFrame(ELASTICSEARCH_HOST, index_name)
        ed_df_head = ed_df.head()

        # assert_frame_equal(df, ed_df_head)
eland/tests/pivot_review_data_pandas.ipynb (new file, 557 lines; diff suppressed, long lines)
eland/tests/plotting/test_dataframe_hist_pytest.ipynb (new file, 170 lines; diff suppressed, long lines)
eland/tests/plotting/test_dataframe_hist_pytest.py (new file, 46 lines)
@@ -0,0 +1,46 @@
# File called _pytest for PyCharm compatibility

from eland.tests.common import TestData

from pandas.util.testing import assert_series_equal

import numpy as np

class TestDataFrameHist(TestData):

    def test_dataframe_hist1(self):
        test_data = TestData()

        pd_flights = test_data.pd_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]
        ed_flights = test_data.ed_flights()[['DistanceKilometers', 'DistanceMiles', 'FlightDelayMin', 'FlightTimeHour']]

        pd_flights.hist(figsize=[10, 10])
        ed_flights.hist(figsize=[10, 10])

        pd_min = pd_flights['DistanceKilometers'].min()
        pd_max = pd_flights['DistanceKilometers'].max()

        ed_min = ed_flights['DistanceKilometers'].min()
        ed_max = ed_flights['DistanceKilometers'].max()

        num_bins = 10

        bins = np.linspace(ed_min, ed_max, num=num_bins + 1)

        print(bins)

        print(np.diff(bins).mean())

        hist = ed_flights['DistanceKilometers'].hist(np.diff(bins).mean())

        x = [2956., 768., 719., 2662., 2934., 1320., 641., 529., 426., 104.]
        bins = [0., 1988.14823146, 3976.29646292, 5964.44469437, 7952.59292583,
                9940.74115729, 11928.88938875, 13917.03762021, 15905.18585166,
                17893.33408312, 19881.48231458]

        print(len(x))
        print(len(bins))

        a = bins[0:10]

        print(np.histogram(a, weights=x, bins=bins))
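The closing np.histogram call is the trick the hist implementation needs: feed each bucket's left edge as a sample and its doc_count as that sample's weight, and the counts are reproduced without any raw documents. A compact sketch with made-up numbers:

    import numpy as np

    counts = [3, 7, 2]               # per-bucket doc_counts from the ES agg
    edges = [0.0, 10.0, 20.0, 30.0]  # bin edges, len(counts) + 1

    # One weighted sample per bucket, placed at the bucket's left edge
    hist, _ = np.histogram(edges[:-1], bins=edges, weights=counts)
    assert hist.tolist() == counts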
@@ -46,11 +46,11 @@ def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=
        )
    elif if_exists == "replace":
        client.index_delete(index=destination_index)
-        client.index_create(index=destination_index, mapping=mapping)
+        client.index_create(index=destination_index, body=mapping)
    # elif if_exists == "append":
    #     TODO validate mapping is compatible
    else:
-        client.index_create(index=destination_index, mapping=mapping)
+        client.index_create(index=destination_index, body=mapping)

    # Now add data
    actions = []