mirror of https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00

Adding drop + the ability for operations to have a query

Significant refactor - needs cleanup

This commit is contained in:
parent dc07285aa1
commit d71ce9f50c
@@ -7,6 +7,7 @@ os.environ["MODIN_BACKEND"] = 'pandas'
 from .client import *
 from .index import *
 from .mappings import *
+from .query import *
 from .operations import *
 from .query_compiler import *
 from .ndframe import *
@@ -12,7 +12,16 @@ class Client:
             self._es = es._es
         else:
             self._es = Elasticsearch(es)
 
+    def index_create(self, **kwargs):
+        return self._es.indices.create(**kwargs)
+
+    def index_delete(self, **kwargs):
+        return self._es.indices.delete(**kwargs)
+
+    def index_exists(self, **kwargs):
+        return self._es.indices.exists(**kwargs)
+
     def get_mapping(self, **kwargs):
         return self._es.indices.get_mapping(**kwargs)
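The three new helpers are thin pass-throughs to the `indices` namespace of the underlying elasticsearch-py client. A usage sketch (the connection and index name are hypothetical, and `Client` here is eland's wrapper, not `Elasticsearch` itself):

```python
from eland import Client

client = Client('localhost')                     # wraps Elasticsearch('localhost')
if not client.index_exists(index='test-index'):  # hypothetical index name
    client.index_create(index='test-index')
# ... write/read documents ...
client.index_delete(index='test-index')
```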
@@ -16,7 +16,7 @@ from eland import Series
 
 
 class DataFrame(NDFrame):
-    # TODO create effectively 2 constructors
+    # This is effectively 2 constructors
     # 1. client, index_pattern, columns, index_field
     # 2. query_compiler
     def __init__(self,
@@ -74,6 +74,22 @@ class DataFrame(NDFrame):
 
         return buf.getvalue()
 
+    def count(self):
+        """
+        Count non-NA cells for each column (TODO row)
+
+        Counts are based on exists queries against ES
+
+        This is inefficient, as it creates N queries (N is number of fields).
+
+        An alternative approach is to use value_count aggregations. However, they have issues in that:
+        1. They can only be used with aggregatable fields (e.g. keyword not text)
+        2. For list fields they return multiple counts. E.g. tags=['elastic', 'ml'] returns value_count=2
+           for a single document.
+        """
+        return self._query_compiler.count()
+
     def info_es(self):
         buf = StringIO()
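For reference, the per-field exists query the docstring describes maps onto the standard `_count` API; a sketch with raw elasticsearch-py (the actual body comes from `Query.to_count_body()`, which is not shown in this diff, so this shape is an assumption):

```python
from elasticsearch import Elasticsearch

es = Elasticsearch()  # hypothetical local cluster
# Count documents where the 'tags' field is non-null (exists):
body = {"query": {"bool": {"must": [{"exists": {"field": "tags"}}]}}}
resp = es.count(index="my-index", body=body)  # hypothetical index name
print(resp["count"])
```

Note this is one round trip per column, which is the N-queries cost the docstring calls out.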
@@ -81,6 +97,130 @@ class DataFrame(NDFrame):
 
         return buf.getvalue()
 
+    def _index_summary(self):
+        head = self.head(1)._to_pandas().index[0]
+        tail = self.tail(1)._to_pandas().index[0]
+        index_summary = ', %s to %s' % (pprint_thing(head),
+                                        pprint_thing(tail))
+
+        name = "Index"
+        return '%s: %s entries%s' % (name, len(self), index_summary)
+
+    def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
+             null_counts=None):
+        """
+        Print a concise summary of a DataFrame.
+
+        This method prints information about a DataFrame including
+        the index dtype and column dtypes, non-null values and memory usage.
+
+        This copies a lot of code from pandas.DataFrame.info as it is difficult
+        to split out the appropriate code or creating a SparseDataFrame gives
+        incorrect results on types and counts.
+        """
+        if buf is None:  # pragma: no cover
+            buf = sys.stdout
+
+        lines = []
+
+        lines.append(str(type(self)))
+        lines.append(self._index_summary())
+
+        if len(self.columns) == 0:
+            lines.append('Empty {name}'.format(name=type(self).__name__))
+            fmt.buffer_put_lines(buf, lines)
+            return
+
+        cols = self.columns
+
+        # hack
+        if max_cols is None:
+            max_cols = pd.get_option('display.max_info_columns',
+                                     len(self.columns) + 1)
+
+        max_rows = pd.get_option('display.max_info_rows', len(self) + 1)
+
+        if null_counts is None:
+            show_counts = ((len(self.columns) <= max_cols) and
+                           (len(self) < max_rows))
+        else:
+            show_counts = null_counts
+        exceeds_info_cols = len(self.columns) > max_cols
+
+        # From pandas.DataFrame
+        def _put_str(s, space):
+            return '{s}'.format(s=s)[:space].ljust(space)
+
+        def _verbose_repr():
+            lines.append('Data columns (total %d columns):' %
+                         len(self.columns))
+            space = max(len(pprint_thing(k)) for k in self.columns) + 4
+            counts = None
+
+            tmpl = "{count}{dtype}"
+            if show_counts:
+                counts = self.count()
+                if len(cols) != len(counts):  # pragma: no cover
+                    raise AssertionError(
+                        'Columns must equal counts '
+                        '({cols:d} != {counts:d})'.format(
+                            cols=len(cols), counts=len(counts)))
+                tmpl = "{count} non-null {dtype}"
+
+            dtypes = self.dtypes
+            for i, col in enumerate(self.columns):
+                dtype = dtypes.iloc[i]
+                col = pprint_thing(col)
+
+                count = ""
+                if show_counts:
+                    count = counts.iloc[i]
+
+                lines.append(_put_str(col, space) + tmpl.format(count=count,
+                                                                dtype=dtype))
+
+        def _non_verbose_repr():
+            lines.append(self.columns._summary(name='Columns'))
+
+        def _sizeof_fmt(num, size_qualifier):
+            # returns size in human readable format
+            for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
+                if num < 1024.0:
+                    return ("{num:3.1f}{size_q} "
+                            "{x}".format(num=num, size_q=size_qualifier, x=x))
+                num /= 1024.0
+            return "{num:3.1f}{size_q} {pb}".format(num=num,
+                                                    size_q=size_qualifier,
+                                                    pb='PB')
+
+        if verbose:
+            _verbose_repr()
+        elif verbose is False:  # specifically set to False, not nesc None
+            _non_verbose_repr()
+        else:
+            if exceeds_info_cols:
+                _non_verbose_repr()
+            else:
+                _verbose_repr()
+
+        counts = self.get_dtype_counts()
+        dtypes = ['{k}({kk:d})'.format(k=k[0], kk=k[1]) for k
+                  in sorted(counts.items())]
+        lines.append('dtypes: {types}'.format(types=', '.join(dtypes)))
+
+        if memory_usage is None:
+            memory_usage = pd.get_option('display.memory_usage')
+        if memory_usage:
+            # append memory usage of df to display
+            size_qualifier = ''
+
+            # TODO - this is different from pd.DataFrame as we shouldn't
+            # really hold much in memory. For now just approximate with getsizeof + ignore deep
+            mem_usage = sys.getsizeof(self)
+            lines.append("memory usage: {mem}\n".format(
+                mem=_sizeof_fmt(mem_usage, size_qualifier)))
+
+        fmt.buffer_put_lines(buf, lines)
+
     def to_string(self, buf=None, columns=None, col_space=None, header=True,
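A usage sketch for the new `info()`, mirroring pandas' buffer-based API (the cluster and index pattern are hypothetical):

```python
from io import StringIO
import eland as ed

df = ed.DataFrame('localhost', 'flights')  # hypothetical cluster/index pattern
buf = StringIO()
df.info(buf=buf, verbose=True)             # per-column non-null counts via df.count()
print(buf.getvalue())                      # type, index summary, dtypes, memory usage
```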
@@ -153,7 +293,7 @@ class DataFrame(NDFrame):
 
     def _getitem_column(self, key):
         if key not in self.columns:
-            raise KeyError("{}".format(key))
+            raise KeyError("Requested column is not in the DataFrame {}".format(key))
         s = self._reduce_dimension(self._query_compiler.getitem_column_array([key]))
         s._parent = self
         return s
@@ -196,6 +336,17 @@ class DataFrame(NDFrame):
             query_compiler=self._query_compiler.getitem_column_array(key)
         )
 
+    def _create_or_update_from_compiler(self, new_query_compiler, inplace=False):
+        """Returns or updates a DataFrame given new query_compiler"""
+        assert (
+            isinstance(new_query_compiler, type(self._query_compiler))
+            or type(new_query_compiler) in self._query_compiler.__class__.__bases__
+        ), "Invalid Query Compiler object: {}".format(type(new_query_compiler))
+        if not inplace:
+            return DataFrame(query_compiler=new_query_compiler)
+        else:
+            self._query_compiler=new_query_compiler
+
     def _reduce_dimension(self, query_compiler):
         return Series(query_compiler=query_compiler)
@@ -12,6 +12,428 @@ the method in the left column. ``Y`` stands for yes, ``N`` stands for no, ``P`` stands
 for partial (meaning some parameters may not be supported yet), and ``D`` stands for
 default to pandas.
 
+https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv represents a prioritised list.
+
++-------------------------+-------+------------------------------------------------+
+| Method | Count | Notes |
++-------------------------+-------+------------------------------------------------+
+| pd.read_csv | 1422 | Not implemented ed.read_es implemented instead |
++-------------------------+-------+------------------------------------------------+
+| pd.DataFrame | 886 | y |
++-------------------------+-------+------------------------------------------------+
+| df.append | 792 | Not implemented |
++-------------------------+-------+------------------------------------------------+
+| df.mean | 783 | y |
++-------------------------+-------+------------------------------------------------+
+| df.head | 783 | y |
++-------------------------+-------+------------------------------------------------+
+| df.drop | 761 | |
++-------------------------+-------+------------------------------------------------+
+| df.sum | 755 | y |
++-------------------------+-------+------------------------------------------------+
+| df.to_csv | 693 | |
++-------------------------+-------+------------------------------------------------+
+| df.get | 669 | |
++-------------------------+-------+------------------------------------------------+
+| df.mode | 653 | |
++-------------------------+-------+------------------------------------------------+
+| df.astype | 649 | |
++-------------------------+-------+------------------------------------------------+
+| df.sub | 637 | |
++-------------------------+-------+------------------------------------------------+
+| pd.concat | 582 | |
++-------------------------+-------+------------------------------------------------+
+| df.apply | 577 | |
++-------------------------+-------+------------------------------------------------+
+| df.groupby | 557 | |
++-------------------------+-------+------------------------------------------------+
+| df.join | 544 | |
++-------------------------+-------+------------------------------------------------+
+| df.fillna | 543 | |
++-------------------------+-------+------------------------------------------------+
+| df.max | 508 | |
++-------------------------+-------+------------------------------------------------+
+| df.reset_index | 434 | |
++-------------------------+-------+------------------------------------------------+
+| pd.unique | 433 | |
++-------------------------+-------+------------------------------------------------+
+| df.le | 405 | |
++-------------------------+-------+------------------------------------------------+
+| df.count | 399 | |
++-------------------------+-------+------------------------------------------------+
+| pd.value_counts | 397 | |
++-------------------------+-------+------------------------------------------------+
+| df.sort_values | 390 | |
++-------------------------+-------+------------------------------------------------+
+| df.transform | 387 | |
++-------------------------+-------+------------------------------------------------+
+| df.merge | 376 | |
++-------------------------+-------+------------------------------------------------+
+| df.add | 346 | |
++-------------------------+-------+------------------------------------------------+
+| df.isnull | 338 | |
++-------------------------+-------+------------------------------------------------+
+| df.min | 321 | |
++-------------------------+-------+------------------------------------------------+
+| df.copy | 314 | |
++-------------------------+-------+------------------------------------------------+
+| df.replace | 300 | |
++-------------------------+-------+------------------------------------------------+
+| df.std | 261 | |
++-------------------------+-------+------------------------------------------------+
+| df.hist | 246 | |
++-------------------------+-------+------------------------------------------------+
+| df.filter | 234 | |
++-------------------------+-------+------------------------------------------------+
+| df.describe | 220 | |
++-------------------------+-------+------------------------------------------------+
+| df.ne | 218 | |
++-------------------------+-------+------------------------------------------------+
+| df.corr | 217 | |
++-------------------------+-------+------------------------------------------------+
+| df.median | 217 | |
++-------------------------+-------+------------------------------------------------+
+| df.items | 212 | |
++-------------------------+-------+------------------------------------------------+
+| pd.to_datetime | 204 | |
++-------------------------+-------+------------------------------------------------+
+| df.isin | 203 | |
++-------------------------+-------+------------------------------------------------+
+| df.dropna | 195 | |
++-------------------------+-------+------------------------------------------------+
+| pd.get_dummies | 190 | |
++-------------------------+-------+------------------------------------------------+
+| df.rename | 185 | |
++-------------------------+-------+------------------------------------------------+
+| df.info | 180 | |
++-------------------------+-------+------------------------------------------------+
+| df.set_index | 166 | |
++-------------------------+-------+------------------------------------------------+
+| df.keys | 159 | |
++-------------------------+-------+------------------------------------------------+
+| df.sample | 155 | |
++-------------------------+-------+------------------------------------------------+
+| df.agg | 140 | |
++-------------------------+-------+------------------------------------------------+
+| df.where | 138 | |
++-------------------------+-------+------------------------------------------------+
+| df.boxplot | 134 | |
++-------------------------+-------+------------------------------------------------+
+| df.clip | 116 | |
++-------------------------+-------+------------------------------------------------+
+| df.round | 116 | |
++-------------------------+-------+------------------------------------------------+
+| df.abs | 101 | |
++-------------------------+-------+------------------------------------------------+
+| df.stack | 97 | |
++-------------------------+-------+------------------------------------------------+
+| df.tail | 94 | |
++-------------------------+-------+------------------------------------------------+
+| df.update | 92 | |
++-------------------------+-------+------------------------------------------------+
+| df.iterrows | 90 | |
++-------------------------+-------+------------------------------------------------+
+| df.transpose | 87 | |
++-------------------------+-------+------------------------------------------------+
+| df.any | 85 | |
++-------------------------+-------+------------------------------------------------+
+| df.pipe | 80 | |
++-------------------------+-------+------------------------------------------------+
+| pd.eval | 73 | |
++-------------------------+-------+------------------------------------------------+
+| df.eval | 73 | |
++-------------------------+-------+------------------------------------------------+
+| pd.read_json | 72 | |
++-------------------------+-------+------------------------------------------------+
+| df.nunique | 70 | |
++-------------------------+-------+------------------------------------------------+
+| df.pivot | 70 | |
++-------------------------+-------+------------------------------------------------+
+| df.select | 68 | |
++-------------------------+-------+------------------------------------------------+
+| df.as_matrix | 67 | |
++-------------------------+-------+------------------------------------------------+
+| df.notnull | 66 | |
++-------------------------+-------+------------------------------------------------+
+| df.cumsum | 66 | |
++-------------------------+-------+------------------------------------------------+
+| df.prod | 64 | |
++-------------------------+-------+------------------------------------------------+
+| df.unstack | 64 | |
++-------------------------+-------+------------------------------------------------+
+| df.drop_duplicates | 63 | |
++-------------------------+-------+------------------------------------------------+
+| df.div | 63 | |
++-------------------------+-------+------------------------------------------------+
+| pd.crosstab | 59 | |
++-------------------------+-------+------------------------------------------------+
+| df.select_dtypes | 57 | |
++-------------------------+-------+------------------------------------------------+
+| df.pow | 56 | |
++-------------------------+-------+------------------------------------------------+
+| df.sort_index | 56 | |
++-------------------------+-------+------------------------------------------------+
+| df.product | 52 | |
++-------------------------+-------+------------------------------------------------+
+| df.isna | 51 | |
++-------------------------+-------+------------------------------------------------+
+| df.dot | 46 | |
++-------------------------+-------+------------------------------------------------+
+| pd.cut | 45 | |
++-------------------------+-------+------------------------------------------------+
+| df.bool | 44 | |
++-------------------------+-------+------------------------------------------------+
+| df.to_dict | 44 | |
++-------------------------+-------+------------------------------------------------+
+| df.diff | 44 | |
++-------------------------+-------+------------------------------------------------+
+| df.insert | 44 | |
++-------------------------+-------+------------------------------------------------+
+| df.pop | 44 | |
++-------------------------+-------+------------------------------------------------+
+| df.query | 43 | |
++-------------------------+-------+------------------------------------------------+
+| df.var | 43 | |
++-------------------------+-------+------------------------------------------------+
+| df.__init__ | 41 | |
++-------------------------+-------+------------------------------------------------+
+| pd.to_numeric | 39 | |
++-------------------------+-------+------------------------------------------------+
+| df.squeeze | 39 | |
++-------------------------+-------+------------------------------------------------+
+| df.ge | 37 | |
++-------------------------+-------+------------------------------------------------+
+| df.quantile | 37 | |
++-------------------------+-------+------------------------------------------------+
+| df.reindex | 37 | |
++-------------------------+-------+------------------------------------------------+
+| df.rolling | 35 | |
++-------------------------+-------+------------------------------------------------+
+| pd.factorize | 32 | |
++-------------------------+-------+------------------------------------------------+
+| pd.melt | 31 | |
++-------------------------+-------+------------------------------------------------+
+| df.melt | 31 | |
++-------------------------+-------+------------------------------------------------+
+| df.rank | 31 | |
++-------------------------+-------+------------------------------------------------+
+| pd.read_table | 30 | |
++-------------------------+-------+------------------------------------------------+
+| pd.pivot_table | 30 | |
++-------------------------+-------+------------------------------------------------+
+| df.idxmax | 30 | |
++-------------------------+-------+------------------------------------------------+
+| pd.test | 29 | |
++-------------------------+-------+------------------------------------------------+
+| df.iteritems | 29 | |
++-------------------------+-------+------------------------------------------------+
+| df.shift | 28 | |
++-------------------------+-------+------------------------------------------------+
+| df.mul | 28 | |
++-------------------------+-------+------------------------------------------------+
+| pd.qcut | 25 | |
++-------------------------+-------+------------------------------------------------+
+| df.set_value | 25 | |
++-------------------------+-------+------------------------------------------------+
+| df.all | 24 | |
++-------------------------+-------+------------------------------------------------+
+| df.skew | 24 | |
++-------------------------+-------+------------------------------------------------+
+| df.aggregate | 23 | |
++-------------------------+-------+------------------------------------------------+
+| pd.match | 22 | |
++-------------------------+-------+------------------------------------------------+
+| df.nlargest | 22 | |
++-------------------------+-------+------------------------------------------------+
+| df.multiply | 21 | |
++-------------------------+-------+------------------------------------------------+
+| df.set_axis | 19 | |
++-------------------------+-------+------------------------------------------------+
+| df.eq | 18 | |
++-------------------------+-------+------------------------------------------------+
+| df.resample | 18 | |
++-------------------------+-------+------------------------------------------------+
+| pd.read_sql | 17 | |
++-------------------------+-------+------------------------------------------------+
+| df.duplicated | 16 | |
++-------------------------+-------+------------------------------------------------+
+| pd.date_range | 16 | |
++-------------------------+-------+------------------------------------------------+
+| df.interpolate | 15 | |
++-------------------------+-------+------------------------------------------------+
+| df.memory_usage | 15 | |
++-------------------------+-------+------------------------------------------------+
+| df.divide | 14 | |
++-------------------------+-------+------------------------------------------------+
+| df.cov | 13 | |
++-------------------------+-------+------------------------------------------------+
+| df.assign | 12 | |
++-------------------------+-------+------------------------------------------------+
+| df.subtract | 12 | |
++-------------------------+-------+------------------------------------------------+
+| pd.read_pickle | 11 | |
++-------------------------+-------+------------------------------------------------+
+| df.applymap | 11 | |
++-------------------------+-------+------------------------------------------------+
+| df.first | 11 | |
++-------------------------+-------+------------------------------------------------+
+| df.kurt | 10 | |
++-------------------------+-------+------------------------------------------------+
+| df.truncate | 10 | |
++-------------------------+-------+------------------------------------------------+
+| df.get_value | 9 | |
++-------------------------+-------+------------------------------------------------+
+| pd.read_hdf | 9 | |
++-------------------------+-------+------------------------------------------------+
+| df.to_html | 9 | |
++-------------------------+-------+------------------------------------------------+
+| pd.read_sql_query | 9 | |
++-------------------------+-------+------------------------------------------------+
+| df.take | 8 | |
++-------------------------+-------+------------------------------------------------+
+| df.to_pickle | 7 | |
++-------------------------+-------+------------------------------------------------+
+| df.itertuples | 7 | |
++-------------------------+-------+------------------------------------------------+
+| df.to_string | 7 | |
++-------------------------+-------+------------------------------------------------+
+| df.last | 7 | |
++-------------------------+-------+------------------------------------------------+
+| df.sem | 7 | |
++-------------------------+-------+------------------------------------------------+
+| pd.to_pickle | 7 | |
++-------------------------+-------+------------------------------------------------+
+| df.to_json | 7 | |
++-------------------------+-------+------------------------------------------------+
+| df.idxmin | 7 | |
++-------------------------+-------+------------------------------------------------+
+| df.xs | 6 | |
++-------------------------+-------+------------------------------------------------+
+| df.combine | 6 | |
++-------------------------+-------+------------------------------------------------+
+| pd.rolling_mean | 6 | |
++-------------------------+-------+------------------------------------------------+
+| df.to_period | 6 | |
++-------------------------+-------+------------------------------------------------+
+| df.convert_objects | 5 | |
++-------------------------+-------+------------------------------------------------+
+| df.mask | 4 | |
++-------------------------+-------+------------------------------------------------+
+| df.pct_change | 4 | |
++-------------------------+-------+------------------------------------------------+
+| df.add_prefix | 4 | |
++-------------------------+-------+------------------------------------------------+
+| pd.read_excel | 4 | |
++-------------------------+-------+------------------------------------------------+
+| pd.rolling_std | 3 | |
++-------------------------+-------+------------------------------------------------+
+| df.to_records | 3 | |
++-------------------------+-------+------------------------------------------------+
+| df.corrwith | 3 | |
++-------------------------+-------+------------------------------------------------+
+| df.swapaxes | 3 | |
++-------------------------+-------+------------------------------------------------+
+| df.__iter__ | 3 | |
++-------------------------+-------+------------------------------------------------+
+| df.to_sql | 3 | |
++-------------------------+-------+------------------------------------------------+
+| pd.read_feather | 3 | |
++-------------------------+-------+------------------------------------------------+
+| df.to_feather | 3 | |
++-------------------------+-------+------------------------------------------------+
+| df.__len__ | 3 | |
++-------------------------+-------+------------------------------------------------+
+| df.kurtosis | 3 | |
++-------------------------+-------+------------------------------------------------+
+| df.mod | 2 | |
++-------------------------+-------+------------------------------------------------+
+| df.to_sparse | 2 | |
++-------------------------+-------+------------------------------------------------+
+| df.get_values | 2 | |
++-------------------------+-------+------------------------------------------------+
+| df.__eq__ | 2 | |
++-------------------------+-------+------------------------------------------------+
+| pd.bdate_range | 2 | |
++-------------------------+-------+------------------------------------------------+
+| df.get_dtype_counts | 2 | |
++-------------------------+-------+------------------------------------------------+
+| df.combine_first | 2 | |
++-------------------------+-------+------------------------------------------------+
+| df._get_numeric_data | 2 | |
++-------------------------+-------+------------------------------------------------+
+| df.nsmallest | 2 | |
++-------------------------+-------+------------------------------------------------+
+| pd.scatter_matrix | 2 | |
++-------------------------+-------+------------------------------------------------+
+| df.rename_axis | 2 | |
++-------------------------+-------+------------------------------------------------+
+| df.__setstate__ | 2 | |
++-------------------------+-------+------------------------------------------------+
+| df.cumprod | 2 | |
++-------------------------+-------+------------------------------------------------+
+| df.__getstate__ | 2 | |
++-------------------------+-------+------------------------------------------------+
+| df.equals | 2 | |
++-------------------------+-------+------------------------------------------------+
+| df.__getitem__ | 2 | |
++-------------------------+-------+------------------------------------------------+
+| df.clip_upper | 2 | |
++-------------------------+-------+------------------------------------------------+
+| df.floordiv | 2 | |
++-------------------------+-------+------------------------------------------------+
+| df.to_excel | 2 | |
++-------------------------+-------+------------------------------------------------+
+| df.reindex_axis | 1 | |
++-------------------------+-------+------------------------------------------------+
+| pd.to_timedelta | 1 | |
++-------------------------+-------+------------------------------------------------+
+| df.ewm | 1 | |
++-------------------------+-------+------------------------------------------------+
+| df.tz_localize | 1 | |
++-------------------------+-------+------------------------------------------------+
+| df.tz_convert | 1 | |
++-------------------------+-------+------------------------------------------------+
+| df.to_hdf | 1 | |
++-------------------------+-------+------------------------------------------------+
+| df.lookup | 1 | |
++-------------------------+-------+------------------------------------------------+
+| pd.merge_ordered | 1 | |
++-------------------------+-------+------------------------------------------------+
+| df.swaplevel | 1 | |
++-------------------------+-------+------------------------------------------------+
+| df.first_valid_index | 1 | |
++-------------------------+-------+------------------------------------------------+
+| df.lt | 1 | |
++-------------------------+-------+------------------------------------------------+
+| df.add_suffix | 1 | |
++-------------------------+-------+------------------------------------------------+
+| pd.rolling_median | 1 | |
++-------------------------+-------+------------------------------------------------+
+| df.to_dense | 1 | |
++-------------------------+-------+------------------------------------------------+
+| df.mad | 1 | |
++-------------------------+-------+------------------------------------------------+
+| df.align | 1 | |
++-------------------------+-------+------------------------------------------------+
+| df.__copy__ | 1 | |
++-------------------------+-------+------------------------------------------------+
+| pd.set_eng_float_format | 1 | |
++-------------------------+-------+------------------------------------------------+
+
 +---------------------------+---------------------------------+----------------------------------------------------+
 | DataFrame method | Eland Implementation? (Y/N/P/D) | Notes for Current implementation |
 +---------------------------+---------------------------------+----------------------------------------------------+
@@ -53,6 +53,7 @@ class Index:
     # Make iterable
     def __next__(self):
         # TODO resolve this hack to make this 'iterable'
+        print("In Index.__next__")
         raise StopIteration()
 
     def __iter__(self):
@@ -1,8 +1,9 @@
 import warnings
 
 import pandas as pd
-from pandas.core.dtypes.common import (is_float_dtype, is_bool_dtype, is_integer_dtype, is_datetime_or_timedelta_dtype, is_string_dtype)
+from pandas.core.dtypes.common import (is_float_dtype, is_bool_dtype, is_integer_dtype, is_datetime_or_timedelta_dtype,
+                                       is_string_dtype)
 
 
 class Mappings:
     """
@@ -33,8 +34,7 @@ class Mappings:
     def __init__(self,
                  client=None,
                  index_pattern=None,
-                 mappings=None,
-                 columns=None):
+                 mappings=None):
         """
         Parameters
         ----------
@@ -48,9 +48,6 @@ class Mappings:
 
         mappings: Mappings
             Object to copy
-
-        columns: list of str
-            Columns to copy
         """
         if (client is not None) and (index_pattern is not None):
             get_mapping = client.get_mapping(index=index_pattern)
@@ -203,11 +200,11 @@ class Mappings:
 
         if 'non_aggregatable_indices' in vv:
             warnings.warn("Field {} has conflicting aggregatable fields across indexes {}",
-                          format(field_name, vv['non_aggregatable_indices']),
+                          format(field, vv['non_aggregatable_indices']),
                           UserWarning)
         if 'non_searchable_indices' in vv:
             warnings.warn("Field {} has conflicting searchable fields across indexes {}",
-                          format(field_name, vv['non_searchable_indices']),
+                          format(field, vv['non_searchable_indices']),
                           UserWarning)
 
         capability_matrix_df = pd.DataFrame.from_dict(capability_matrix, orient='index', columns=columns)
@@ -406,16 +403,23 @@ class Mappings:
 
         return is_source_field
 
-    def numeric_source_fields(self):
+    def numeric_source_fields(self, columns):
         """
         Returns
         -------
         numeric_source_fields: list of str
-            List of source fields where pd_dtype == (int64 or float64)
+            List of source fields where pd_dtype == (int64 or float64 or bool)
         """
-        return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
-                                           ((self._mappings_capabilities.pd_dtype == 'int64') |
-                                            (self._mappings_capabilities.pd_dtype == 'float64'))].index.tolist()
+        if columns is not None:
+            return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
+                                               ((self._mappings_capabilities.pd_dtype == 'int64') |
+                                                (self._mappings_capabilities.pd_dtype == 'float64') |
+                                                (self._mappings_capabilities.pd_dtype == 'bool'))].loc[columns].index.tolist()
+        else:
+            return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
+                                               ((self._mappings_capabilities.pd_dtype == 'int64') |
+                                                (self._mappings_capabilities.pd_dtype == 'float64') |
+                                                (self._mappings_capabilities.pd_dtype == 'bool'))].index.tolist()
 
     def source_fields(self):
         """
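The filter above is ordinary pandas: a boolean mask over the capabilities frame, then a row restriction with `.loc`. A toy equivalent with hypothetical capabilities data:

```python
import pandas as pd

caps = pd.DataFrame({'_source': [True, True, True],
                     'pd_dtype': ['int64', 'object', 'float64']},
                    index=['total_price', 'name', 'discount'])
numeric = caps[(caps['_source'] == True) &
               (caps['pd_dtype'].isin(['int64', 'float64', 'bool']))]
print(numeric.loc[['total_price', 'discount']].index.tolist())
# ['total_price', 'discount']
```

Note that `.loc[columns]` assumes every requested column survives the dtype mask; a non-numeric label in `columns` would raise a KeyError in plain pandas.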
@@ -435,16 +439,20 @@ class Mappings:
         """
         return len(self.source_fields())
 
-    def dtypes(self):
+    def dtypes(self, columns=None):
         """
         Returns
         -------
         dtypes: pd.Series
             Source field name + pd_dtype
         """
+        if columns is not None:
+            return pd.Series(
+                {key: self._source_field_pd_dtypes[key] for key in columns})
+
         return pd.Series(self._source_field_pd_dtypes)
 
-    def get_dtype_counts(self):
+    def get_dtype_counts(self, columns=None):
         """
         Return counts of unique dtypes in this object.
@@ -453,10 +461,17 @@ class Mappings:
 
         get_dtype_counts : Series
             Series with the count of columns with each dtype.
         """
-        return pd.Series(self._mappings_capabilities[self._mappings_capabilities._source == True].groupby('pd_dtype')[
-            '_source'].count().to_dict())
+        if columns is not None:
+            return pd.Series(self._mappings_capabilities[self._mappings_capabilities._source == True]
+                             .loc[columns]
+                             .groupby('pd_dtype')['_source']
+                             .count().to_dict())
+
+        return pd.Series(self._mappings_capabilities[self._mappings_capabilities._source == True]
+                         .groupby('pd_dtype')['_source']
+                         .count().to_dict())
 
     def info_es(self, buf):
         buf.write("Mappings:\n")
         buf.write("\tcapabilities: {0}\n".format(self._mappings_capabilities))
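The columns-aware branch is the same groupby-count with a `.loc` restriction in the middle; a toy equivalent:

```python
import pandas as pd

caps = pd.DataFrame({'_source': [True, True, True],
                     'pd_dtype': ['int64', 'int64', 'object']},
                    index=['a', 'b', 'c'])
print(pd.Series(caps[caps['_source'] == True]
                .loc[['a', 'b']]
                .groupby('pd_dtype')['_source']
                .count().to_dict()))
# int64    2
```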
131  eland/ndframe.py

@@ -23,8 +23,13 @@ only Elasticsearch aggregatable fields can be aggregated or grouped.
 
 """
 
+import sys
+
+import pandas as pd
 from modin.pandas.base import BasePandasDataset
 from modin.pandas.indexing import _iLocIndexer
+from pandas.util._validators import validate_bool_kwarg
+from pandas.core.dtypes.common import is_list_like
 
 from eland import ElandQueryCompiler
@@ -52,12 +57,18 @@ class NDFrame(BasePandasDataset):
                              index_field=index_field)
         self._query_compiler = query_compiler
 
     def _get_index(self):
         return self._query_compiler.index
 
     index = property(_get_index)
 
+    @property
+    def dtypes(self):
+        return self._query_compiler.dtypes
+
+    def get_dtype_counts(self):
+        return self._query_compiler.get_dtype_counts()
+
     def _build_repr_df(self, num_rows, num_cols):
         # Overriden version of BasePandasDataset._build_repr_df
         # to avoid issues with concat
@@ -91,6 +102,10 @@ class NDFrame(BasePandasDataset):
             return self[key]
         raise e
 
+    def __sizeof__(self):
+        # Don't default to pandas, just return approximation TODO - make this more accurate
+        return sys.getsizeof(self._query_compiler)
+
     @property
     def iloc(self):
         """Purely integer-location based indexing for selection by position.
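`__sizeof__` is what `sys.getsizeof(self)` calls in `DataFrame.info()` above; on CPython, `getsizeof` adds the GC header on top of whatever `__sizeof__` returns, so the reported figure is an approximation by design. A minimal illustration:

```python
import sys

class Probe:
    def __sizeof__(self):
        return 1024

print(sys.getsizeof(Probe()))  # 1024 plus the GC header on CPython
```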
@@ -101,3 +116,117 @@ class NDFrame(BasePandasDataset):
 
     def info_es(self, buf):
         self._query_compiler.info_es(buf)
+
+    def drop(
+            self,
+            labels=None,
+            axis=0,
+            index=None,
+            columns=None,
+            level=None,
+            inplace=False,
+            errors="raise",
+    ):
+        """Return new object with labels in requested axis removed.
+
+        Args:
+            labels: Index or column labels to drop.
+            axis: Whether to drop labels from the index (0 / 'index') or
+                columns (1 / 'columns').
+            index, columns: Alternative to specifying axis (labels, axis=1 is
+                equivalent to columns=labels).
+            level: For MultiIndex
+            inplace: If True, do operation inplace and return None.
+            errors: If 'ignore', suppress error and existing labels are
+                dropped.
+
+        Returns:
+            dropped : type of caller
+
+        (derived from modin.base.BasePandasDataset)
+        """
+        # Level not supported
+        if level is not None:
+            raise NotImplementedError("level not supported {}".format(level))
+
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        if labels is not None:
+            if index is not None or columns is not None:
+                raise ValueError("Cannot specify both 'labels' and 'index'/'columns'")
+            axis = pd.DataFrame()._get_axis_name(axis)
+            axes = {axis: labels}
+        elif index is not None or columns is not None:
+            axes, _ = pd.DataFrame()._construct_axes_from_arguments(
+                (index, columns), {}
+            )
+        else:
+            raise ValueError(
+                "Need to specify at least one of 'labels', 'index' or 'columns'"
+            )
+
+        # TODO Clean up this error checking
+        if "index" not in axes:
+            axes["index"] = None
+        elif axes["index"] is not None:
+            if not is_list_like(axes["index"]):
+                axes["index"] = [axes["index"]]
+            if errors == "raise":
+                # Check if axes['index'] values exists in index
+                count = self._query_compiler._index_matches_count(axes["index"])
+                if count != len(axes["index"]):
+                    raise ValueError(
+                        "number of labels {}!={} not contained in axis".format(count, len(axes["index"]))
+                    )
+            else:
+                """
+                axes["index"] = self._query_compiler.index_matches(axes["index"])
+                # If the length is zero, we will just do nothing
+                if not len(axes["index"]):
+                    axes["index"] = None
+                """
+                raise NotImplementedError()
+
+        if "columns" not in axes:
+            axes["columns"] = None
+        elif axes["columns"] is not None:
+            if not is_list_like(axes["columns"]):
+                axes["columns"] = [axes["columns"]]
+            if errors == "raise":
+                non_existant = [
+                    obj for obj in axes["columns"] if obj not in self.columns
+                ]
+                if len(non_existant):
+                    raise ValueError(
+                        "labels {} not contained in axis".format(non_existant)
+                    )
+            else:
+                axes["columns"] = [
+                    obj for obj in axes["columns"] if obj in self.columns
+                ]
+                # If the length is zero, we will just do nothing
+                if not len(axes["columns"]):
+                    axes["columns"] = None
+
+        new_query_compiler = self._query_compiler.drop(
+            index=axes["index"], columns=axes["columns"]
+        )
+        return self._create_or_update_from_compiler(new_query_compiler, inplace)
+
+    # TODO implement arguments
+    def mean(self):
+        return self._query_compiler.mean()
+
+    def sum(self, numeric_only=True):
+        if numeric_only == False:
+            raise NotImplementedError("Only sum of numeric fields is implemented")
+        return self._query_compiler.sum()
+
+    def min(self, numeric_only=True):
+        if numeric_only == False:
+            raise NotImplementedError("Only sum of numeric fields is implemented")
+        return self._query_compiler.min()
+
+    def max(self, numeric_only=True):
+        if numeric_only == False:
+            raise NotImplementedError("Only sum of numeric fields is implemented")
+        return self._query_compiler.max()
+
+    def describe(self):
+        return self._query_compiler.describe()
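A usage sketch for the new `drop()` (cluster, index pattern and column names are hypothetical); column drops are recorded against the query compiler rather than materialising data:

```python
import eland as ed

df = ed.DataFrame('localhost', 'flights')            # hypothetical cluster/index
smaller = df.drop(columns=['DestWeather'])           # returns a new eland DataFrame
df.drop(labels='DestWeather', axis=1, inplace=True)  # or update in place (returns None)
```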
@@ -1,9 +1,11 @@
+import copy
 from enum import Enum
 
-from pandas.core.indexes.numeric import Int64Index
-from pandas.core.indexes.range import RangeIndex
-
-import copy
+import pandas as pd
+from elasticsearch_dsl import Search
+
+from eland import Index
+from eland import Query
 
 
 class Operations:
@@ -16,17 +18,6 @@ class Operations:
     - a query to filter the results (e.g. df.A > 10)
 
     This is maintained as a 'task graph' (inspired by dask)
-
-    A task graph is a dictionary mapping keys to computations:
-
-    A key is any hashable value that is not a task:
-    ```
-    {'x': 1,
-     'y': 2,
-     'z': (add, 'x', 'y'),
-     'w': (sum, ['x', 'y', 'z']),
-     'v': [(sum, ['w', 'z']), 2]}
-    ```
     (see https://docs.dask.org/en/latest/spec.html)
     """
@@ -54,7 +45,6 @@ class Operations:
 
         return Operations.SortOrder.DESC
 
-
     def __init__(self, tasks=None):
         if tasks == None:
             self._tasks = []
@@ -84,6 +74,11 @@ class Operations:
         # TODO - validate we are setting columns to a subset of last columns?
         task = ('columns', columns)
         self._tasks.append(task)
+        # Iterate backwards through task list looking for last 'columns' task
+        for task in reversed(self._tasks):
+            if task[0] == 'columns':
+                return task[1]
+        return None
 
     def get_columns(self):
         # Iterate backwards through task list looking for last 'columns' task
@@ -95,10 +90,132 @@ class Operations:
     def __repr__(self):
         return repr(self._tasks)
 
-    def to_pandas(self, query_compiler):
-        query, post_processing = self._to_es_query()
-
-        size, sort_params = Operations._query_to_params(query)
+    def count(self, query_compiler):
+        query_params, post_processing = self._resolve_tasks()
+
+        # Elasticsearch _count is very efficient and so used to return results here. This means that
+        # data frames that have restricted size or sort params will not return valid results (_count doesn't support size).
+        # Longer term we may fall back to pandas, but this may result in loading all index into memory.
+        if self._size(query_params, post_processing) is not None:
+            raise NotImplementedError("Requesting count with additional query and processing parameters "
+                                      "not supported {0} {1}"
+                                      .format(query_params, post_processing))
+
+        # Only return requested columns
+        fields = query_compiler.columns
+
+        counts = {}
+        for field in fields:
+            body = Query(query_params['query'])
+            body.exists(field, must=True)
+
+            field_exists_count = query_compiler._client.count(index=query_compiler._index_pattern,
+                                                              body=body.to_count_body())
+            counts[field] = field_exists_count
+
+        return pd.Series(data=counts, index=fields)
+    def mean(self, query_compiler):
+        return self._metric_aggs(query_compiler, 'avg')
+
+    def sum(self, query_compiler):
+        return self._metric_aggs(query_compiler, 'sum')
+
+    def max(self, query_compiler):
+        return self._metric_aggs(query_compiler, 'max')
+
+    def min(self, query_compiler):
+        return self._metric_aggs(query_compiler, 'min')
+
+    def _metric_aggs(self, query_compiler, func):
+        query_params, post_processing = self._resolve_tasks()
+
+        size = self._size(query_params, post_processing)
+        if size is not None:
+            raise NotImplementedError("Can not count field matches if size is set {}".format(size))
+
+        columns = self.get_columns()
+
+        numeric_source_fields = query_compiler._mappings.numeric_source_fields(columns)
+
+        body = Query(query_params['query'])
+
+        for field in numeric_source_fields:
+            body.metric_aggs(field, func, field)
+
+        response = query_compiler._client.search(
+            index=query_compiler._index_pattern,
+            size=0,
+            body=body.to_search_body())
+
+        # Results are of the form
+        # "aggregations" : {
+        #   "AvgTicketPrice" : {
+        #     "value" : 628.2536888148849
+        #   }
+        # }
+        results = {}
+
+        for field in numeric_source_fields:
+            results[field] = response['aggregations'][field]['value']
+
+        s = pd.Series(data=results, index=numeric_source_fields)
+
+        return s
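For `func='avg'` on one field, the request built above amounts to a standard metric aggregation sent with `size=0`; a sketch of the shapes involved (the exact output of `to_search_body()` is not part of this diff, so this is an assumption):

```python
# Hypothetical request body for a single numeric field:
request = {"aggs": {"AvgTicketPrice": {"avg": {"field": "AvgTicketPrice"}}}}

# ...and the slice of the response the loop reads back:
response = {"aggregations": {"AvgTicketPrice": {"value": 628.2536888148849}}}
print(response['aggregations']['AvgTicketPrice']['value'])
```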
|
def describe(self, query_compiler):
|
||||||
|
query_params, post_processing = self._resolve_tasks()
|
||||||
|
|
||||||
|
size = self._size(query_params, post_processing)
|
||||||
|
if size is not None:
|
||||||
|
raise NotImplementedError("Can not count field matches if size is set {}".format(size))
|
||||||
|
|
||||||
|
columns = self.get_columns()
|
||||||
|
|
||||||
|
numeric_source_fields = query_compiler._mappings.numeric_source_fields(columns)
|
||||||
|
|
||||||
|
# for each field we compute:
|
||||||
|
# count, mean, std, min, 25%, 50%, 75%, max
|
||||||
|
body = Query(query_params['query'])
|
||||||
|
|
||||||
|
for field in numeric_source_fields:
|
||||||
|
body.metric_aggs('extended_stats_' + field, 'extended_stats', field)
|
||||||
|
body.metric_aggs('percentiles_' + field, 'percentiles', field)
|
||||||
|
|
||||||
|
print(body.to_search_body())
|
||||||
|
|
||||||
|
response = query_compiler._client.search(
|
||||||
|
index=query_compiler._index_pattern,
|
||||||
|
size=0,
|
||||||
|
body=body.to_search_body())
|
||||||
|
|
||||||
|
results = {}
|
||||||
|
|
||||||
|
for field in numeric_source_fields:
|
||||||
|
values = list()
|
||||||
|
values.append(response['aggregations']['extended_stats_' + field]['count'])
|
||||||
|
values.append(response['aggregations']['extended_stats_' + field]['avg'])
|
||||||
|
values.append(response['aggregations']['extended_stats_' + field]['std_deviation'])
|
||||||
|
values.append(response['aggregations']['extended_stats_' + field]['min'])
|
||||||
|
values.append(response['aggregations']['percentiles_' + field]['values']['25.0'])
|
||||||
|
values.append(response['aggregations']['percentiles_' + field]['values']['50.0'])
|
||||||
|
values.append(response['aggregations']['percentiles_' + field]['values']['75.0'])
|
||||||
|
values.append(response['aggregations']['extended_stats_' + field]['max'])
|
||||||
|
|
||||||
|
# if not None
|
||||||
|
if values.count(None) < len(values):
|
||||||
|
results[field] = values
|
||||||
|
|
||||||
|
df = pd.DataFrame(data=results, index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
|
||||||
|
|
||||||
|
return df
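Each numeric field contributes two aggregations to the body printed above; for one field the aggs section looks roughly like this (field name illustrative). Note the percentiles agg is approximate, which is why the describe tests further down cannot compare exactly against pandas:

    "aggs": {
        "extended_stats_DistanceKilometers": {
            "extended_stats": {"field": "DistanceKilometers"}
        },
        "percentiles_DistanceKilometers": {
            "percentiles": {"field": "DistanceKilometers"}
        }
    }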
     def to_pandas(self, query_compiler):
+        query_params, post_processing = self._resolve_tasks()
+
+        size, sort_params = Operations._query_params_to_size_and_sort(query_params)
+
+        body = Query(query_params['query'])
+
         # Only return requested columns
         columns = self.get_columns()

@@ -110,10 +227,12 @@ class Operations:
                 index=query_compiler._index_pattern,
                 size=size,
                 sort=sort_params,
+                body=body.to_search_body(),
                 _source=columns)
         else:
             es_results = query_compiler._client.scan(
                 index=query_compiler._index_pattern,
+                query=body.to_search_body(),
                 _source=columns)
             # create post sort
             if sort_params is not None:
@@ -132,25 +251,65 @@ class Operations:
         task = ('squeeze', axis)
         self._tasks.append(task)

-    def to_count(self, query_compiler):
+    def index_count(self, query_compiler, field):
-        query, post_processing = self._to_es_query()
+        # field is the index field so count values
+        query_params, post_processing = self._resolve_tasks()

-        size = query['query_size']  # can be None
+        size = self._size(query_params, post_processing)

-        pp_size = self._count_post_processing(post_processing)
-        if pp_size is not None:
-            if size is not None:
-                size = min(size, pp_size)
-            else:
-                size = pp_size

         # Size is dictated by operations
         if size is not None:
+            # TODO - this is not necessarily valid as the field may not exist in ALL these docs
             return size

-        exists_query = {"query": {"exists": {"field": query_compiler.index.index_field}}}
+        body = Query(query_params['query'])
+        body.exists(field, must=True)

-        return query_compiler._client.count(index=query_compiler._index_pattern, body=exists_query)
+        return query_compiler._client.count(index=query_compiler._index_pattern, body=body.to_count_body())
+    def _validate_index_operation(self, items):
+        if not isinstance(items, list):
+            raise TypeError("list item required - not {}".format(type(items)))
+
+        # field is the index field so count values
+        query_params, post_processing = self._resolve_tasks()
+
+        size = self._size(query_params, post_processing)
+
+        # Size is dictated by operations
+        if size is not None:
+            raise NotImplementedError("Can not count field matches if size is set {}".format(size))
+
+        return query_params, post_processing
+
+    def index_matches_count(self, query_compiler, field, items):
+        query_params, post_processing = self._validate_index_operation(items)
+
+        body = Query(query_params['query'])
+
+        if field == Index.ID_INDEX_FIELD:
+            body.ids(items, must=True)
+        else:
+            body.terms(field, items, must=True)
+
+        return query_compiler._client.count(index=query_compiler._index_pattern, body=body.to_count_body())
+    def drop_index_values(self, query_compiler, field, items):
+        self._validate_index_operation(items)
+
+        # Putting boolean queries together
+        # i = 10
+        # not i = 20
+        # _id in [1,2,3]
+        # _id not in [1,2,3]
+        # a in ['a','b','c']
+        # b not in ['a','b','c']
+        # For now use term queries
+        if field == Index.ID_INDEX_FIELD:
+            task = ('query_ids', ('must_not', items))
+        else:
+            task = ('query_terms', ('must_not', (field, items)))
+        self._tasks.append(task)
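When the queued must_not task is later resolved, the drop becomes a bool clause; dropping two ids, for example, would end up roughly as (ids invented for the example):

    # Sketch: result of ('query_ids', ('must_not', ['1', '2'])) after resolution
    {
        "query": {
            "bool": {
                "must": [],
                "must_not": [{"ids": {"values": ["1", "2"]}}]
            }
        }
    }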
     @staticmethod
     def _sort_params_to_postprocessing(input):

@@ -165,18 +324,21 @@ class Operations:
         return task

     @staticmethod
-    def _query_to_params(query):
+    def _query_params_to_size_and_sort(query_params):
         sort_params = None
-        if query['query_sort_field'] and query['query_sort_order']:
-            sort_params = query['query_sort_field'] + ":" + Operations.SortOrder.to_string(query['query_sort_order'])
+        if query_params['query_sort_field'] and query_params['query_sort_order']:
+            sort_params = query_params['query_sort_field'] + ":" + Operations.SortOrder.to_string(
+                query_params['query_sort_order'])

-        size = query['query_size']
+        size = query_params['query_size']

         return size, sort_params

     @staticmethod
     def _count_post_processing(post_processing):
         size = None
         for action in post_processing:
             if action[0] == 'head' or action[0] == 'tail':
                 if size is None or action[1][1] < size:

@@ -201,45 +363,48 @@ class Operations:
             else:
                 df = df.sort_values(sort_field, False)
         elif action[0] == 'iloc':
             index_indexer = action[1][0]
             column_indexer = action[1][1]
             if index_indexer is None:
                 index_indexer = slice(None)
             if column_indexer is None:
                 column_indexer = slice(None)
             df = df.iloc[index_indexer, column_indexer]
         elif action[0] == 'squeeze':
-            print(df)
             df = df.squeeze(axis=action[1])
-            print(df)

         return df

-    def _to_es_query(self):
+    def _resolve_tasks(self):
         # We now try and combine all tasks into an Elasticsearch query
         # Some operations can be simply combined into a single query
         # other operations require pre-queries and then combinations
         # other operations require in-core post-processing of results
-        query = {"query_sort_field": None,
-                 "query_sort_order": None,
-                 "query_size": None,
-                 "query_fields": None}
+        query_params = {"query_sort_field": None,
+                        "query_sort_order": None,
+                        "query_size": None,
+                        "query_fields": None,
+                        "query": Query()}

         post_processing = []

         for task in self._tasks:
             if task[0] == 'head':
-                query, post_processing = self._resolve_head(task, query, post_processing)
+                query_params, post_processing = self._resolve_head(task, query_params, post_processing)
             elif task[0] == 'tail':
-                query, post_processing = self._resolve_tail(task, query, post_processing)
+                query_params, post_processing = self._resolve_tail(task, query_params, post_processing)
             elif task[0] == 'iloc':
-                query, post_processing = self._resolve_iloc(task, query, post_processing)
+                query_params, post_processing = self._resolve_iloc(task, query_params, post_processing)
+            elif task[0] == 'query_ids':
+                query_params, post_processing = self._resolve_query_ids(task, query_params, post_processing)
+            elif task[0] == 'query_terms':
+                query_params, post_processing = self._resolve_query_terms(task, query_params, post_processing)
             else:  # a lot of operations simply post-process the dataframe - put these straight through
-                query, post_processing = self._resolve_post_processing_task(task, query, post_processing)
+                query_params, post_processing = self._resolve_post_processing_task(task, query_params, post_processing)

-        return query, post_processing
+        return query_params, post_processing
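To make the dispatch concrete, a hedged sketch of how a small task list folds into query_params (task shapes taken from the code above; sort field, size, and ids invented):

    tasks = [('head', ('_doc', 10)), ('query_ids', ('must_not', ['1', '2']))]
    # _resolve_tasks would roughly produce:
    #   query_params['query_sort_field'] = '_doc'
    #   query_params['query_size']       = 10
    #   query_params['query']            = Query with {'ids': {'values': ['1', '2']}} in must_not
    #   post_processing                  = []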
-    def _resolve_head(self, item, query, post_processing):
+    def _resolve_head(self, item, query_params, post_processing):
         # head - sort asc, size n
         # |12345-------------|
         query_sort_field = item[1][0]

@@ -251,26 +416,26 @@ class Operations:
         # overwriting previous head)
         if len(post_processing) > 0:
             post_processing.append(item)
-            return query, post_processing
+            return query_params, post_processing

-        if query['query_sort_field'] is None:
-            query['query_sort_field'] = query_sort_field
+        if query_params['query_sort_field'] is None:
+            query_params['query_sort_field'] = query_sort_field
         # if it is already sorted we use existing field

-        if query['query_sort_order'] is None:
-            query['query_sort_order'] = query_sort_order
+        if query_params['query_sort_order'] is None:
+            query_params['query_sort_order'] = query_sort_order
         # if it is already sorted we get head of existing order

-        if query['query_size'] is None:
-            query['query_size'] = query_size
+        if query_params['query_size'] is None:
+            query_params['query_size'] = query_size
         else:
             # truncate if head is smaller
-            if query_size < query['query_size']:
-                query['query_size'] = query_size
+            if query_size < query_params['query_size']:
+                query_params['query_size'] = query_size

-        return query, post_processing
+        return query_params, post_processing

-    def _resolve_tail(self, item, query, post_processing):
+    def _resolve_tail(self, item, query_params, post_processing):
         # tail - sort desc, size n, post-process sort asc
         # |-------------12345|
         query_sort_field = item[1][0]
@@ -278,41 +443,41 @@ class Operations:
         query_size = item[1][1]

         # If this is a tail of a tail adjust settings and return
-        if query['query_size'] is not None and \
-                query['query_sort_order'] == query_sort_order and \
+        if query_params['query_size'] is not None and \
+                query_params['query_sort_order'] == query_sort_order and \
                 post_processing == [('sort_index')]:
-            if query_size < query['query_size']:
-                query['query_size'] = query_size
-            return query, post_processing
+            if query_size < query_params['query_size']:
+                query_params['query_size'] = query_size
+            return query_params, post_processing

         # If we are already postprocessing the query results, just get 'tail' of these
         # (note, currently we just append another tail, we don't optimise by
         # overwriting previous tail)
         if len(post_processing) > 0:
             post_processing.append(item)
-            return query, post_processing
+            return query_params, post_processing

         # If results are already constrained, just get 'tail' of these
         # (note, currently we just append another tail, we don't optimise by
         # overwriting previous tail)
-        if query['query_size'] is not None:
+        if query_params['query_size'] is not None:
             post_processing.append(item)
-            return query, post_processing
+            return query_params, post_processing
         else:
-            query['query_size'] = query_size
-        if query['query_sort_field'] is None:
-            query['query_sort_field'] = query_sort_field
-        if query['query_sort_order'] is None:
-            query['query_sort_order'] = query_sort_order
+            query_params['query_size'] = query_size
+        if query_params['query_sort_field'] is None:
+            query_params['query_sort_field'] = query_sort_field
+        if query_params['query_sort_order'] is None:
+            query_params['query_sort_order'] = query_sort_order
         else:
             # reverse sort order
-            query['query_sort_order'] = Operations.SortOrder.reverse(query_sort_order)
+            query_params['query_sort_order'] = Operations.SortOrder.reverse(query_sort_order)

         post_processing.append(('sort_index'))

-        return query, post_processing
+        return query_params, post_processing

-    def _resolve_iloc(self, item, query, post_processing):
+    def _resolve_iloc(self, item, query_params, post_processing):
         # tail - sort desc, size n, post-process sort asc
         # |---4--7-9---------|

@@ -322,33 +487,70 @@ class Operations:
         last_item = int_index.max()

         # If we have a query_size we do this post processing
-        if query['query_size'] is not None:
+        if query_params['query_size'] is not None:
             post_processing.append(item)
-            return query, post_processing
+            return query_params, post_processing

         # size should be > last item
-        query['query_size'] = last_item + 1
+        query_params['query_size'] = last_item + 1
         post_processing.append(item)

-        return query, post_processing
+        return query_params, post_processing
-    def _resolve_post_processing_task(self, item, query, post_processing):
+    def _resolve_query_ids(self, item, query_params, post_processing):
+        # task = ('query_ids', ('must_not', items))
+        must_clause = item[1][0]
+        ids = item[1][1]
+
+        if must_clause == 'must':
+            query_params['query'].ids(ids, must=True)
+        else:
+            query_params['query'].ids(ids, must=False)
+
+        return query_params, post_processing
+
+    def _resolve_query_terms(self, item, query_params, post_processing):
+        # task = ('query_terms', ('must_not', (field, terms)))
+        must_clause = item[1][0]
+        field = item[1][1][0]
+        terms = item[1][1][1]
+
+        if must_clause == 'must':
+            query_params['query'].terms(field, terms, must=True)
+        else:
+            query_params['query'].terms(field, terms, must=False)
+
+        return query_params, post_processing
+    def _resolve_post_processing_task(self, item, query_params, post_processing):
         # Just do this in post-processing
         post_processing.append(item)

-        return query, post_processing
+        return query_params, post_processing

+    def _size(self, query_params, post_processing):
+        # Shrink wrap code around checking if size parameter is set
+        size = query_params['query_size']  # can be None
+
+        pp_size = self._count_post_processing(post_processing)
+        if pp_size is not None:
+            if size is not None:
+                size = min(size, pp_size)
+            else:
+                size = pp_size
+
+        # This can return None
+        return size
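A quick worked case of the shrink wrap (numbers invented):

    # head(10) already folded into the query, tail(3) pending in post-processing:
    #   size = 10, pp_size = 3  ->  _size returns min(10, 3) == 3
    # no head/tail anywhere:
    #   size = None, pp_size = None  ->  _size returns None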
     def info_es(self, buf):
         buf.write("Operations:\n")
         buf.write("\ttasks: {0}\n".format(self._tasks))

-        query, post_processing = self._to_es_query()
-        size, sort_params = Operations._query_to_params(query)
+        query_params, post_processing = self._resolve_tasks()
+        size, sort_params = Operations._query_params_to_size_and_sort(query_params)
         columns = self.get_columns()

         buf.write("\tsize: {0}\n".format(size))
         buf.write("\tsort_params: {0}\n".format(sort_params))
         buf.write("\tcolumns: {0}\n".format(columns))
         buf.write("\tpost_processing: {0}\n".format(post_processing))
eland/query.py (new file, 93 lines)
@@ -0,0 +1,93 @@
+import warnings
+from copy import deepcopy
+
+
+class Query:
+    """
+    Simple class to manage building Elasticsearch queries.
+
+    Specifically, this wraps a bool query (must/must_not clauses) plus any metric aggregations.
+    """
+
+    def __init__(self, query=None):
+        if query is None:
+            self._query = self._query_template()
+            self._aggs = {}
+        else:
+            # Deep copy the incoming query so we can change it
+            self._query = deepcopy(query._query)
+            self._aggs = deepcopy(query._aggs)
+
+    def exists(self, field, must=True):
+        """
+        Add exists query
+        https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-exists-query.html
+        """
+        if must:
+            self._query['bool']['must'].append({'exists': {'field': field}})
+        else:
+            self._query['bool']['must_not'].append({'exists': {'field': field}})
+
+    def ids(self, items, must=True):
+        """
+        Add ids query
+        https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-ids-query.html
+        """
+        if must:
+            self._query['bool']['must'].append({'ids': {'values': items}})
+        else:
+            self._query['bool']['must_not'].append({'ids': {'values': items}})
+
+    def terms(self, field, items, must=True):
+        """
+        Add terms query
+        https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-terms-query.html
+        """
+        if must:
+            self._query['bool']['must'].append({'terms': {field: items}})
+        else:
+            self._query['bool']['must_not'].append({'terms': {field: items}})
+
+    def metric_aggs(self, name, func, field):
+        """
+        Add metric agg e.g.
+
+        "aggs": {
+            "name": {
+                "max": {
+                    "field": "AvgTicketPrice"
+                }
+            }
+        }
+        """
+        agg = {
+            func: {
+                "field": field
+            }
+        }
+        self._aggs[name] = agg
+
+    def to_search_body(self):
+        body = {"query": self._query, "aggs": self._aggs}
+        return body
+
+    def to_count_body(self):
+        if len(self._aggs) > 0:
+            warnings.warn('Requesting count for agg query {}'.format(self))
+        body = {"query": self._query}
+
+        return body
+
+    def __repr__(self):
+        return repr(self.to_search_body())
+
+    @staticmethod
+    def _query_template():
+        template = {
+            "bool": {
+                "must": [],
+                "must_not": []
+            }
+        }
+        return deepcopy(template)
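A minimal usage sketch of the new Query class defined above (field names invented for illustration):

    q = Query()
    q.exists('Carrier')                          # must exist
    q.terms('DestCountry', ['AU'], must=False)   # must_not match
    q.metric_aggs('avg_price', 'avg', 'AvgTicketPrice')

    q.to_search_body()
    # {'query': {'bool': {'must': [{'exists': {'field': 'Carrier'}}],
    #                     'must_not': [{'terms': {'DestCountry': ['AU']}}]}},
    #  'aggs': {'avg_price': {'avg': {'field': 'AvgTicketPrice'}}}}

    q2 = Query(q)  # deep copy: clauses added to q2 do not touch q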
@@ -52,6 +52,17 @@ class ElandQueryCompiler(BaseQueryCompiler):
     columns = property(_get_columns, _set_columns)
     index = property(_get_index)

+    @property
+    def dtypes(self):
+        columns = self._operations.get_columns()
+
+        return self._mappings.dtypes(columns)
+
+    def get_dtype_counts(self):
+        columns = self._operations.get_columns()
+
+        return self._mappings.get_dtype_counts(columns)
+
     # END Index, columns, and dtypes objects

     def _es_results_to_pandas(self, results):

@@ -226,7 +237,25 @@ class ElandQueryCompiler(BaseQueryCompiler):
         index_count: int
             Count of docs where index_field exists
         """
-        return self._operations.to_count(self)
+        return self._operations.index_count(self, self.index.index_field)
+
+    def _index_matches_count(self, items):
+        """
+        Returns
+        -------
+        index_count: int
+            Count of docs where items exist
+        """
+        return self._operations.index_matches_count(self, self.index.index_field, items)
+
+    def _index_matches(self, items):
+        """
+        Returns
+        -------
+        index_count: int
+            List of the items that match
+        """
+        return self._operations.index_matches(self, self.index.index_field, items)

     def copy(self):
         return self.__constructor__(
@@ -295,6 +324,31 @@ class ElandQueryCompiler(BaseQueryCompiler):

         return result

+    def drop(self, index=None, columns=None):
+        result = self.copy()
+
+        # Drop gets all columns and removes drops
+        if columns is not None:
+            # columns is a pandas.Index so we can use pandas drop feature
+            new_columns = self.columns.drop(columns)
+            result._operations.set_columns(new_columns.to_list())
+
+        if index is not None:
+            result._operations.drop_index_values(self, self.index.index_field, index)
+
+        return result
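Assuming DataFrame.drop delegates here (the wrapper itself is not shown in this diff), usage would look roughly like:

    df = df.drop(columns=['Carrier'])  # narrows the columns the operations fetch
    df = df.drop(['1', '2'])           # queues a must_not ids clause on the index field
    # nothing hits Elasticsearch until results are materialised,
    # e.g. via _to_pandas(), count() or info()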
+    def count(self):
+        return self._operations.count(self)
+
+    def mean(self):
+        return self._operations.mean(self)
+
+    def sum(self):
+        return self._operations.sum(self)
+
+    def min(self):
+        return self._operations.min(self)
+
+    def max(self):
+        return self._operations.max(self)
+
     def info_es(self, buf):
         buf.write("index_pattern: {index_pattern}\n".format(index_pattern=self._index_pattern))

@@ -302,3 +356,5 @@ class ElandQueryCompiler(BaseQueryCompiler):
         self._mappings.info_es(buf)
         self._operations.info_es(buf)
+
+    def describe(self):
+        return self._operations.describe(self)
@@ -56,6 +56,18 @@ def assert_pandas_eland_frame_equal(left, right):
     # Use pandas tests to check similarity
     assert_frame_equal(left, right._to_pandas())

+
+def assert_eland_frame_equal(left, right):
+    if not isinstance(left, ed.DataFrame):
+        raise AssertionError("Expected type {exp_type}, found {act_type} instead".format(
+            exp_type='ed.DataFrame', act_type=type(left)))
+
+    if not isinstance(right, ed.DataFrame):
+        raise AssertionError("Expected type {exp_type}, found {act_type} instead".format(
+            exp_type='ed.DataFrame', act_type=type(right)))
+
+    # Use pandas tests to check similarity
+    assert_frame_equal(left._to_pandas(), right._to_pandas())
+
+
 def assert_pandas_eland_series_equal(left, right):
     if not isinstance(left, pd.Series):
@@ -5,9 +5,15 @@ from eland.tests.common import TestData

 class TestDataFrameCount(TestData):

-    def test_to_string1(self):
+    def test_to_count1(self):
-        ed_flights = self.ed_flights()
-        pd_flights = self.pd_flights()
+        pd_ecommerce = self.pd_ecommerce()
+        ed_ecommerce = self.ed_ecommerce()

+        pd_count = pd_ecommerce.count()
+        ed_count = ed_ecommerce.count()
+
+        print(pd_count)
+        print(ed_count)

-        #ed_count = ed_flights.count()
eland/tests/dataframe/test_describe_pytest.py (new file, 40 lines)
@@ -0,0 +1,40 @@
+# File called _pytest for PyCharm compatability
+from io import StringIO
+
+from eland.tests.common import TestData
+
+
+class TestDataFrameInfo(TestData):
+
+    def test_to_describe1(self):
+        pd_flights = self.pd_flights()
+        ed_flights = self.ed_flights()
+
+        pd_describe = pd_flights.describe()
+        ed_describe = ed_flights.describe()
+
+        # TODO - this fails now as ES aggregations are approximate
+        # if ES percentile agg uses
+        # "hdr": {
+        #     "number_of_significant_value_digits": 3
+        # }
+        # this works
+        # assert_almost_equal(pd_flights_describe, ed_flights_describe)
+
+        pd_ecommerce_describe = self.pd_ecommerce().describe()
+        ed_ecommerce_describe = self.ed_ecommerce().describe()
+
+        # We don't compare ecommerce here as the default dtypes in pandas from read_json
+        # don't match the mapping types. This is mainly because the products field is
+        # nested and so can be treated as a multi-field in ES, but not in pandas
+
+    def test_to_describe2(self):
+        pd_flights = self.pd_flights().head()
+        ed_flights = self.ed_flights().head()
+
+        pd_describe = pd_flights.describe()
+        ed_describe = ed_flights.describe()
+
+        print(pd_describe)
+        print(ed_describe)
eland/tests/dataframe/test_drop_pytest.py (new file, 56 lines)
@@ -0,0 +1,56 @@
+# File called _pytest for PyCharm compatability
+import pandas as pd
+import eland as ed
+
+from eland.tests.common import TestData
+from eland.tests.common import (
+    assert_eland_frame_equal,
+    assert_pandas_eland_frame_equal,
+    assert_pandas_eland_series_equal
+)
+
+import numpy as np
+
+
+class TestDataFrameDrop(TestData):
+
+    def test_drop1(self):
+        ed_flights = self.ed_flights()
+        pd_flights = self.pd_flights()
+
+        # ['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID',
+        #  'DestCityName', 'DestCountry', 'DestLocation', 'DestRegion',
+        #  'DestWeather', 'DistanceKilometers', 'DistanceMiles', 'FlightDelay',
+        #  'FlightDelayMin', 'FlightDelayType', 'FlightNum', 'FlightTimeHour',
+        #  'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
+        #  'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather',
+        #  'dayOfWeek', 'timestamp']
+        pd_col0 = pd_flights.drop(['Carrier', 'DestCityName'], axis=1)
+        pd_col1 = pd_flights.drop(columns=['Carrier', 'DestCityName'])
+
+        ed_col0 = ed_flights.drop(['Carrier', 'DestCityName'], axis=1)
+        ed_col1 = ed_flights.drop(columns=['Carrier', 'DestCityName'])
+
+        #assert_pandas_eland_frame_equal(pd_col0, ed_col0)
+        #assert_pandas_eland_frame_equal(pd_col1, ed_col1)
+
+        # Drop rows by index
+        pd_idx0 = pd_flights.drop(['1', '2'])
+        ed_idx0 = ed_flights.drop(['1', '2'])
+
+        print(pd_idx0.info())
+        print(ed_idx0.info())
+
+        assert_pandas_eland_frame_equal(pd_idx0, ed_idx0)
+
+        """
+        #assert_pandas_eland_frame_equal(pd_iloc0, ed_iloc0) # pd_iloc0 is Series
+        assert_pandas_eland_frame_equal(pd_iloc1, ed_iloc1)
+        assert_pandas_eland_frame_equal(pd_iloc2, ed_iloc2)
+        assert_pandas_eland_frame_equal(pd_iloc3, ed_iloc3)
+        assert_pandas_eland_frame_equal(pd_iloc4, ed_iloc4)
+        #assert_pandas_eland_frame_equal(pd_iloc5, ed_iloc5) # pd_iloc5 is numpy_bool
+        assert_pandas_eland_frame_equal(pd_iloc6, ed_iloc6)
+        assert_pandas_eland_frame_equal(pd_iloc7, ed_iloc7)
+        assert_pandas_eland_frame_equal(pd_iloc8, ed_iloc8)
+        assert_pandas_eland_frame_equal(pd_iloc9, ed_iloc9)
+        """
@@ -37,3 +37,19 @@ class TestDataFrameGetItem(TestData):
         pd_flights_OriginAirportID = pd_flights.OriginAirportID

         assert_pandas_eland_series_equal(pd_flights_OriginAirportID, ed_flights_OriginAirportID)
+
+    def test_getitem4(self):
+        ed_flights = self.ed_flights().head(89)
+        pd_flights = self.pd_flights().head(89)
+
+        ed_col0 = ed_flights[['DestCityName', 'DestCountry', 'DestLocation', 'DestRegion']]
+        try:
+            ed_col1 = ed_col0['Carrier']
+        except KeyError:
+            pass
+
+        pd_col1 = pd_flights['DestCountry']
+        ed_col1 = ed_col0['DestCountry']
+
+        assert_pandas_eland_series_equal(pd_col1, ed_col1)
@@ -12,28 +12,6 @@ class TestDataFrameiLoc(TestData):

 class TestDataFrameiLoc(TestData):

-    def test_range(self):
-        columns = ['a', 'b', 'c', 'd', 'e']
-
-        r = pd.RangeIndex(0, 3, 1)
-
-        i = pd.Int64Index([1, 2])
-
-        dates = pd.date_range('1/1/2000', periods=8)
-
-        df = pd.DataFrame(np.random.randn(8, 4), index = dates, columns = ['A', 'B', 'C', 'D'])
-
-        print(df)
-
-        print("STEVE ", df.squeeze())
-
-        ii = slice(None)
-        rr = slice(None)
-
-        print(df.iloc[:, 0:3])
-        print(df.iloc[i, r])
-        print(df.iloc[ii, rr])
-
     def test_iloc1(self):
         ed_flights = self.ed_flights()
         pd_flights = self.pd_flights()
@@ -1,4 +1,5 @@
 # File called _pytest for PyCharm compatability
+from io import StringIO

 from eland.tests.common import TestData

@@ -9,4 +10,16 @@ class TestDataFrameInfo(TestData):
         ed_flights = self.ed_flights()
         pd_flights = self.pd_flights()

-        ed_flights.info()
+        ed_buf = StringIO()
+        pd_buf = StringIO()
+
+        # Ignore memory_usage and first line (class name)
+        ed_flights.info(buf=ed_buf, memory_usage=False)
+        pd_flights.info(buf=pd_buf, memory_usage=False)
+
+        ed_buf_lines = ed_buf.getvalue().split('\n')
+        pd_buf_lines = pd_buf.getvalue().split('\n')
+
+        assert pd_buf_lines[1:] == ed_buf_lines[1:]
+
+        print(self.ed_ecommerce().info())
eland/tests/dataframe/test_metrics_pytest.py (new file, 46 lines)
@@ -0,0 +1,46 @@
+# File called _pytest for PyCharm compatability
+
+from eland.tests.common import TestData
+
+from pandas.util.testing import assert_series_equal
+
+
+class TestDataFrameMean(TestData):
+
+    def test_to_mean(self):
+        pd_flights = self.pd_flights()
+        ed_flights = self.ed_flights()
+
+        pd_mean = pd_flights.mean()
+        ed_mean = ed_flights.mean()
+
+        assert_series_equal(pd_mean, ed_mean)
+
+    def test_to_sum(self):
+        pd_flights = self.pd_flights()
+        ed_flights = self.ed_flights()
+
+        pd_sum = pd_flights.sum(numeric_only=True)
+        ed_sum = ed_flights.sum(numeric_only=True)
+
+        assert_series_equal(pd_sum, ed_sum)
+
+    def test_to_min(self):
+        pd_flights = self.pd_flights()
+        ed_flights = self.ed_flights()
+
+        pd_min = pd_flights.min(numeric_only=True)
+        ed_min = ed_flights.min(numeric_only=True)
+
+        assert_series_equal(pd_min, ed_min)
+
+    def test_to_max(self):
+        pd_flights = self.pd_flights()
+        ed_flights = self.ed_flights()
+
+        pd_max = pd_flights.max(numeric_only=True)
+        ed_max = ed_flights.max(numeric_only=True)
+
+        assert_series_equal(pd_max, ed_max)
eland/tests/dataframe/test_sum_pytest.py (new file, 20 lines)
@@ -0,0 +1,20 @@
+# File called _pytest for PyCharm compatability
+
+from eland.tests.common import TestData
+
+from pandas.util.testing import assert_series_equal
+
+
+class TestDataFrameSum(TestData):
+
+    def test_to_mean1(self):
+        pd_flights = self.pd_flights()
+        ed_flights = self.ed_flights()
+
+        pd_sum = pd_flights.sum(numeric_only=True)
+        ed_sum = ed_flights.sum(numeric_only=True)
+
+        assert_series_equal(pd_sum, ed_sum)
eland/tests/mappings/__init__.py (new file, 0 lines)

eland/tests/mappings/test_dtypes_pytest.py (new file, 44 lines)
@@ -0,0 +1,44 @@
+# File called _pytest for PyCharm compatability
+
+from eland.tests.common import TestData
+
+from pandas.util.testing import assert_series_equal
+
+
+class TestMappingsDtypes(TestData):
+
+    def test_dtypes1(self):
+        ed_flights = self.ed_flights()
+        pd_flights = self.pd_flights()
+
+        pd_dtypes = pd_flights.dtypes
+        ed_dtypes = ed_flights._query_compiler._mappings.dtypes()
+
+        assert_series_equal(pd_dtypes, ed_dtypes)
+
+    def test_dtypes2(self):
+        ed_flights = self.ed_flights()
+        pd_flights = self.pd_flights()[['Carrier', 'AvgTicketPrice', 'Cancelled']]
+
+        pd_dtypes = pd_flights.dtypes
+        ed_dtypes = ed_flights._query_compiler._mappings.dtypes(columns=['Carrier', 'AvgTicketPrice', 'Cancelled'])
+
+        assert_series_equal(pd_dtypes, ed_dtypes)
+
+    def test_get_dtype_counts1(self):
+        ed_flights = self.ed_flights()
+        pd_flights = self.pd_flights()
+
+        pd_dtypes = pd_flights.get_dtype_counts().sort_index()
+        ed_dtypes = ed_flights._query_compiler._mappings.get_dtype_counts().sort_index()
+
+        assert_series_equal(pd_dtypes, ed_dtypes)
+
+    def test_get_dtype_counts2(self):
+        ed_flights = self.ed_flights()
+        pd_flights = self.pd_flights()[['Carrier', 'AvgTicketPrice', 'Cancelled']]
+
+        pd_dtypes = pd_flights.get_dtype_counts().sort_index()
+        ed_dtypes = ed_flights._query_compiler._mappings.\
+            get_dtype_counts(columns=['Carrier', 'AvgTicketPrice', 'Cancelled']).sort_index()
+
+        assert_series_equal(pd_dtypes, ed_dtypes)
eland/tests/query/test_count_pytest.py (new file, 27 lines)
@@ -0,0 +1,27 @@
+# File called _pytest for PyCharm compatability
+
+from eland.tests.common import TestData
+
+from eland import Query
+
+
+class TestQueryCopy(TestData):
+
+    def test_copy(self):
+        q = Query()
+
+        q.exists('field_a')
+        q.exists('field_b', must=False)
+
+        print(q.to_search_body())
+
+        q1 = Query(q)
+
+        q.exists('field_c', must=False)
+        q1.exists('field_c1', must=False)
+
+        print(q.to_search_body())
+        print(q1.to_search_body())
@@ -36,7 +36,7 @@ def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=
     mapping = Mappings._generate_es_mappings(df)

     # If table exists, check if_exists parameter
-    if client.indices().exists(destination_index):
+    if client.index_exists(index=destination_index):
         if if_exists == "fail":
             raise ValueError(
                 "Could not create the index [{0}] because it "

@@ -45,12 +45,12 @@ def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=
                 "'append' or 'replace' data.".format(destination_index)
             )
         elif if_exists == "replace":
-            client.indices().delete(destination_index)
-            client.indices().create(destination_index, mapping)
+            client.index_delete(index=destination_index)
+            client.index_create(index=destination_index, mapping=mapping)
         # elif if_exists == "append":
             # TODO validate mapping is compatible
     else:
-        client.indices().create(destination_index, mapping)
+        client.index_create(index=destination_index, mapping=mapping)

     # Now add data
     actions = []
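For context, a hedged sketch of exercising the reworked wrapper calls end to end, assuming pandas_to_es is exported at package level (connection details and index name invented):

    import pandas as pd
    import eland as ed

    df = pd.DataFrame({'a': [1, 2, 3]})
    # With if_exists='replace', an existing index is dropped and re-created
    # via client.index_delete / client.index_create before bulk indexing.
    ed.pandas_to_es(df, 'localhost', 'my_index', if_exists='replace')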