Adding drop + the ability for operations to have a query

Significant refactor - needs cleanup
This commit is contained in:
Stephen Dodson 2019-07-11 10:11:57 +00:00
parent dc07285aa1
commit d71ce9f50c
23 changed files with 1484 additions and 147 deletions

View File

@ -7,6 +7,7 @@ os.environ["MODIN_BACKEND"] = 'pandas'
from .client import * from .client import *
from .index import * from .index import *
from .mappings import * from .mappings import *
from .query import *
from .operations import * from .operations import *
from .query_compiler import * from .query_compiler import *
from .ndframe import * from .ndframe import *

View File

@ -12,7 +12,16 @@ class Client:
self._es = es._es self._es = es._es
else: else:
self._es = Elasticsearch(es) self._es = Elasticsearch(es)
def index_create(self, **kwargs):
return self._es.indices.create(**kwargs)
def index_delete(self, **kwargs):
return self._es.indices.delete(**kwargs)
def index_exists(self, **kwargs):
return self._es.indices.exists(**kwargs)
def get_mapping(self, **kwargs): def get_mapping(self, **kwargs):
return self._es.indices.get_mapping(**kwargs) return self._es.indices.get_mapping(**kwargs)
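For reference, these new Client helpers are thin pass-throughs to the elasticsearch-py indices client; a minimal sketch of what they forward to (index name is a placeholder):

```python
from elasticsearch import Elasticsearch

es = Elasticsearch()
# what index_create / index_exists / index_delete(**kwargs) delegate to
es.indices.create(index="test_index", ignore=400)
es.indices.exists(index="test_index")
es.indices.delete(index="test_index", ignore=[400, 404])
```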

View File

@ -16,7 +16,7 @@ from eland import Series
class DataFrame(NDFrame): class DataFrame(NDFrame):
# TODO create effectively 2 constructors # This is effectively 2 constructors
# 1. client, index_pattern, columns, index_field # 1. client, index_pattern, columns, index_field
# 2. query_compiler # 2. query_compiler
def __init__(self, def __init__(self,
@ -74,6 +74,22 @@ class DataFrame(NDFrame):
return buf.getvalue() return buf.getvalue()
def count(self):
"""
Count non-NA cells for each column (TODO: support rows).
Counts are based on exists queries against Elasticsearch.
This is inefficient, as it issues one query per field (N queries for N fields).
An alternative would be value_count aggregations, but these have issues:
1. They can only be used with aggregatable fields (e.g. keyword, not text).
2. For list fields they count every value, e.g. tags=['elastic', 'ml'] returns value_count=2
for a single document.
"""
return self._query_compiler.count()
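A rough sketch of the per-column request this results in, written against the low-level elasticsearch-py client directly (index and field names are placeholders, not from this commit):

```python
from elasticsearch import Elasticsearch

es = Elasticsearch()
columns = ["AvgTicketPrice", "Carrier"]   # hypothetical columns
counts = {}
for field in columns:
    # one exists-query _count request per column
    body = {"query": {"bool": {"must": [{"exists": {"field": field}}]}}}
    counts[field] = es.count(index="flights", body=body)["count"]
# counts now maps each column name to its non-null document count
```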
def info_es(self): def info_es(self):
buf = StringIO() buf = StringIO()
@ -81,6 +97,130 @@ class DataFrame(NDFrame):
return buf.getvalue() return buf.getvalue()
def _index_summary(self):
head = self.head(1)._to_pandas().index[0]
tail = self.tail(1)._to_pandas().index[0]
index_summary = ', %s to %s' % (pprint_thing(head),
pprint_thing(tail))
name = "Index"
return '%s: %s entries%s' % (name, len(self), index_summary)
def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
null_counts=None):
"""
Print a concise summary of a DataFrame.
This method prints information about a DataFrame including
the index dtype and column dtypes, non-null values and memory usage.
This copies a lot of code from pandas.DataFrame.info, as it is difficult
to split out the appropriate pieces, and creating a SparseDataFrame gives
incorrect results for types and counts.
"""
if buf is None: # pragma: no cover
buf = sys.stdout
lines = []
lines.append(str(type(self)))
lines.append(self._index_summary())
if len(self.columns) == 0:
lines.append('Empty {name}'.format(name=type(self).__name__))
fmt.buffer_put_lines(buf, lines)
return
cols = self.columns
# hack
if max_cols is None:
max_cols = pd.get_option('display.max_info_columns',
len(self.columns) + 1)
max_rows = pd.get_option('display.max_info_rows', len(self) + 1)
if null_counts is None:
show_counts = ((len(self.columns) <= max_cols) and
(len(self) < max_rows))
else:
show_counts = null_counts
exceeds_info_cols = len(self.columns) > max_cols
# From pandas.DataFrame
def _put_str(s, space):
return '{s}'.format(s=s)[:space].ljust(space)
def _verbose_repr():
lines.append('Data columns (total %d columns):' %
len(self.columns))
space = max(len(pprint_thing(k)) for k in self.columns) + 4
counts = None
tmpl = "{count}{dtype}"
if show_counts:
counts = self.count()
if len(cols) != len(counts): # pragma: no cover
raise AssertionError(
'Columns must equal counts '
'({cols:d} != {counts:d})'.format(
cols=len(cols), counts=len(counts)))
tmpl = "{count} non-null {dtype}"
dtypes = self.dtypes
for i, col in enumerate(self.columns):
dtype = dtypes.iloc[i]
col = pprint_thing(col)
count = ""
if show_counts:
count = counts.iloc[i]
lines.append(_put_str(col, space) + tmpl.format(count=count,
dtype=dtype))
def _non_verbose_repr():
lines.append(self.columns._summary(name='Columns'))
def _sizeof_fmt(num, size_qualifier):
# returns size in human readable format
for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
if num < 1024.0:
return ("{num:3.1f}{size_q} "
"{x}".format(num=num, size_q=size_qualifier, x=x))
num /= 1024.0
return "{num:3.1f}{size_q} {pb}".format(num=num,
size_q=size_qualifier,
pb='PB')
if verbose:
_verbose_repr()
elif verbose is False:  # specifically set to False, not just None
_non_verbose_repr()
else:
if exceeds_info_cols:
_non_verbose_repr()
else:
_verbose_repr()
counts = self.get_dtype_counts()
dtypes = ['{k}({kk:d})'.format(k=k[0], kk=k[1]) for k
in sorted(counts.items())]
lines.append('dtypes: {types}'.format(types=', '.join(dtypes)))
if memory_usage is None:
memory_usage = pd.get_option('display.memory_usage')
if memory_usage:
# append memory usage of df to display
size_qualifier = ''
# TODO - this is different from pd.DataFrame as we shouldn't
# really hold much in memory. For now just approximate with getsizeof + ignore deep
mem_usage = sys.getsizeof(self)
lines.append("memory usage: {mem}\n".format(
mem=_sizeof_fmt(mem_usage, size_qualifier)))
fmt.buffer_put_lines(buf, lines)
def to_string(self, buf=None, columns=None, col_space=None, header=True, def to_string(self, buf=None, columns=None, col_space=None, header=True,
@ -153,7 +293,7 @@ class DataFrame(NDFrame):
def _getitem_column(self, key): def _getitem_column(self, key):
if key not in self.columns: if key not in self.columns:
raise KeyError("{}".format(key)) raise KeyError("Requested column is not in the DataFrame {}".format(key))
s = self._reduce_dimension(self._query_compiler.getitem_column_array([key])) s = self._reduce_dimension(self._query_compiler.getitem_column_array([key]))
s._parent = self s._parent = self
return s return s
@ -196,6 +336,17 @@ class DataFrame(NDFrame):
query_compiler=self._query_compiler.getitem_column_array(key) query_compiler=self._query_compiler.getitem_column_array(key)
) )
def _create_or_update_from_compiler(self, new_query_compiler, inplace=False):
"""Returns or updates a DataFrame given new query_compiler"""
assert (
isinstance(new_query_compiler, type(self._query_compiler))
or type(new_query_compiler) in self._query_compiler.__class__.__bases__
), "Invalid Query Compiler object: {}".format(type(new_query_compiler))
if not inplace:
return DataFrame(query_compiler=new_query_compiler)
else:
self._query_compiler = new_query_compiler
def _reduce_dimension(self, query_compiler): def _reduce_dimension(self, query_compiler):
return Series(query_compiler=query_compiler) return Series(query_compiler=query_compiler)

View File

@ -12,6 +12,428 @@ the method in the left column. ``Y`` stands for yes, ``N`` stands for no, ``P``
for partial (meaning some parameters may not be supported yet), and ``D`` stands for for partial (meaning some parameters may not be supported yet), and ``D`` stands for
default to pandas. default to pandas.
https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv represents a prioritised list.
+-------------------------+-------+------------------------------------------------+
| Method | Count | Notes |
+-------------------------+-------+------------------------------------------------+
| pd.read_csv | 1422 | Not implemented; ed.read_es implemented instead |
+-------------------------+-------+------------------------------------------------+
| pd.DataFrame | 886 | y |
+-------------------------+-------+------------------------------------------------+
| df.append | 792 | Not implemented |
+-------------------------+-------+------------------------------------------------+
| df.mean | 783 | y |
+-------------------------+-------+------------------------------------------------+
| df.head | 783 | y |
+-------------------------+-------+------------------------------------------------+
| df.drop | 761 | |
+-------------------------+-------+------------------------------------------------+
| df.sum | 755 | y |
+-------------------------+-------+------------------------------------------------+
| df.to_csv | 693 | |
+-------------------------+-------+------------------------------------------------+
| df.get | 669 | |
+-------------------------+-------+------------------------------------------------+
| df.mode | 653 | |
+-------------------------+-------+------------------------------------------------+
| df.astype | 649 | |
+-------------------------+-------+------------------------------------------------+
| df.sub | 637 | |
+-------------------------+-------+------------------------------------------------+
| pd.concat | 582 | |
+-------------------------+-------+------------------------------------------------+
| df.apply | 577 | |
+-------------------------+-------+------------------------------------------------+
| df.groupby | 557 | |
+-------------------------+-------+------------------------------------------------+
| df.join | 544 | |
+-------------------------+-------+------------------------------------------------+
| df.fillna | 543 | |
+-------------------------+-------+------------------------------------------------+
| df.max | 508 | |
+-------------------------+-------+------------------------------------------------+
| df.reset_index | 434 | |
+-------------------------+-------+------------------------------------------------+
| pd.unique | 433 | |
+-------------------------+-------+------------------------------------------------+
| df.le | 405 | |
+-------------------------+-------+------------------------------------------------+
| df.count | 399 | |
+-------------------------+-------+------------------------------------------------+
| pd.value_counts | 397 | |
+-------------------------+-------+------------------------------------------------+
| df.sort_values | 390 | |
+-------------------------+-------+------------------------------------------------+
| df.transform | 387 | |
+-------------------------+-------+------------------------------------------------+
| df.merge | 376 | |
+-------------------------+-------+------------------------------------------------+
| df.add | 346 | |
+-------------------------+-------+------------------------------------------------+
| df.isnull | 338 | |
+-------------------------+-------+------------------------------------------------+
| df.min | 321 | |
+-------------------------+-------+------------------------------------------------+
| df.copy | 314 | |
+-------------------------+-------+------------------------------------------------+
| df.replace | 300 | |
+-------------------------+-------+------------------------------------------------+
| df.std | 261 | |
+-------------------------+-------+------------------------------------------------+
| df.hist | 246 | |
+-------------------------+-------+------------------------------------------------+
| df.filter | 234 | |
+-------------------------+-------+------------------------------------------------+
| df.describe | 220 | |
+-------------------------+-------+------------------------------------------------+
| df.ne | 218 | |
+-------------------------+-------+------------------------------------------------+
| df.corr | 217 | |
+-------------------------+-------+------------------------------------------------+
| df.median | 217 | |
+-------------------------+-------+------------------------------------------------+
| df.items | 212 | |
+-------------------------+-------+------------------------------------------------+
| pd.to_datetime | 204 | |
+-------------------------+-------+------------------------------------------------+
| df.isin | 203 | |
+-------------------------+-------+------------------------------------------------+
| df.dropna | 195 | |
+-------------------------+-------+------------------------------------------------+
| pd.get_dummies | 190 | |
+-------------------------+-------+------------------------------------------------+
| df.rename | 185 | |
+-------------------------+-------+------------------------------------------------+
| df.info | 180 | |
+-------------------------+-------+------------------------------------------------+
| df.set_index | 166 | |
+-------------------------+-------+------------------------------------------------+
| df.keys | 159 | |
+-------------------------+-------+------------------------------------------------+
| df.sample | 155 | |
+-------------------------+-------+------------------------------------------------+
| df.agg | 140 | |
+-------------------------+-------+------------------------------------------------+
| df.where | 138 | |
+-------------------------+-------+------------------------------------------------+
| df.boxplot | 134 | |
+-------------------------+-------+------------------------------------------------+
| df.clip | 116 | |
+-------------------------+-------+------------------------------------------------+
| df.round | 116 | |
+-------------------------+-------+------------------------------------------------+
| df.abs | 101 | |
+-------------------------+-------+------------------------------------------------+
| df.stack | 97 | |
+-------------------------+-------+------------------------------------------------+
| df.tail | 94 | |
+-------------------------+-------+------------------------------------------------+
| df.update | 92 | |
+-------------------------+-------+------------------------------------------------+
| df.iterrows | 90 | |
+-------------------------+-------+------------------------------------------------+
| df.transpose | 87 | |
+-------------------------+-------+------------------------------------------------+
| df.any | 85 | |
+-------------------------+-------+------------------------------------------------+
| df.pipe | 80 | |
+-------------------------+-------+------------------------------------------------+
| pd.eval | 73 | |
+-------------------------+-------+------------------------------------------------+
| df.eval | 73 | |
+-------------------------+-------+------------------------------------------------+
| pd.read_json | 72 | |
+-------------------------+-------+------------------------------------------------+
| df.nunique | 70 | |
+-------------------------+-------+------------------------------------------------+
| df.pivot | 70 | |
+-------------------------+-------+------------------------------------------------+
| df.select | 68 | |
+-------------------------+-------+------------------------------------------------+
| df.as_matrix | 67 | |
+-------------------------+-------+------------------------------------------------+
| df.notnull | 66 | |
+-------------------------+-------+------------------------------------------------+
| df.cumsum | 66 | |
+-------------------------+-------+------------------------------------------------+
| df.prod | 64 | |
+-------------------------+-------+------------------------------------------------+
| df.unstack | 64 | |
+-------------------------+-------+------------------------------------------------+
| df.drop_duplicates | 63 | |
+-------------------------+-------+------------------------------------------------+
| df.div | 63 | |
+-------------------------+-------+------------------------------------------------+
| pd.crosstab | 59 | |
+-------------------------+-------+------------------------------------------------+
| df.select_dtypes | 57 | |
+-------------------------+-------+------------------------------------------------+
| df.pow | 56 | |
+-------------------------+-------+------------------------------------------------+
| df.sort_index | 56 | |
+-------------------------+-------+------------------------------------------------+
| df.product | 52 | |
+-------------------------+-------+------------------------------------------------+
| df.isna | 51 | |
+-------------------------+-------+------------------------------------------------+
| df.dot | 46 | |
+-------------------------+-------+------------------------------------------------+
| pd.cut | 45 | |
+-------------------------+-------+------------------------------------------------+
| df.bool | 44 | |
+-------------------------+-------+------------------------------------------------+
| df.to_dict | 44 | |
+-------------------------+-------+------------------------------------------------+
| df.diff | 44 | |
+-------------------------+-------+------------------------------------------------+
| df.insert | 44 | |
+-------------------------+-------+------------------------------------------------+
| df.pop | 44 | |
+-------------------------+-------+------------------------------------------------+
| df.query | 43 | |
+-------------------------+-------+------------------------------------------------+
| df.var | 43 | |
+-------------------------+-------+------------------------------------------------+
| df.__init__ | 41 | |
+-------------------------+-------+------------------------------------------------+
| pd.to_numeric | 39 | |
+-------------------------+-------+------------------------------------------------+
| df.squeeze | 39 | |
+-------------------------+-------+------------------------------------------------+
| df.ge | 37 | |
+-------------------------+-------+------------------------------------------------+
| df.quantile | 37 | |
+-------------------------+-------+------------------------------------------------+
| df.reindex | 37 | |
+-------------------------+-------+------------------------------------------------+
| df.rolling | 35 | |
+-------------------------+-------+------------------------------------------------+
| pd.factorize | 32 | |
+-------------------------+-------+------------------------------------------------+
| pd.melt | 31 | |
+-------------------------+-------+------------------------------------------------+
| df.melt | 31 | |
+-------------------------+-------+------------------------------------------------+
| df.rank | 31 | |
+-------------------------+-------+------------------------------------------------+
| pd.read_table | 30 | |
+-------------------------+-------+------------------------------------------------+
| pd.pivot_table | 30 | |
+-------------------------+-------+------------------------------------------------+
| df.idxmax | 30 | |
+-------------------------+-------+------------------------------------------------+
| pd.test | 29 | |
+-------------------------+-------+------------------------------------------------+
| df.iteritems | 29 | |
+-------------------------+-------+------------------------------------------------+
| df.shift | 28 | |
+-------------------------+-------+------------------------------------------------+
| df.mul | 28 | |
+-------------------------+-------+------------------------------------------------+
| pd.qcut | 25 | |
+-------------------------+-------+------------------------------------------------+
| df.set_value | 25 | |
+-------------------------+-------+------------------------------------------------+
| df.all | 24 | |
+-------------------------+-------+------------------------------------------------+
| df.skew | 24 | |
+-------------------------+-------+------------------------------------------------+
| df.aggregate | 23 | |
+-------------------------+-------+------------------------------------------------+
| pd.match | 22 | |
+-------------------------+-------+------------------------------------------------+
| df.nlargest | 22 | |
+-------------------------+-------+------------------------------------------------+
| df.multiply | 21 | |
+-------------------------+-------+------------------------------------------------+
| df.set_axis | 19 | |
+-------------------------+-------+------------------------------------------------+
| df.eq | 18 | |
+-------------------------+-------+------------------------------------------------+
| df.resample | 18 | |
+-------------------------+-------+------------------------------------------------+
| pd.read_sql | 17 | |
+-------------------------+-------+------------------------------------------------+
| df.duplicated | 16 | |
+-------------------------+-------+------------------------------------------------+
| pd.date_range | 16 | |
+-------------------------+-------+------------------------------------------------+
| df.interpolate | 15 | |
+-------------------------+-------+------------------------------------------------+
| df.memory_usage | 15 | |
+-------------------------+-------+------------------------------------------------+
| df.divide | 14 | |
+-------------------------+-------+------------------------------------------------+
| df.cov | 13 | |
+-------------------------+-------+------------------------------------------------+
| df.assign | 12 | |
+-------------------------+-------+------------------------------------------------+
| df.subtract | 12 | |
+-------------------------+-------+------------------------------------------------+
| pd.read_pickle | 11 | |
+-------------------------+-------+------------------------------------------------+
| df.applymap | 11 | |
+-------------------------+-------+------------------------------------------------+
| df.first | 11 | |
+-------------------------+-------+------------------------------------------------+
| df.kurt | 10 | |
+-------------------------+-------+------------------------------------------------+
| df.truncate | 10 | |
+-------------------------+-------+------------------------------------------------+
| df.get_value | 9 | |
+-------------------------+-------+------------------------------------------------+
| pd.read_hdf | 9 | |
+-------------------------+-------+------------------------------------------------+
| df.to_html | 9 | |
+-------------------------+-------+------------------------------------------------+
| pd.read_sql_query | 9 | |
+-------------------------+-------+------------------------------------------------+
| df.take | 8 | |
+-------------------------+-------+------------------------------------------------+
| df.to_pickle | 7 | |
+-------------------------+-------+------------------------------------------------+
| df.itertuples | 7 | |
+-------------------------+-------+------------------------------------------------+
| df.to_string | 7 | |
+-------------------------+-------+------------------------------------------------+
| df.last | 7 | |
+-------------------------+-------+------------------------------------------------+
| df.sem | 7 | |
+-------------------------+-------+------------------------------------------------+
| pd.to_pickle | 7 | |
+-------------------------+-------+------------------------------------------------+
| df.to_json | 7 | |
+-------------------------+-------+------------------------------------------------+
| df.idxmin | 7 | |
+-------------------------+-------+------------------------------------------------+
| df.xs | 6 | |
+-------------------------+-------+------------------------------------------------+
| df.combine | 6 | |
+-------------------------+-------+------------------------------------------------+
| pd.rolling_mean | 6 | |
+-------------------------+-------+------------------------------------------------+
| df.to_period | 6 | |
+-------------------------+-------+------------------------------------------------+
| df.convert_objects | 5 | |
+-------------------------+-------+------------------------------------------------+
| df.mask | 4 | |
+-------------------------+-------+------------------------------------------------+
| df.pct_change | 4 | |
+-------------------------+-------+------------------------------------------------+
| df.add_prefix | 4 | |
+-------------------------+-------+------------------------------------------------+
| pd.read_excel | 4 | |
+-------------------------+-------+------------------------------------------------+
| pd.rolling_std | 3 | |
+-------------------------+-------+------------------------------------------------+
| df.to_records | 3 | |
+-------------------------+-------+------------------------------------------------+
| df.corrwith | 3 | |
+-------------------------+-------+------------------------------------------------+
| df.swapaxes | 3 | |
+-------------------------+-------+------------------------------------------------+
| df.__iter__ | 3 | |
+-------------------------+-------+------------------------------------------------+
| df.to_sql | 3 | |
+-------------------------+-------+------------------------------------------------+
| pd.read_feather | 3 | |
+-------------------------+-------+------------------------------------------------+
| df.to_feather | 3 | |
+-------------------------+-------+------------------------------------------------+
| df.__len__ | 3 | |
+-------------------------+-------+------------------------------------------------+
| df.kurtosis | 3 | |
+-------------------------+-------+------------------------------------------------+
| df.mod | 2 | |
+-------------------------+-------+------------------------------------------------+
| df.to_sparse | 2 | |
+-------------------------+-------+------------------------------------------------+
| df.get_values | 2 | |
+-------------------------+-------+------------------------------------------------+
| df.__eq__ | 2 | |
+-------------------------+-------+------------------------------------------------+
| pd.bdate_range | 2 | |
+-------------------------+-------+------------------------------------------------+
| df.get_dtype_counts | 2 | |
+-------------------------+-------+------------------------------------------------+
| df.combine_first | 2 | |
+-------------------------+-------+------------------------------------------------+
| df._get_numeric_data | 2 | |
+-------------------------+-------+------------------------------------------------+
| df.nsmallest | 2 | |
+-------------------------+-------+------------------------------------------------+
| pd.scatter_matrix | 2 | |
+-------------------------+-------+------------------------------------------------+
| df.rename_axis | 2 | |
+-------------------------+-------+------------------------------------------------+
| df.__setstate__ | 2 | |
+-------------------------+-------+------------------------------------------------+
| df.cumprod | 2 | |
+-------------------------+-------+------------------------------------------------+
| df.__getstate__ | 2 | |
+-------------------------+-------+------------------------------------------------+
| df.equals | 2 | |
+-------------------------+-------+------------------------------------------------+
| df.__getitem__ | 2 | |
+-------------------------+-------+------------------------------------------------+
| df.clip_upper | 2 | |
+-------------------------+-------+------------------------------------------------+
| df.floordiv | 2 | |
+-------------------------+-------+------------------------------------------------+
| df.to_excel | 2 | |
+-------------------------+-------+------------------------------------------------+
| df.reindex_axis | 1 | |
+-------------------------+-------+------------------------------------------------+
| pd.to_timedelta | 1 | |
+-------------------------+-------+------------------------------------------------+
| df.ewm | 1 | |
+-------------------------+-------+------------------------------------------------+
| df.tz_localize | 1 | |
+-------------------------+-------+------------------------------------------------+
| df.tz_convert | 1 | |
+-------------------------+-------+------------------------------------------------+
| df.to_hdf | 1 | |
+-------------------------+-------+------------------------------------------------+
| df.lookup | 1 | |
+-------------------------+-------+------------------------------------------------+
| pd.merge_ordered | 1 | |
+-------------------------+-------+------------------------------------------------+
| df.swaplevel | 1 | |
+-------------------------+-------+------------------------------------------------+
| df.first_valid_index | 1 | |
+-------------------------+-------+------------------------------------------------+
| df.lt | 1 | |
+-------------------------+-------+------------------------------------------------+
| df.add_suffix | 1 | |
+-------------------------+-------+------------------------------------------------+
| pd.rolling_median | 1 | |
+-------------------------+-------+------------------------------------------------+
| df.to_dense | 1 | |
+-------------------------+-------+------------------------------------------------+
| df.mad | 1 | |
+-------------------------+-------+------------------------------------------------+
| df.align | 1 | |
+-------------------------+-------+------------------------------------------------+
| df.__copy__ | 1 | |
+-------------------------+-------+------------------------------------------------+
| pd.set_eng_float_format | 1 | |
+-------------------------+-------+------------------------------------------------+
+---------------------------+---------------------------------+----------------------------------------------------+ +---------------------------+---------------------------------+----------------------------------------------------+
| DataFrame method | Eland Implementation? (Y/N/P/D) | Notes for Current implementation | | DataFrame method | Eland Implementation? (Y/N/P/D) | Notes for Current implementation |
+---------------------------+---------------------------------+----------------------------------------------------+ +---------------------------+---------------------------------+----------------------------------------------------+

View File

@ -53,6 +53,7 @@ class Index:
# Make iterable # Make iterable
def __next__(self): def __next__(self):
# TODO resolve this hack to make this 'iterable' # TODO resolve this hack to make this 'iterable'
print("In Index.__next__")
raise StopIteration() raise StopIteration()
def __iter__(self): def __iter__(self):

View File

@ -1,8 +1,9 @@
import warnings import warnings
import pandas as pd import pandas as pd
from pandas.core.dtypes.common import (is_float_dtype, is_bool_dtype, is_integer_dtype, is_datetime_or_timedelta_dtype,
is_string_dtype)
from pandas.core.dtypes.common import (is_float_dtype, is_bool_dtype, is_integer_dtype, is_datetime_or_timedelta_dtype, is_string_dtype)
class Mappings: class Mappings:
""" """
@ -33,8 +34,7 @@ class Mappings:
def __init__(self, def __init__(self,
client=None, client=None,
index_pattern=None, index_pattern=None,
mappings=None, mappings=None):
columns=None):
""" """
Parameters Parameters
---------- ----------
@ -48,9 +48,6 @@ class Mappings:
mappings: Mappings mappings: Mappings
Object to copy Object to copy
columns: list of str
Columns to copy
""" """
if (client is not None) and (index_pattern is not None): if (client is not None) and (index_pattern is not None):
get_mapping = client.get_mapping(index=index_pattern) get_mapping = client.get_mapping(index=index_pattern)
@ -203,11 +200,11 @@ class Mappings:
if 'non_aggregatable_indices' in vv: if 'non_aggregatable_indices' in vv:
warnings.warn("Field {} has conflicting aggregatable fields across indexes {}", warnings.warn("Field {} has conflicting aggregatable fields across indexes {}",
format(field_name, vv['non_aggregatable_indices']), format(field, vv['non_aggregatable_indices']),
UserWarning) UserWarning)
if 'non_searchable_indices' in vv: if 'non_searchable_indices' in vv:
warnings.warn("Field {} has conflicting searchable fields across indexes {}", warnings.warn("Field {} has conflicting searchable fields across indexes {}",
format(field_name, vv['non_searchable_indices']), format(field, vv['non_searchable_indices']),
UserWarning) UserWarning)
capability_matrix_df = pd.DataFrame.from_dict(capability_matrix, orient='index', columns=columns) capability_matrix_df = pd.DataFrame.from_dict(capability_matrix, orient='index', columns=columns)
@ -406,16 +403,23 @@ class Mappings:
return is_source_field return is_source_field
def numeric_source_fields(self): def numeric_source_fields(self, columns):
""" """
Returns Returns
------- -------
numeric_source_fields: list of str numeric_source_fields: list of str
List of source fields where pd_dtype == (int64 or float64) List of source fields where pd_dtype == (int64 or float64 or bool)
""" """
return self._mappings_capabilities[(self._mappings_capabilities._source == True) & if columns is not None:
((self._mappings_capabilities.pd_dtype == 'int64') | return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
(self._mappings_capabilities.pd_dtype == 'float64'))].index.tolist() ((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64') |
(self._mappings_capabilities.pd_dtype == 'bool'))].loc[columns].index.tolist()
else:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64') |
(self._mappings_capabilities.pd_dtype == 'bool'))].index.tolist()
def source_fields(self): def source_fields(self):
""" """
@ -435,16 +439,20 @@ class Mappings:
""" """
return len(self.source_fields()) return len(self.source_fields())
def dtypes(self): def dtypes(self, columns=None):
""" """
Returns Returns
------- -------
dtypes: pd.Series dtypes: pd.Series
Source field name + pd_dtype Source field name + pd_dtype
""" """
if columns is not None:
return pd.Series(
{key: self._source_field_pd_dtypes[key] for key in columns})
return pd.Series(self._source_field_pd_dtypes) return pd.Series(self._source_field_pd_dtypes)
def get_dtype_counts(self): def get_dtype_counts(self, columns=None):
""" """
Return counts of unique dtypes in this object. Return counts of unique dtypes in this object.
@ -453,10 +461,17 @@ class Mappings:
get_dtype_counts : Series get_dtype_counts : Series
Series with the count of columns with each dtype. Series with the count of columns with each dtype.
""" """
return pd.Series(self._mappings_capabilities[self._mappings_capabilities._source == True].groupby('pd_dtype')[
'_source'].count().to_dict()) if columns is not None:
return pd.Series(self._mappings_capabilities[self._mappings_capabilities._source == True]
.loc[columns]
.groupby('pd_dtype')['_source']
.count().to_dict())
return pd.Series(self._mappings_capabilities[self._mappings_capabilities._source == True]
.groupby('pd_dtype')['_source']
.count().to_dict())
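For illustration, a small sketch of how a columns restriction narrows the dtype helpers; the field names and dtype map below are made up:

```python
import pandas as pd

# _source_field_pd_dtypes is built from the index mapping; values here are made up
source_field_pd_dtypes = {"AvgTicketPrice": "float64", "Carrier": "object", "Cancelled": "bool"}

columns = ["AvgTicketPrice", "Cancelled"]
dtypes = pd.Series({key: source_field_pd_dtypes[key] for key in columns})
# AvgTicketPrice    float64
# Cancelled            bool

dtype_counts = dtypes.value_counts()
# float64    1
# bool       1
```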
def info_es(self, buf): def info_es(self, buf):
buf.write("Mappings:\n") buf.write("Mappings:\n")
buf.write("\tcapabilities: {0}\n".format(self._mappings_capabilities)) buf.write("\tcapabilities: {0}\n".format(self._mappings_capabilities))

View File

@ -23,8 +23,13 @@ only Elasticsearch aggregatable fields can be aggregated or grouped.
""" """
import sys
import pandas as pd
from modin.pandas.base import BasePandasDataset from modin.pandas.base import BasePandasDataset
from modin.pandas.indexing import _iLocIndexer from modin.pandas.indexing import _iLocIndexer
from pandas.util._validators import validate_bool_kwarg
from pandas.core.dtypes.common import is_list_like
from eland import ElandQueryCompiler from eland import ElandQueryCompiler
@ -52,12 +57,18 @@ class NDFrame(BasePandasDataset):
index_field=index_field) index_field=index_field)
self._query_compiler = query_compiler self._query_compiler = query_compiler
def _get_index(self): def _get_index(self):
return self._query_compiler.index return self._query_compiler.index
index = property(_get_index) index = property(_get_index)
@property
def dtypes(self):
return self._query_compiler.dtypes
def get_dtype_counts(self):
return self._query_compiler.get_dtype_counts()
def _build_repr_df(self, num_rows, num_cols): def _build_repr_df(self, num_rows, num_cols):
# Overriden version of BasePandasDataset._build_repr_df # Overriden version of BasePandasDataset._build_repr_df
# to avoid issues with concat # to avoid issues with concat
@ -91,6 +102,10 @@ class NDFrame(BasePandasDataset):
return self[key] return self[key]
raise e raise e
def __sizeof__(self):
# Don't default to pandas, just return approximation TODO - make this more accurate
return sys.getsizeof(self._query_compiler)
@property @property
def iloc(self): def iloc(self):
"""Purely integer-location based indexing for selection by position. """Purely integer-location based indexing for selection by position.
@ -101,3 +116,117 @@ class NDFrame(BasePandasDataset):
def info_es(self, buf): def info_es(self, buf):
self._query_compiler.info_es(buf) self._query_compiler.info_es(buf)
def drop(
self,
labels=None,
axis=0,
index=None,
columns=None,
level=None,
inplace=False,
errors="raise",
):
"""Return new object with labels in requested axis removed.
Args:
labels: Index or column labels to drop.
axis: Whether to drop labels from the index (0 / 'index') or
columns (1 / 'columns').
index, columns: Alternative to specifying axis (labels, axis=1 is
equivalent to columns=labels).
level: For MultiIndex
inplace: If True, do operation inplace and return None.
errors: If 'ignore', suppress error and only existing labels are
dropped.
Returns:
dropped : type of caller
(derived from modin.base.BasePandasDataset)
"""
# Level not supported
if level is not None:
raise NotImplementedError("level not supported {}".format(level))
inplace = validate_bool_kwarg(inplace, "inplace")
if labels is not None:
if index is not None or columns is not None:
raise ValueError("Cannot specify both 'labels' and 'index'/'columns'")
axis = pd.DataFrame()._get_axis_name(axis)
axes = {axis: labels}
elif index is not None or columns is not None:
axes, _ = pd.DataFrame()._construct_axes_from_arguments(
(index, columns), {}
)
else:
raise ValueError(
"Need to specify at least one of 'labels', 'index' or 'columns'"
)
# TODO Clean up this error checking
if "index" not in axes:
axes["index"] = None
elif axes["index"] is not None:
if not is_list_like(axes["index"]):
axes["index"] = [axes["index"]]
if errors == "raise":
# Check if axes['index'] values exists in index
count = self._query_compiler._index_matches_count(axes["index"])
if count != len(axes["index"]):
raise ValueError(
"number of labels {} != {}; not all labels are contained in the axis".format(count, len(axes["index"]))
)
else:
"""
axes["index"] = self._query_compiler.index_matches(axes["index"])
# If the length is zero, we will just do nothing
if not len(axes["index"]):
axes["index"] = None
"""
raise NotImplementedError()
if "columns" not in axes:
axes["columns"] = None
elif axes["columns"] is not None:
if not is_list_like(axes["columns"]):
axes["columns"] = [axes["columns"]]
if errors == "raise":
non_existent = [
obj for obj in axes["columns"] if obj not in self.columns
]
if len(non_existent):
raise ValueError(
"labels {} not contained in axis".format(non_existent)
)
else:
axes["columns"] = [
obj for obj in axes["columns"] if obj in self.columns
]
# If the length is zero, we will just do nothing
if not len(axes["columns"]):
axes["columns"] = None
new_query_compiler = self._query_compiler.drop(
index=axes["index"], columns=axes["columns"]
)
return self._create_or_update_from_compiler(new_query_compiler, inplace)
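A hypothetical usage sketch of the new drop() semantics (connection details, index pattern and labels are placeholders):

```python
import eland as ed

# hypothetical connection and index pattern
df = ed.DataFrame('localhost', 'flights')

# drop columns
df2 = df.drop(columns=['AvgTicketPrice', 'Cancelled'])

# drop rows by index value (with the default _id index field this becomes a must_not ids query)
df3 = df.drop(index=['1', '2', '3'])

# labels + axis form, equivalent to columns=['Carrier']
df4 = df.drop(labels=['Carrier'], axis=1)
```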
# TODO implement arguments
def mean(self):
return self._query_compiler.mean()
def sum(self, numeric_only=True):
if numeric_only == False:
raise NotImplementedError("Only sum of numeric fields is implemented")
return self._query_compiler.sum()
def min(self, numeric_only=True):
if numeric_only == False:
raise NotImplementedError("Only sum of numeric fields is implemented")
return self._query_compiler.min()
def max(self, numeric_only=True):
if numeric_only == False:
raise NotImplementedError("Only sum of numeric fields is implemented")
return self._query_compiler.max()
def describe(self):
return self._query_compiler.describe()
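And a hypothetical sketch of the new aggregation entry points; each call is evaluated server-side in Elasticsearch rather than on local data:

```python
import eland as ed

df = ed.DataFrame('localhost', 'flights')   # hypothetical connection / index pattern

df.count()      # per-column non-null counts (one _count request per field)
df.mean()       # avg aggregation over numeric source fields
df.sum()        # sum aggregation
df.min()        # min aggregation
df.max()        # max aggregation
df.describe()   # extended_stats + percentiles per numeric field
```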

View File

@ -1,9 +1,11 @@
import copy
from enum import Enum from enum import Enum
from pandas.core.indexes.numeric import Int64Index import pandas as pd
from pandas.core.indexes.range import RangeIndex from elasticsearch_dsl import Search
import copy from eland import Index
from eland import Query
class Operations: class Operations:
@ -16,17 +18,6 @@ class Operations:
- a query to filter the results (e.g. df.A > 10) - a query to filter the results (e.g. df.A > 10)
This is maintained as a 'task graph' (inspired by dask) This is maintained as a 'task graph' (inspired by dask)
A task graph is a dictionary mapping keys to computations:
A key is any hashable value that is not a task:
```
{'x': 1,
'y': 2,
'z': (add, 'x', 'y'),
'w': (sum, ['x', 'y', 'z']),
'v': [(sum, ['w', 'z']), 2]}
```
(see https://docs.dask.org/en/latest/spec.html) (see https://docs.dask.org/en/latest/spec.html)
""" """
@ -54,7 +45,6 @@ class Operations:
return Operations.SortOrder.DESC return Operations.SortOrder.DESC
def __init__(self, tasks=None): def __init__(self, tasks=None):
if tasks == None: if tasks == None:
self._tasks = [] self._tasks = []
@ -84,6 +74,11 @@ class Operations:
# TODO - validate we are setting columns to a subset of last columns? # TODO - validate we are setting columns to a subset of last columns?
task = ('columns', columns) task = ('columns', columns)
self._tasks.append(task) self._tasks.append(task)
# Iterate backwards through task list looking for last 'columns' task
for task in reversed(self._tasks):
if task[0] == 'columns':
return task[1]
return None
def get_columns(self): def get_columns(self):
# Iterate backwards through task list looking for last 'columns' task # Iterate backwards through task list looking for last 'columns' task
@ -95,10 +90,132 @@ class Operations:
def __repr__(self): def __repr__(self):
return repr(self._tasks) return repr(self._tasks)
def to_pandas(self, query_compiler): def count(self, query_compiler):
query, post_processing = self._to_es_query() query_params, post_processing = self._resolve_tasks()
size, sort_params = Operations._query_to_params(query) # Elasticsearch _count is very efficient, so it is used to return results here. This means that
# DataFrames that have a restricted size or sort params will not return valid results (_count doesn't support size).
# Longer term we may fall back to pandas, but this may result in loading the entire index into memory.
if self._size(query_params, post_processing) is not None:
raise NotImplementedError("Requesting count with additional query and processing parameters "
"not supported {0} {1}"
.format(query_params, post_processing))
# Only return requested columns
fields = query_compiler.columns
counts = {}
for field in fields:
body = Query(query_params['query'])
body.exists(field, must=True)
field_exists_count = query_compiler._client.count(index=query_compiler._index_pattern,
body=body.to_count_body())
counts[field] = field_exists_count
return pd.Series(data=counts, index=fields)
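Assuming the new Query class serialises to a standard bool query (an assumption; its implementation lives elsewhere in this commit), the per-field _count body looks roughly like:

```python
# illustrative request body for one column; exact serialisation depends on Query.to_count_body()
count_body = {
    "query": {
        "bool": {
            "must": [
                {"exists": {"field": "AvgTicketPrice"}}
            ]
        }
    }
}
# issued once per column via client.count(index=index_pattern, body=count_body)
```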
def mean(self, query_compiler):
return self._metric_aggs(query_compiler, 'avg')
def sum(self, query_compiler):
return self._metric_aggs(query_compiler, 'sum')
def max(self, query_compiler):
return self._metric_aggs(query_compiler, 'max')
def min(self, query_compiler):
return self._metric_aggs(query_compiler, 'min')
def _metric_aggs(self, query_compiler, func):
query_params, post_processing = self._resolve_tasks()
size = self._size(query_params, post_processing)
if size is not None:
raise NotImplementedError("Can not count field matches if size is set {}".format(size))
columns = self.get_columns()
numeric_source_fields = query_compiler._mappings.numeric_source_fields(columns)
body = Query(query_params['query'])
for field in numeric_source_fields:
body.metric_aggs(field, func, field)
response = query_compiler._client.search(
index=query_compiler._index_pattern,
size=0,
body=body.to_search_body())
# Results are of the form
# "aggregations" : {
# "AvgTicketPrice" : {
# "value" : 628.2536888148849
# }
# }
results = {}
for field in numeric_source_fields:
results[field] = response['aggregations'][field]['value']
s = pd.Series(data=results, index=numeric_source_fields)
return s
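An illustrative search body for func='avg' over two hypothetical numeric fields; the exact structure produced by Query.metric_aggs()/to_search_body() may differ:

```python
# with size=0 only the aggregations are returned
metric_body = {
    "aggs": {
        "AvgTicketPrice": {"avg": {"field": "AvgTicketPrice"}},
        "FlightDelayMin": {"avg": {"field": "FlightDelayMin"}},
    }
}
# response['aggregations']['AvgTicketPrice']['value'] -> e.g. 628.2536888148849
```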
def describe(self, query_compiler):
query_params, post_processing = self._resolve_tasks()
size = self._size(query_params, post_processing)
if size is not None:
raise NotImplementedError("Can not count field matches if size is set {}".format(size))
columns = self.get_columns()
numeric_source_fields = query_compiler._mappings.numeric_source_fields(columns)
# for each field we compute:
# count, mean, std, min, 25%, 50%, 75%, max
body = Query(query_params['query'])
for field in numeric_source_fields:
body.metric_aggs('extended_stats_' + field, 'extended_stats', field)
body.metric_aggs('percentiles_' + field, 'percentiles', field)
print(body.to_search_body())
response = query_compiler._client.search(
index=query_compiler._index_pattern,
size=0,
body=body.to_search_body())
results = {}
for field in numeric_source_fields:
values = list()
values.append(response['aggregations']['extended_stats_' + field]['count'])
values.append(response['aggregations']['extended_stats_' + field]['avg'])
values.append(response['aggregations']['extended_stats_' + field]['std_deviation'])
values.append(response['aggregations']['extended_stats_' + field]['min'])
values.append(response['aggregations']['percentiles_' + field]['values']['25.0'])
values.append(response['aggregations']['percentiles_' + field]['values']['50.0'])
values.append(response['aggregations']['percentiles_' + field]['values']['75.0'])
values.append(response['aggregations']['extended_stats_' + field]['max'])
# only keep fields for which at least one aggregation returned a value
if values.count(None) < len(values):
results[field] = values
df = pd.DataFrame(data=results, index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
return df
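Roughly, describe() combines an extended_stats and a percentiles aggregation per numeric field; the body below is an assumption about the generated request:

```python
# illustrative aggregation body for one field; the real body comes from Query.metric_aggs()
describe_body = {
    "aggs": {
        "extended_stats_AvgTicketPrice": {"extended_stats": {"field": "AvgTicketPrice"}},
        "percentiles_AvgTicketPrice": {"percentiles": {"field": "AvgTicketPrice"}},
    }
}
# count/mean/std/min/max come from extended_stats; the default percentiles
# output includes the 25.0, 50.0 and 75.0 keys read above
```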
def to_pandas(self, query_compiler):
query_params, post_processing = self._resolve_tasks()
size, sort_params = Operations._query_params_to_size_and_sort(query_params)
body = Query(query_params['query'])
# Only return requested columns # Only return requested columns
columns = self.get_columns() columns = self.get_columns()
@ -110,10 +227,12 @@ class Operations:
index=query_compiler._index_pattern, index=query_compiler._index_pattern,
size=size, size=size,
sort=sort_params, sort=sort_params,
body=body.to_search_body(),
_source=columns) _source=columns)
else: else:
es_results = query_compiler._client.scan( es_results = query_compiler._client.scan(
index=query_compiler._index_pattern, index=query_compiler._index_pattern,
query=body.to_search_body(),
_source=columns) _source=columns)
# create post sort # create post sort
if sort_params is not None: if sort_params is not None:
@ -132,25 +251,65 @@ class Operations:
task = ('squeeze', axis) task = ('squeeze', axis)
self._tasks.append(task) self._tasks.append(task)
def to_count(self, query_compiler): def index_count(self, query_compiler, field):
query, post_processing = self._to_es_query() # field is the index field so count values
query_params, post_processing = self._resolve_tasks()
size = query['query_size'] # can be None size = self._size(query_params, post_processing)
pp_size = self._count_post_processing(post_processing)
if pp_size is not None:
if size is not None:
size = min(size, pp_size)
else:
size = pp_size
# Size is dictated by operations # Size is dictated by operations
if size is not None: if size is not None:
# TODO - this is not necessarily valid as the field may not exist in ALL these docs
return size return size
exists_query = {"query": {"exists": {"field": query_compiler.index.index_field}}} body = Query(query_params['query'])
body.exists(field, must=True)
return query_compiler._client.count(index=query_compiler._index_pattern, body=exists_query) return query_compiler._client.count(index=query_compiler._index_pattern, body=body.to_count_body())
def _validate_index_operation(self, items):
if not isinstance(items, list):
raise TypeError("list of items required - not {}".format(type(items)))
# index operations are only valid when no size restriction is already in place
query_params, post_processing = self._resolve_tasks()
size = self._size(query_params, post_processing)
# Size is dictated by operations
if size is not None:
raise NotImplementedError("Can not count field matches if size is set {}".format(size))
return query_params, post_processing
def index_matches_count(self, query_compiler, field, items):
query_params, post_processing = self._validate_index_operation(items)
body = Query(query_params['query'])
if field == Index.ID_INDEX_FIELD:
body.ids(items, must=True)
else:
body.terms(items, must=True)
return query_compiler._client.count(index=query_compiler._index_pattern, body=body.to_count_body())
def drop_index_values(self, query_compiler, field, items):
self._validate_index_operation(items)
# Putting boolean queries together
# i = 10
# not i = 20
# _id in [1,2,3]
# _id not in [1,2,3]
# a in ['a','b','c']
# b not in ['a','b','c']
# For now use term queries
if field == Index.ID_INDEX_FIELD:
task = ('query_ids', ('must_not', items))
else:
task = ('query_terms', ('must_not', (field, items)))
self._tasks.append(task)
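For context, the clauses these tasks eventually contribute look roughly like the following (field names and ids are placeholders; the actual clauses are built by the Query class when the query_ids / query_terms tasks are resolved):

```python
drop_by_id = {
    "query": {"bool": {"must_not": [{"ids": {"values": ["1", "2", "3"]}}]}}
}
drop_by_field = {
    "query": {"bool": {"must_not": [{"terms": {"Carrier": ["Kibana Airlines"]}}]}}
}
```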
@staticmethod @staticmethod
def _sort_params_to_postprocessing(input): def _sort_params_to_postprocessing(input):
@ -165,18 +324,21 @@ class Operations:
return task return task
@staticmethod @staticmethod
def _query_to_params(query): def _query_params_to_size_and_sort(query_params):
sort_params = None sort_params = None
if query['query_sort_field'] and query['query_sort_order']: if query_params['query_sort_field'] and query_params['query_sort_order']:
sort_params = query['query_sort_field'] + ":" + Operations.SortOrder.to_string(query['query_sort_order']) sort_params = query_params['query_sort_field'] + ":" + Operations.SortOrder.to_string(
query_params['query_sort_order'])
size = query['query_size'] size = query_params['query_size']
return size, sort_params return size, sort_params
@staticmethod @staticmethod
def _count_post_processing(post_processing): def _count_post_processing(post_processing):
size = None size = None
for action in post_processing: for action in post_processing:
if action[0] == 'head' or action[0] == 'tail': if action[0] == 'head' or action[0] == 'tail':
if size is None or action[1][1] < size: if size is None or action[1][1] < size:
@ -201,45 +363,48 @@ class Operations:
else: else:
df = df.sort_values(sort_field, False) df = df.sort_values(sort_field, False)
elif action[0] == 'iloc': elif action[0] == 'iloc':
index_indexer = action[1][0] index_indexer = action[1][0]
column_indexer = action[1][1] column_indexer = action[1][1]
if index_indexer is None: if index_indexer is None:
index_indexer = slice(None) index_indexer = slice(None)
if column_indexer is None: if column_indexer is None:
column_indexer = slice(None) column_indexer = slice(None)
df = df.iloc[index_indexer, column_indexer] df = df.iloc[index_indexer, column_indexer]
elif action[0] == 'squeeze': elif action[0] == 'squeeze':
print(df)
df = df.squeeze(axis=action[1]) df = df.squeeze(axis=action[1])
print(df)
return df return df
def _to_es_query(self): def _resolve_tasks(self):
# We now try and combine all tasks into an Elasticsearch query # We now try and combine all tasks into an Elasticsearch query
# Some operations can be simply combined into a single query # Some operations can be simply combined into a single query
# other operations require pre-queries and then combinations # other operations require pre-queries and then combinations
# other operations require in-core post-processing of results # other operations require in-core post-processing of results
query = {"query_sort_field": None, query_params = {"query_sort_field": None,
"query_sort_order": None, "query_sort_order": None,
"query_size": None, "query_size": None,
"query_fields": None} "query_fields": None,
"query": Query()}
post_processing = [] post_processing = []
for task in self._tasks: for task in self._tasks:
if task[0] == 'head': if task[0] == 'head':
query, post_processing = self._resolve_head(task, query, post_processing) query_params, post_processing = self._resolve_head(task, query_params, post_processing)
elif task[0] == 'tail': elif task[0] == 'tail':
query, post_processing = self._resolve_tail(task, query, post_processing) query_params, post_processing = self._resolve_tail(task, query_params, post_processing)
elif task[0] == 'iloc': elif task[0] == 'iloc':
query, post_processing = self._resolve_iloc(task, query, post_processing) query_params, post_processing = self._resolve_iloc(task, query_params, post_processing)
else: # a lot of operations simply post-process the dataframe - put these straight through elif task[0] == 'query_ids':
query, post_processing = self._resolve_post_processing_task(task, query, post_processing) query_params, post_processing = self._resolve_query_ids(task, query_params, post_processing)
elif task[0] == 'query_terms':
query_params, post_processing = self._resolve_query_terms(task, query_params, post_processing)
else: # a lot of operations simply post-process the dataframe - put these straight through
query_params, post_processing = self._resolve_post_processing_task(task, query_params, post_processing)
return query, post_processing return query_params, post_processing
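A small illustration of how a task list is folded into a single server-side query by _resolve_tasks(); tuple layouts follow the handlers below and the values are made up:

```python
tasks = [
    ('head', ('_doc', 10)),                                           # sort asc, query_size = 10
    ('query_terms', ('must_not', ('Carrier', ['Kibana Airlines']))),  # must_not terms clause
]
# after _resolve_tasks(), roughly:
#   query_params: sort field '_doc', ascending, query_size 10,
#                 and a Query carrying the must_not terms clause
#   post_processing: []  (nothing left to do client-side for this combination)
```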
def _resolve_head(self, item, query, post_processing): def _resolve_head(self, item, query_params, post_processing):
# head - sort asc, size n # head - sort asc, size n
# |12345-------------| # |12345-------------|
query_sort_field = item[1][0] query_sort_field = item[1][0]
@ -251,26 +416,26 @@ class Operations:
# overwriting previous head) # overwriting previous head)
if len(post_processing) > 0: if len(post_processing) > 0:
post_processing.append(item) post_processing.append(item)
return query, post_processing return query_params, post_processing
if query['query_sort_field'] is None: if query_params['query_sort_field'] is None:
query['query_sort_field'] = query_sort_field query_params['query_sort_field'] = query_sort_field
# if it is already sorted we use existing field # if it is already sorted we use existing field
if query['query_sort_order'] is None: if query_params['query_sort_order'] is None:
query['query_sort_order'] = query_sort_order query_params['query_sort_order'] = query_sort_order
# if it is already sorted we get head of existing order # if it is already sorted we get head of existing order
if query['query_size'] is None: if query_params['query_size'] is None:
query['query_size'] = query_size query_params['query_size'] = query_size
else: else:
# truncate if head is smaller # truncate if head is smaller
if query_size < query['query_size']: if query_size < query_params['query_size']:
query['query_size'] = query_size query_params['query_size'] = query_size
return query, post_processing return query_params, post_processing
def _resolve_tail(self, item, query, post_processing): def _resolve_tail(self, item, query_params, post_processing):
# tail - sort desc, size n, post-process sort asc # tail - sort desc, size n, post-process sort asc
# |-------------12345| # |-------------12345|
query_sort_field = item[1][0] query_sort_field = item[1][0]
@ -278,41 +443,41 @@ class Operations:
query_size = item[1][1] query_size = item[1][1]
# If this is a tail of a tail adjust settings and return # If this is a tail of a tail adjust settings and return
if query['query_size'] is not None and \ if query_params['query_size'] is not None and \
query['query_sort_order'] == query_sort_order and \ query_params['query_sort_order'] == query_sort_order and \
post_processing == [('sort_index')]: post_processing == [('sort_index')]:
if query_size < query['query_size']: if query_size < query_params['query_size']:
query['query_size'] = query_size query_params['query_size'] = query_size
return query, post_processing return query_params, post_processing
# If we are already postprocessing the query results, just get 'tail' of these # If we are already postprocessing the query results, just get 'tail' of these
# (note, currently we just append another tail, we don't optimise by # (note, currently we just append another tail, we don't optimise by
# overwriting previous tail) # overwriting previous tail)
if len(post_processing) > 0: if len(post_processing) > 0:
post_processing.append(item) post_processing.append(item)
return query, post_processing return query_params, post_processing
# If results are already constrained, just get 'tail' of these # If results are already constrained, just get 'tail' of these
# (note, currently we just append another tail, we don't optimise by # (note, currently we just append another tail, we don't optimise by
# overwriting previous tail) # overwriting previous tail)
if query['query_size'] is not None: if query_params['query_size'] is not None:
post_processing.append(item) post_processing.append(item)
return query, post_processing return query_params, post_processing
else: else:
query['query_size'] = query_size query_params['query_size'] = query_size
if query['query_sort_field'] is None: if query_params['query_sort_field'] is None:
query['query_sort_field'] = query_sort_field query_params['query_sort_field'] = query_sort_field
if query['query_sort_order'] is None: if query_params['query_sort_order'] is None:
query['query_sort_order'] = query_sort_order query_params['query_sort_order'] = query_sort_order
else: else:
# reverse sort order # reverse sort order
query['query_sort_order'] = Operations.SortOrder.reverse(query_sort_order) query_params['query_sort_order'] = Operations.SortOrder.reverse(query_sort_order)
post_processing.append(('sort_index')) post_processing.append(('sort_index'))
return query, post_processing return query_params, post_processing
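A matching sketch for tail, under the same assumptions:

# tail(5): fetch the last 5 docs by sorting descending, then restore
# ascending order client-side.
task = ('tail', ('_id', 5))
# query_params['query_sort_field'] = '_id'
# query_params['query_sort_order'] = 'desc'            # reversed
# query_params['query_size']       = 5
# post_processing                  = [('sort_index')]  # re-sort ascending
# A subsequent tail(3) only truncates query_size to 3, because the sort
# order and the ('sort_index') post-processing step already match.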
def _resolve_iloc(self, item, query, post_processing): def _resolve_iloc(self, item, query_params, post_processing):
 
# iloc - size must cover the largest selected row, selection done in post-processing # iloc - size must cover the largest selected row, selection done in post-processing
# |---4--7-9---------| # |---4--7-9---------|
@ -322,33 +487,70 @@ class Operations:
last_item = int_index.max() last_item = int_index.max()
# If we have a query_size we do this post processing # If we have a query_size we do this post processing
if query['query_size'] is not None: if query_params['query_size'] is not None:
post_processing.append(item) post_processing.append(item)
return query, post_processing return query_params, post_processing
# size should be > last item # size should be > last item
query['query_size'] = last_item + 1 query_params['query_size'] = last_item + 1
post_processing.append(item) post_processing.append(item)
return query, post_processing return query_params, post_processing
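Illustrative sketch of the iloc resolution; the exact payload of the iloc task is not shown in this hunk, so the shape below is assumed:

# iloc with integer positions [4, 7, 9]: there is no ES equivalent of
# "rows 4, 7 and 9", so fetch enough rows to cover the largest position
# and select the rows in post-processing.
# last_item = 9
# query_params['query_size'] = last_item + 1   # 10 docs fetched
# post_processing.append(item)                 # row selection done in pandas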
def _resolve_post_processing_task(self, item, query, post_processing): def _resolve_query_ids(self, item, query_params, post_processing):
# task = ('query_ids', ('must_not', items))
must_clause = item[1][0]
ids = item[1][1]
if must_clause == 'must':
query_params['query'].ids(ids, must=True)
else:
query_params['query'].ids(ids, must=False)
return query_params, post_processing
def _resolve_query_terms(self, item, query_params, post_processing):
# task = ('query_terms', ('must_not', (field, terms)))
must_clause = item[1][0]
field = item[1][1][0]
terms = item[1][1][1]
if must_clause == 'must':
query_params['query'].terms(field, terms, must=True)
else:
query_params['query'].terms(field, terms, must=False)
return query_params, post_processing
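For reference, a sketch of the task tuples the two resolvers above consume (field name and values are illustrative); each resolver forwards the clause to the Query instance held in query_params['query']:

ids_task = ('query_ids', ('must_not', ['1', '2']))
terms_task = ('query_terms', ('must', ('Carrier', ['Kibana Airlines'])))

# After resolving both, query_params['query'] wraps roughly:
# {'bool': {'must':     [{'terms': {'Carrier': ['Kibana Airlines']}}],
#           'must_not': [{'ids': {'values': ['1', '2']}}]}}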
def _resolve_post_processing_task(self, item, query_params, post_processing):
# Just do this in post-processing # Just do this in post-processing
post_processing.append(item) post_processing.append(item)
return query, post_processing return query_params, post_processing
def _size(self, query_params, post_processing):
# Reconcile the ES query size with any row limit implied by post-processing
size = query_params['query_size'] # can be None
pp_size = self._count_post_processing(post_processing)
if pp_size is not None:
if size is not None:
size = min(size, pp_size)
else:
size = pp_size
# This can return None
return size
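A small worked example of the size reconciliation above (numbers illustrative):

# ES-side tasks ask for 10 docs, but post-processing (e.g. a later head(5))
# limits the frame to 5 rows, so only 5 docs need to be fetched:
# size = min(10, 5)   # -> 5
# If neither side sets a limit, _size returns None and the caller must
# handle the unbounded case.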
def info_es(self, buf): def info_es(self, buf):
buf.write("Operations:\n") buf.write("Operations:\n")
buf.write("\ttasks: {0}\n".format(self._tasks)) buf.write("\ttasks: {0}\n".format(self._tasks))
query, post_processing = self._to_es_query() query_params, post_processing = self._resolve_tasks()
size, sort_params = Operations._query_to_params(query) size, sort_params = Operations._query_params_to_size_and_sort(query_params)
columns = self.get_columns() columns = self.get_columns()
buf.write("\tsize: {0}\n".format(size)) buf.write("\tsize: {0}\n".format(size))
buf.write("\tsort_params: {0}\n".format(sort_params)) buf.write("\tsort_params: {0}\n".format(sort_params))
buf.write("\tcolumns: {0}\n".format(columns)) buf.write("\tcolumns: {0}\n".format(columns))
buf.write("\tpost_processing: {0}\n".format(post_processing)) buf.write("\tpost_processing: {0}\n".format(post_processing))

eland/query.py Normal file

@ -0,0 +1,93 @@
import warnings
from copy import deepcopy
class Query:
"""
Simple class to manage building Elasticsearch queries.
Specifically, this maintains a 'bool' query with 'must' and 'must_not' clauses, plus a
dict of named aggregations, and renders them into search and count request bodies.
"""
def __init__(self, query=None):
if query is None:
self._query = self._query_template()
self._aggs = {}
else:
# Deep copy the incoming query so we can change it
self._query = deepcopy(query._query)
self._aggs = deepcopy(query._aggs)
def exists(self, field, must=True):
"""
Add exists query
https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-exists-query.html
"""
if must:
self._query['bool']['must'].append({'exists': {'field': field}})
else:
self._query['bool']['must_not'].append({'exists': {'field': field}})
def ids(self, items, must=True):
"""
Add ids query
https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-ids-query.html
"""
if must:
self._query['bool']['must'].append({'ids': {'values': items}})
else:
self._query['bool']['must_not'].append({'ids': {'values': items}})
def terms(self, field, items, must=True):
"""
Add terms query
https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-terms-query.html
"""
if must:
self._query['bool']['must'].append({'terms': {field: items}})
else:
self._query['bool']['must_not'].append({'terms': {field: items}})
def metric_aggs(self, name, func, field):
"""
Add a metric aggregation, e.g.
"aggs": {
"name": {
"max": {
"field": "AvgTicketPrice"
}
}
}
"""
agg = {
func: {
"field": field
}
}
self._aggs[name] = agg
def to_search_body(self):
body = {"query": self._query, "aggs": self._aggs}
return body
def to_count_body(self):
if len(self._aggs) > 0:
warnings.warn('Requesting count for agg query {0}'.format(self))
body = {"query": self._query}
return body
def __repr__(self):
return repr(self.to_search_body())
@staticmethod
def _query_template():
template = {
"bool": {
"must": [],
"must_not": []
}
}
return deepcopy(template)
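Usage sketch for the Query helper above; field names and values are illustrative:

from eland import Query

q = Query()
q.terms('Carrier', ['Kibana Airlines'])              # 'must' clause by default
q.ids(['1', '2'], must=False)                        # 'must_not' clause
q.metric_aggs('avg_price', 'avg', 'AvgTicketPrice')

q.to_search_body()
# {'query': {'bool': {'must':     [{'terms': {'Carrier': ['Kibana Airlines']}}],
#                     'must_not': [{'ids': {'values': ['1', '2']}}]}},
#  'aggs': {'avg_price': {'avg': {'field': 'AvgTicketPrice'}}}}

q1 = Query(q)                                        # deep copy; edits to q1 leave q unchanged
q1.exists('timestamp', must=False)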


@ -52,6 +52,17 @@ class ElandQueryCompiler(BaseQueryCompiler):
columns = property(_get_columns, _set_columns) columns = property(_get_columns, _set_columns)
index = property(_get_index) index = property(_get_index)
@property
def dtypes(self):
columns = self._operations.get_columns()
return self._mappings.dtypes(columns)
def get_dtype_counts(self):
columns = self._operations.get_columns()
return self._mappings.get_dtype_counts(columns)
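Usage sketch for the dtype plumbing above, assuming ed_df is an existing eland DataFrame:

dtypes = ed_df._query_compiler.dtypes                     # pandas Series, one dtype per column
dtype_counts = ed_df._query_compiler.get_dtype_counts()   # counts per dtype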
# END Index, columns, and dtypes objects # END Index, columns, and dtypes objects
def _es_results_to_pandas(self, results): def _es_results_to_pandas(self, results):
@ -226,7 +237,25 @@ class ElandQueryCompiler(BaseQueryCompiler):
index_count: int index_count: int
Count of docs where index_field exists Count of docs where index_field exists
""" """
return self._operations.to_count(self) return self._operations.index_count(self, self.index.index_field)
def _index_matches_count(self, items):
"""
Returns
-------
index_count: int
Count of docs where items exist
"""
return self._operations.index_matches_count(self, self.index.index_field, items)
def _index_matches(self, items):
"""
Returns
-------
matches: list
List of the items that match the index
"""
return self._operations.index_matches(self, self.index.index_field, items)
def copy(self): def copy(self):
return self.__constructor__( return self.__constructor__(
@ -295,6 +324,31 @@ class ElandQueryCompiler(BaseQueryCompiler):
return result return result
def drop(self, index=None, columns=None):
result = self.copy()
# Column drop: start from the full column list and remove the dropped columns
if columns is not None:
# columns is a pandas.Index so we can use pandas drop feature
new_columns = self.columns.drop(columns)
result._operations.set_columns(new_columns.to_list())
if index is not None:
result._operations.drop_index_values(self, self.index.index_field, index)
return result
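Usage sketch for the new drop support; the host, index pattern and dropped values are assumed, and nothing is deleted from Elasticsearch - only the eland frame narrows:

import eland as ed

df = ed.DataFrame('localhost', 'flights')                      # connection details assumed
cols_dropped = df.drop(columns=['Carrier', 'DestCityName'])    # column drop -> narrower column list
rows_dropped = df.drop(['1', '2'])                             # row drop -> ids 'must_not' query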
def count(self):
return self._operations.count(self)
def mean(self):
return self._operations.mean(self)
def sum(self):
return self._operations.sum(self)
def min(self):
return self._operations.min(self)
def max(self):
return self._operations.max(self)
def info_es(self, buf): def info_es(self, buf):
buf.write("index_pattern: {index_pattern}\n".format(index_pattern=self._index_pattern)) buf.write("index_pattern: {index_pattern}\n".format(index_pattern=self._index_pattern))
@ -302,3 +356,5 @@ class ElandQueryCompiler(BaseQueryCompiler):
self._mappings.info_es(buf) self._mappings.info_es(buf)
self._operations.info_es(buf) self._operations.info_es(buf)
def describe(self):
return self._operations.describe(self)


@ -56,6 +56,18 @@ def assert_pandas_eland_frame_equal(left, right):
# Use pandas tests to check similarity # Use pandas tests to check similarity
assert_frame_equal(left, right._to_pandas()) assert_frame_equal(left, right._to_pandas())
def assert_eland_frame_equal(left, right):
if not isinstance(left, ed.DataFrame):
raise AssertionError("Expected type {exp_type}, found {act_type} instead".format(
exp_type='ed.DataFrame', act_type=type(left)))
if not isinstance(right, ed.DataFrame):
raise AssertionError("Expected type {exp_type}, found {act_type} instead".format(
exp_type='ed.DataFrame', act_type=type(right)))
# Use pandas tests to check similarity
assert_frame_equal(left._to_pandas(), right._to_pandas())
def assert_pandas_eland_series_equal(left, right): def assert_pandas_eland_series_equal(left, right):
if not isinstance(left, pd.Series): if not isinstance(left, pd.Series):


@ -5,9 +5,15 @@ from eland.tests.common import TestData
class TestDataFrameCount(TestData): class TestDataFrameCount(TestData):
def test_to_string1(self): def test_to_count1(self):
ed_flights = self.ed_flights() pd_ecommerce = self.pd_ecommerce()
pd_flights = self.pd_flights() ed_ecommerce = self.ed_ecommerce()
pd_count = pd_ecommerce.count()
ed_count = ed_ecommerce.count()
print(pd_count)
print(ed_count)
#ed_count = ed_flights.count()


@ -0,0 +1,40 @@
# File called _pytest for PyCharm compatibility
from io import StringIO
from eland.tests.common import TestData
class TestDataFrameDescribe(TestData):
def test_to_describe1(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
pd_describe = pd_flights.describe()
ed_describe = ed_flights.describe()
# TODO - this fails now as ES aggregations are approximate
# if ES percentile agg uses
# "hdr": {
# "number_of_significant_value_digits": 3
# }
# this works
# assert_almost_equal(pd_flights_describe, ed_flights_describe)
pd_ecommerce_describe = self.pd_ecommerce().describe()
ed_ecommerce_describe = self.ed_ecommerce().describe()
# We don't compare ecommerce here as the default dtypes in pandas from read_json
# don't match the mapping types. This is mainly because the products field is
# nested and so can be treated as a multi-field in ES, but not in pandas
def test_to_describe2(self):
pd_flights = self.pd_flights().head()
ed_flights = self.ed_flights().head()
pd_describe = pd_flights.describe()
ed_describe = ed_flights.describe()
print(pd_describe)
print(ed_describe)


@ -0,0 +1,56 @@
# File called _pytest for PyCharm compatibility
import pandas as pd
import eland as ed
from eland.tests.common import TestData
from eland.tests.common import (
assert_eland_frame_equal,
assert_pandas_eland_frame_equal,
assert_pandas_eland_series_equal
)
import numpy as np
class TestDataFrameDrop(TestData):
def test_drop1(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
# ['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID',
# 'DestCityName', 'DestCountry', 'DestLocation', 'DestRegion',
# 'DestWeather', 'DistanceKilometers', 'DistanceMiles', 'FlightDelay',
# 'FlightDelayMin', 'FlightDelayType', 'FlightNum', 'FlightTimeHour',
# 'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
# 'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather',
# 'dayOfWeek', 'timestamp']
pd_col0 = pd_flights.drop(['Carrier', 'DestCityName'], axis=1)
pd_col1 = pd_flights.drop(columns=['Carrier', 'DestCityName'])
ed_col0 = ed_flights.drop(['Carrier', 'DestCityName'], axis=1)
ed_col1 = ed_flights.drop(columns=['Carrier', 'DestCityName'])
#assert_pandas_eland_frame_equal(pd_col0, ed_col0)
#assert_pandas_eland_frame_equal(pd_col1, ed_col1)
# Drop rows by index
pd_idx0 = pd_flights.drop(['1', '2'])
ed_idx0 = ed_flights.drop(['1', '2'])
print(pd_idx0.info())
print(ed_idx0.info())
assert_pandas_eland_frame_equal(pd_idx0, ed_idx0)
"""
#assert_pandas_eland_frame_equal(pd_iloc0, ed_iloc0) # pd_iloc0 is Series
assert_pandas_eland_frame_equal(pd_iloc1, ed_iloc1)
assert_pandas_eland_frame_equal(pd_iloc2, ed_iloc2)
assert_pandas_eland_frame_equal(pd_iloc3, ed_iloc3)
assert_pandas_eland_frame_equal(pd_iloc4, ed_iloc4)
#assert_pandas_eland_frame_equal(pd_iloc5, ed_iloc5) # pd_iloc5 is numpy_bool
assert_pandas_eland_frame_equal(pd_iloc6, ed_iloc6)
assert_pandas_eland_frame_equal(pd_iloc7, ed_iloc7)
assert_pandas_eland_frame_equal(pd_iloc8, ed_iloc8)
assert_pandas_eland_frame_equal(pd_iloc9, ed_iloc9)
"""


@ -37,3 +37,19 @@ class TestDataFrameGetItem(TestData):
pd_flights_OriginAirportID = pd_flights.OriginAirportID pd_flights_OriginAirportID = pd_flights.OriginAirportID
assert_pandas_eland_series_equal(pd_flights_OriginAirportID, ed_flights_OriginAirportID) assert_pandas_eland_series_equal(pd_flights_OriginAirportID, ed_flights_OriginAirportID)
def test_getitem4(self):
ed_flights = self.ed_flights().head(89)
pd_flights = self.pd_flights().head(89)
ed_col0 = ed_flights[['DestCityName', 'DestCountry', 'DestLocation', 'DestRegion']]
try:
ed_col1 = ed_col0['Carrier']
except KeyError:
pass
pd_col1 = pd_flights['DestCountry']
ed_col1 = ed_col0['DestCountry']
assert_pandas_eland_series_equal(pd_col1, ed_col1)


@ -12,28 +12,6 @@ import numpy as np
class TestDataFrameiLoc(TestData): class TestDataFrameiLoc(TestData):
def test_range(self):
columns = ['a', 'b', 'c', 'd', 'e']
r = pd.RangeIndex(0, 3, 1)
i = pd.Int64Index([1, 2])
dates = pd.date_range('1/1/2000', periods=8)
df = pd.DataFrame(np.random.randn(8, 4), index = dates, columns = ['A', 'B', 'C', 'D'])
print(df)
print("STEVE ", df.squeeze())
ii = slice(None)
rr = slice(None)
print(df.iloc[:, 0:3])
print(df.iloc[i, r])
print(df.iloc[ii, rr])
def test_iloc1(self): def test_iloc1(self):
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_flights = self.pd_flights() pd_flights = self.pd_flights()


@ -1,4 +1,5 @@
# File called _pytest for PyCharm compatibility # File called _pytest for PyCharm compatibility
from io import StringIO
from eland.tests.common import TestData from eland.tests.common import TestData
@ -9,4 +10,16 @@ class TestDataFrameInfo(TestData):
ed_flights = self.ed_flights() ed_flights = self.ed_flights()
pd_flights = self.pd_flights() pd_flights = self.pd_flights()
ed_flights.info() ed_buf = StringIO()
pd_buf = StringIO()
# Ignore memory_usage and first line (class name)
ed_flights.info(buf=ed_buf, memory_usage=False)
pd_flights.info(buf=pd_buf, memory_usage=False)
ed_buf_lines = ed_buf.getvalue().split('\n')
pd_buf_lines = pd_buf.getvalue().split('\n')
assert pd_buf_lines[1:] == ed_buf_lines[1:]
print(self.ed_ecommerce().info())


@ -0,0 +1,46 @@
# File called _pytest for PyCharm compatibility
from eland.tests.common import TestData
from pandas.util.testing import assert_series_equal
class TestDataFrameMean(TestData):
def test_to_mean(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
pd_mean = pd_flights.mean()
ed_mean = ed_flights.mean()
assert_series_equal(pd_mean, ed_mean)
def test_to_sum(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
pd_sum = pd_flights.sum(numeric_only=True)
ed_sum = ed_flights.sum(numeric_only=True)
assert_series_equal(pd_sum, ed_sum)
def test_to_min(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
pd_min = pd_flights.min(numeric_only=True)
ed_min = ed_flights.min(numeric_only=True)
assert_series_equal(pd_min, ed_min)
def test_to_max(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
pd_max = pd_flights.max(numeric_only=True)
ed_max = ed_flights.max(numeric_only=True)
assert_series_equal(pd_max, ed_max)


@ -0,0 +1,20 @@
# File called _pytest for PyCharm compatibility
from eland.tests.common import TestData
from pandas.util.testing import assert_series_equal
class TestDataFrameSum(TestData):
def test_to_sum1(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
pd_sum = pd_flights.sum(numeric_only=True)
ed_sum = ed_flights.sum(numeric_only=True)
assert_series_equal(pd_sum, ed_sum)



@ -0,0 +1,44 @@
# File called _pytest for PyCharm compatibility
from eland.tests.common import TestData
from pandas.util.testing import assert_series_equal
class TestMappingsDtypes(TestData):
def test_dtypes1(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
pd_dtypes = pd_flights.dtypes
ed_dtypes = ed_flights._query_compiler._mappings.dtypes()
assert_series_equal(pd_dtypes, ed_dtypes)
def test_dtypes2(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()[['Carrier', 'AvgTicketPrice', 'Cancelled']]
pd_dtypes = pd_flights.dtypes
ed_dtypes = ed_flights._query_compiler._mappings.dtypes(columns=['Carrier', 'AvgTicketPrice', 'Cancelled'])
assert_series_equal(pd_dtypes, ed_dtypes)
def test_get_dtype_counts1(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
pd_dtypes = pd_flights.get_dtype_counts().sort_index()
ed_dtypes = ed_flights._query_compiler._mappings.get_dtype_counts().sort_index()
assert_series_equal(pd_dtypes, ed_dtypes)
def test_get_dtype_counts2(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()[['Carrier', 'AvgTicketPrice', 'Cancelled']]
pd_dtypes = pd_flights.get_dtype_counts().sort_index()
ed_dtypes = ed_flights._query_compiler._mappings.\
get_dtype_counts(columns=['Carrier', 'AvgTicketPrice', 'Cancelled']).sort_index()
assert_series_equal(pd_dtypes, ed_dtypes)


@ -0,0 +1,27 @@
# File called _pytest for PyCharm compatibility
from eland.tests.common import TestData
from eland import Query
class TestQueryCopy(TestData):
def test_copy(self):
q = Query()
q.exists('field_a')
q.exists('field_b', must=False)
print(q.to_search_body())
q1 = Query(q)
q.exists('field_c', must=False)
q1.exists('field_c1', must=False)
print(q.to_search_body())
print(q1.to_search_body())


@ -36,7 +36,7 @@ def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=
mapping = Mappings._generate_es_mappings(df) mapping = Mappings._generate_es_mappings(df)
# If table exists, check if_exists parameter # If table exists, check if_exists parameter
if client.indices().exists(destination_index): if client.index_exists(index=destination_index):
if if_exists == "fail": if if_exists == "fail":
raise ValueError( raise ValueError(
"Could not create the index [{0}] because it " "Could not create the index [{0}] because it "
@ -45,12 +45,12 @@ def pandas_to_es(df, es_params, destination_index, if_exists='fail', chunk_size=
"'append' or 'replace' data.".format(destination_index) "'append' or 'replace' data.".format(destination_index)
) )
elif if_exists == "replace": elif if_exists == "replace":
client.indices().delete(destination_index) client.index_delete(index=destination_index)
client.indices().create(destination_index, mapping) client.index_create(index=destination_index, mapping=mapping)
# elif if_exists == "append": # elif if_exists == "append":
# TODO validate mapping is compatible # TODO validate mapping is compatible
else: else:
client.indices().create(destination_index, mapping) client.index_create(index=destination_index, body=mapping)
# Now add data # Now add data
actions = [] actions = []