Mirror of https://github.com/elastic/eland.git (synced 2025-07-11 00:02:14 +08:00)

Merge pull request #13 from stevedodson/master

Adding eland.Index feature

Commit 99279724f6
NOTES.md (new file, 58 lines)
@@ -0,0 +1,58 @@
# Implementation Notes

The goal of an `eland.DataFrame` is to enable users who are familiar with `pandas.DataFrame`
to access, explore and manipulate data that resides in Elasticsearch.

Ideally, all data should reside in Elasticsearch and not in memory.
This restricts the API, but allows access to huge data sets that do not fit into memory, and allows
use of powerful Elasticsearch features such as aggregations.

## Implementation Details

### 3rd Party System Access

Generally, integrations with [3rd party storage systems](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html)
(SQL, Google Big Query etc.) involve accessing these systems and reading all external data into an
in-core pandas data structure. This also applies to [Apache Arrow](https://arrow.apache.org/docs/python/pandas.html)
structures.

Whilst this provides access to data in these systems, for large datasets this can require significant
in-core memory, and for systems such as Elasticsearch, bulk export of data can be an inefficient way
of exploring the data.

An alternative option is to create an API that proxies `pandas.DataFrame`-like calls to Elasticsearch
queries and operations. This could allow the Elasticsearch cluster to perform operations such as
aggregations rather than exporting all the data and performing this operation in-core.

### Implementation Options

One option would be to replace the `pandas.DataFrame` backend in-core memory structures with Elasticsearch
accessors. This would allow full access to the `pandas.DataFrame` APIs. However, this has issues:

* If a `pandas.DataFrame` instance maps to an index, typical manipulation of a `pandas.DataFrame`
may involve creating many derived `pandas.DataFrame` instances. Constructing an index per
`pandas.DataFrame` may result in many Elasticsearch indices and a significant load on Elasticsearch.
For example, `df_a = df['a']` should not require Elasticsearch indices `df` and `df_a`.

* Not all `pandas.DataFrame` APIs map to things we may want to do in Elasticsearch. In particular,
API calls that involve exporting all data from Elasticsearch into memory (e.g. `df.to_dict()`).

* The backend `pandas.DataFrame` structures are not easily abstractable and are deeply embedded in
the implementation.

Another option is to create an `eland.DataFrame` API that mimics appropriate aspects of
the `pandas.DataFrame` API. This resolves some of the issues above:

* `df_a = df['a']` can be implemented as a change to the Elasticsearch query used, rather
than as a new index.

* Instead of supporting the entire `pandas.DataFrame` API, we can support a subset appropriate for
Elasticsearch. If additional calls are required, we could create an `eland.DataFrame.to_pandas()`
method which would explicitly export all data to a `pandas.DataFrame`.

* Creating a new `eland.DataFrame` API gives us full flexibility in terms of implementation. However,
it does create a large amount of work which may duplicate a lot of the `pandas` code (for example,
printing objects), and this creates maintenance issues.
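To make the proxy model concrete, here is a minimal usage sketch based on the API exercised in the tests below (`ed.read_es` takes an Elasticsearch host and an index pattern; the host and index names here are placeholders):

```python
import eland as ed

# Connect a DataFrame-like proxy to an existing Elasticsearch index;
# no documents are fetched at construction time.
df = ed.read_es('localhost', 'flights')

# Column selection only restricts the mappings used by later queries;
# it does not create a new Elasticsearch index.
df_subset = df[['Carrier', 'AvgTicketPrice']]

# These calls are translated into Elasticsearch requests: head/tail
# become sorted searches, describe becomes a set of aggregations.
print(df_subset.head(5))
print(df_subset.describe())
```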
eland/__init__.py

@@ -1,4 +1,5 @@
from .utils import *
from .frame import *
from .dataframe import *
from .client import *
from .mappings import *
from .index import *
eland/dataframe.py

@@ -23,21 +23,19 @@ Similarly, only Elasticsearch searchable fields can be searched or filtered, and
only Elasticsearch aggregatable fields can be aggregated or grouped.

"""
import eland as ed

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
import sys

import pandas as pd

from pandas.core.arrays.sparse import BlockIndex

from elasticsearch_dsl import Search
from pandas.compat import StringIO
from pandas.core import common as com
from pandas.io.common import _expand_user, _stringify_path
from pandas.io.formats import format as fmt
from pandas.io.formats.printing import pprint_thing
from pandas.io.formats import console

from io import StringIO
import eland as ed

import sys

class DataFrame():
    """
@@ -79,26 +77,24 @@ class DataFrame():
    object is created, the object is not rebuilt and so inconsistencies can occur.

    """

    def __init__(self,
                 client,
                 index_pattern,
                 mappings=None,
                 operations=None):
        self.client = ed.Client(client)
        self.index_pattern = index_pattern
                 index_field=None):

        self._client = ed.Client(client)
        self._index_pattern = index_pattern

        # Get and persist mappings; this allows us to correctly
        # map returned types from Elasticsearch to pandas datatypes
        if mappings is None:
            self.mappings = ed.Mappings(self.client, self.index_pattern)
            self._mappings = ed.Mappings(self._client, self._index_pattern)
        else:
            self.mappings = mappings
            self._mappings = mappings

        # Initialise a list of 'operations'
        # these are filters
        self.operations = []
        if operations is not None:
            self.operations.extend(operations)
        self._index = ed.Index(index_field)

    def _es_results_to_pandas(self, results):
        """
@@ -187,6 +183,7 @@ class DataFrame():
        TODO - an option here is to use Elasticsearch's multi-field matching instead of pandas treatment of lists (which isn't great)
        NOTE - using lists like this is generally not a good way to use this API
        """

        def flatten_dict(y):
            out = {}

@@ -197,7 +194,7 @@ class DataFrame():
                    is_source_field = False
                    pd_dtype = 'object'
                else:
                    is_source_field, pd_dtype = self.mappings.source_field_pd_dtype(name[:-1])
                    is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(name[:-1])

                if not is_source_field and type(x) is dict:
                    for a in x:
@@ -205,7 +202,7 @@ class DataFrame():
                elif not is_source_field and type(x) is list:
                    for a in x:
                        flatten(a, name)
                elif is_source_field == True:  # only print source fields from mappings (TODO - not so efficient for large number of fields and filtered mapping)
                elif is_source_field == True:  # only print source fields from mappings (TODO - not so efficient for large number of fields and filtered mapping)
                    field_name = name[:-1]

                    # Coerce types - for now just datetime
@@ -227,14 +224,22 @@ class DataFrame():
            return out

        rows = []
        index = []
        for hit in results['hits']['hits']:
            row = hit['_source']

            # get index value - can be _id or can be field value in source
            if self._index.is_source_field:
                index_field = row[self._index.index_field]
            else:
                index_field = hit[self._index.index_field]
            index.append(index_field)

            # flatten row to map correctly to 2D DataFrame
            rows.append(flatten_dict(row))

        # Create pandas DataFrame
        df = pd.DataFrame(data=rows)
        df = pd.DataFrame(data=rows, index=index)

        # _source may not contain all columns in the mapping
        # therefore, fill in missing columns
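For reference, `_es_results_to_pandas` walks the standard Elasticsearch hits structure. A small sketch of the index-extraction branch above, using a hypothetical hit:

```python
# A typical search hit (illustrative values).
hit = {
    '_index': 'flights',
    '_id': '0',
    '_source': {'Carrier': 'Kibana Airlines', 'timestamp': '2018-01-01T00:00:00'},
}

# With the default index field ('_id') the value comes from the hit itself...
index_value = hit['_id']  # -> '0'

# ...whereas after set_index('timestamp') it comes from the document source.
index_value = hit['_source']['timestamp']  # -> '2018-01-01T00:00:00'
```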
@@ -242,7 +247,7 @@ class DataFrame():
        missing_columns = list(set(self.columns) - set(df.columns))

        for missing in missing_columns:
            is_source_field, pd_dtype = self.mappings.source_field_pd_dtype(missing)
            is_source_field, pd_dtype = self._mappings.source_field_pd_dtype(missing)
            df[missing] = None
            df[missing].astype(pd_dtype)
@@ -252,20 +257,32 @@ class DataFrame():
        return df

    def head(self, n=5):
        results = self.client.search(index=self.index_pattern, size=n)
        sort_params = self._index.sort_field + ":asc"

        results = self._client.search(index=self._index_pattern, size=n, sort=sort_params)

        return self._es_results_to_pandas(results)

    def tail(self, n=5):
        sort_params = self._index.sort_field + ":desc"

        results = self._client.search(index=self._index_pattern, size=n, sort=sort_params)

        df = self._es_results_to_pandas(results)

        # reverse order (index ascending)
        return df.sort_index()

    def describe(self):
        numeric_source_fields = self.mappings.numeric_source_fields()
        numeric_source_fields = self._mappings.numeric_source_fields()

        # for each field we compute:
        # count, mean, std, min, 25%, 50%, 75%, max
        search = Search(using=self.client, index=self.index_pattern).extra(size=0)
        search = Search(using=self._client, index=self._index_pattern).extra(size=0)

        for field in numeric_source_fields:
            search.aggs.metric('extended_stats_'+field, 'extended_stats', field=field)
            search.aggs.metric('percentiles_'+field, 'percentiles', field=field)
            search.aggs.metric('extended_stats_' + field, 'extended_stats', field=field)
            search.aggs.metric('percentiles_' + field, 'percentiles', field=field)

        response = search.execute()

@@ -273,21 +290,21 @@ class DataFrame():

        for field in numeric_source_fields:
            values = []
            values.append(response.aggregations['extended_stats_'+field]['count'])
            values.append(response.aggregations['extended_stats_'+field]['avg'])
            values.append(response.aggregations['extended_stats_'+field]['std_deviation'])
            values.append(response.aggregations['extended_stats_'+field]['min'])
            values.append(response.aggregations['percentiles_'+field]['values']['25.0'])
            values.append(response.aggregations['percentiles_'+field]['values']['50.0'])
            values.append(response.aggregations['percentiles_'+field]['values']['75.0'])
            values.append(response.aggregations['extended_stats_'+field]['max'])

            values.append(response.aggregations['extended_stats_' + field]['count'])
            values.append(response.aggregations['extended_stats_' + field]['avg'])
            values.append(response.aggregations['extended_stats_' + field]['std_deviation'])
            values.append(response.aggregations['extended_stats_' + field]['min'])
            values.append(response.aggregations['percentiles_' + field]['values']['25.0'])
            values.append(response.aggregations['percentiles_' + field]['values']['50.0'])
            values.append(response.aggregations['percentiles_' + field]['values']['75.0'])
            values.append(response.aggregations['extended_stats_' + field]['max'])

            # if not None
            if (values.count(None) < len(values)):
                results[field] = values

        df = pd.DataFrame(data=results, index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])

        return df

    def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
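The `describe()` implementation above issues a single search with `size=0`. For orientation, the request body those `elasticsearch_dsl` calls build looks roughly like this for one field (`AvgTicketPrice` is illustrative; the real body repeats the pair for every numeric source field):

```python
body = {
    "size": 0,  # aggregations only, no hits
    "aggs": {
        # count, mean, std, min and max come from extended_stats...
        "extended_stats_AvgTicketPrice": {
            "extended_stats": {"field": "AvgTicketPrice"}
        },
        # ...and the quartiles from percentiles, whose default percents
        # include 25, 50 and 75.
        "percentiles_AvgTicketPrice": {
            "percentiles": {"field": "AvgTicketPrice"}
        },
    },
}
```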
@@ -305,12 +322,10 @@ class DataFrame():
        if buf is None:  # pragma: no cover
            buf = sys.stdout

        fake_df = self.__fake_dataframe__()

        lines = []

        lines.append(str(type(self)))
        lines.append(fake_df.index._summary())
        lines.append(self.index_summary())

        if len(self.columns) == 0:
            lines.append('Empty {name}'.format(name=type(self).__name__))
@@ -322,7 +337,7 @@ class DataFrame():
        # hack
        if max_cols is None:
            max_cols = pd.get_option('display.max_info_columns',
                len(self.columns) + 1)
                                     len(self.columns) + 1)

        max_rows = pd.get_option('display.max_info_rows', len(self) + 1)

@@ -404,7 +419,6 @@ class DataFrame():

        fmt.buffer_put_lines(buf, lines)

    @property
    def shape(self):
        """
@@ -423,14 +437,32 @@ class DataFrame():

    @property
    def columns(self):
        return pd.Index(self.mappings.source_fields())
        return pd.Index(self._mappings.source_fields())

    @property
    def index(self):
        return self._index

    def set_index(self, index_field):
        copy = self.copy()
        copy._index = ed.Index(index_field)
        return copy

    def index_summary(self):
        head = self.head(1).index[0]
        tail = self.tail(1).index[0]
        index_summary = ', %s to %s' % (pprint_thing(head),
                                        pprint_thing(tail))

        name = "Index"
        return '%s: %s entries%s' % (name, len(self), index_summary)

    @property
    def dtypes(self):
        return self.mappings.dtypes()
        return self._mappings.dtypes()

    def get_dtype_counts(self):
        return self.mappings.get_dtype_counts()
        return self._mappings.get_dtype_counts()

    def count(self):
        """
@@ -446,63 +478,155 @@ class DataFrame():
        for a single document.
        """
        counts = {}
        for field in self.mappings.source_fields():
            exists_query = {"query":{"exists":{"field":field}}}
            field_exists_count = self.client.count(index=self.index_pattern, body=exists_query)
        for field in self._mappings.source_fields():
            exists_query = {"query": {"exists": {"field": field}}}
            field_exists_count = self._client.count(index=self._index_pattern, body=exists_query)
            counts[field] = field_exists_count

        count = pd.Series(data=counts, index=self.mappings.source_fields())
        count = pd.Series(data=counts, index=self._mappings.source_fields())

        return count

    def index_count(self):
        """
        Returns
        -------
        index_count: int
            Count of docs where index_field exists
        """
        exists_query = {"query": {"exists": {"field": self._index.index_field}}}

        index_count = self._client.count(index=self._index_pattern, body=exists_query)

        return index_count

    def _filter_by_columns(self, columns):
        # Return new eland.DataFrame with modified mappings
        mappings = ed.Mappings(mappings=self._mappings, columns=columns)

        return DataFrame(self._client, self._index_pattern, mappings=mappings)

    def __getitem__(self, item):
        # df['a'] -> item == str
        # df['a', 'b'] -> item == (str, str) tuple
        columns = []
        if isinstance(item, str):
            if not self.mappings.is_source_field(item):
                raise TypeError('Column does not exist: [{0}]'.format(item))
            columns.append(item)
        elif isinstance(item, tuple):
            columns.extend(list(item))
        elif isinstance(item, list):
            columns.extend(item)

        if len(columns) > 0:
            # Return new eland.DataFrame with modified mappings
            mappings = ed.Mappings(mappings=self.mappings, columns=columns)

            return DataFrame(self.client, self.index_pattern, mappings=mappings)
        """
        elif isinstance(item, BooleanFilter):
            self._filter = item.build()
            return self
        else:
            raise TypeError('Unsupported expr: [{0}]'.format(item))
        """

    def __getitem__(self, key):
        # NOTE: there is a difference from pandas here.
        # e.g. df['a'] returns pd.Series, df[['a','b']] returns pd.DataFrame
        # we always return DataFrame - TODO maybe create eland.Series at some point...

        # Implementation mainly copied from pandas v0.24.2
        # (https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html)
        key = com.apply_if_callable(key, self)

        # TODO - add slice capabilities - need to add index features first
        # e.g. set index etc.
        # Do we have a slicer (on rows)?
        """
        indexer = convert_to_index_sliceable(self, key)
        if indexer is not None:
            return self._slice(indexer, axis=0)
        # Do we have a (boolean) DataFrame?
        if isinstance(key, DataFrame):
            return self._getitem_frame(key)
        """

        # Do we have a (boolean) 1d indexer?
        """
        if com.is_bool_indexer(key):
            return self._getitem_bool_array(key)
        """

        # We are left with two options: a single key, and a collection of keys,
        columns = []
        if isinstance(key, str):
            if not self._mappings.is_source_field(key):
                raise TypeError('Column does not exist: [{0}]'.format(key))
            columns.append(key)
        elif isinstance(key, list):
            columns.extend(key)
        else:
            raise TypeError('__getitem__ arguments invalid: [{0}]'.format(key))

        return self._filter_by_columns(columns)

    def __len__(self):
        """
        Returns length of info axis, but here we use the index.
        """
        return self.client.count(index=self.index_pattern)
        return self._client.count(index=self._index_pattern)

    def copy(self):
        # TODO - test and validate...may need deep copying
        return ed.DataFrame(self._client,
                            self._index_pattern,
                            self._mappings,
                            self._index)

    # ----------------------------------------------------------------------
    # Rendering Methods

    def __repr__(self):
        """
        Return a string representation for a particular DataFrame.
        From pandas
        """
        return self.to_string()
        buf = StringIO()

        max_rows = pd.get_option("display.max_rows")
        max_cols = pd.get_option("display.max_columns")
        show_dimensions = pd.get_option("display.show_dimensions")
        if pd.get_option("display.expand_frame_repr"):
            width, _ = console.get_console_size()
        else:
            width = None
        self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
                       line_width=width, show_dimensions=show_dimensions)

        return buf.getvalue()

    def to_string(self, buf=None, columns=None, col_space=None, header=True,
                  index=True, na_rep='NaN', formatters=None, float_format=None,
                  sparsify=None, index_names=True, justify=None,
                  max_rows=None, max_cols=None, show_dimensions=True,
                  decimal='.', line_width=None):
        """
        From pandas
        """
        if max_rows == None:
            max_rows = pd.get_option('display.max_rows')

        sdf = self.__fake_dataframe__(max_rows=max_rows+1)

        _show_dimensions = show_dimensions

        if buf is not None:
            _buf = _expand_user(_stringify_path(buf))
        else:
            _buf = StringIO()

        sdf.to_string(buf=_buf, columns=columns,
                      col_space=col_space, na_rep=na_rep,
                      formatters=formatters,
                      float_format=float_format,
                      sparsify=sparsify, justify=justify,
                      index_names=index_names,
                      header=header, index=index,
                      max_rows=max_rows,
                      max_cols=max_cols,
                      show_dimensions=False,  # print this outside of this call
                      decimal=decimal,
                      line_width=line_width)

        if _show_dimensions:
            _buf.write("\n\n[{nrows} rows x {ncols} columns]"
                       .format(nrows=self.index_count(), ncols=len(self.columns)))

        if buf is None:
            result = _buf.getvalue()
            return result

    def __fake_dataframe__(self, max_rows=1):
        head_rows = max_rows / 2 + 1
        head_rows = int(max_rows / 2) + max_rows % 2
        tail_rows = max_rows - head_rows

        head = self.head(max_rows)
        head = self.head(head_rows)
        tail = self.tail(tail_rows)

        num_rows = len(self)

@@ -514,8 +638,9 @@ class DataFrame():
        # to use the pandas IO methods.
        # TODO - if data is indexed by time series, return top/bottom of
        # time series, rather than first max_rows items
        """
        if tail_rows > 0:
            locations = [0, num_rows-tail_rows]
            locations = [0, num_rows - tail_rows]
            lengths = [head_rows, tail_rows]
        else:
            locations = [0]
@@ -526,21 +651,13 @@ class DataFrame():
                       BlockIndex(
                           num_rows, locations, lengths))
             for item in self.columns})

        return sdf

        return head
        """
        return pd.concat([head, tail])


    def to_string(self):
        # TODO - this doesn't return 'notebook' friendly results yet..
        # TODO - don't hard code max_rows - use pandas default/ES default
        max_rows = 60

        df = self.__fake_dataframe__(max_rows=max_rows)

        return df.to_string(max_rows=max_rows, show_dimensions=True)
        return pd.concat([head, tail])


# From pandas.DataFrame
def _put_str(s, space):
    return '{s}'.format(s=s)[:space].ljust(space)
    return '{s}'.format(s=s)[:space].ljust(space)
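To illustrate the column-selection path above: `__getitem__` now accepts a single column name or a list of names, routes both through `_filter_by_columns`, and returns a new `eland.DataFrame` with restricted `Mappings`; no data is exported and no Elasticsearch index is created. A sketch (host and index names are placeholders):

```python
df = ed.read_es('localhost', 'flights')

# Both forms return a new DataFrame proxy with narrowed mappings.
carrier = df['Carrier']           # single column (still an eland.DataFrame)
subset = df[['Dest', 'Carrier']]  # list of columns

# Anything else, e.g. the tuple form df['Dest', 'Carrier'] that the old
# code accepted, now raises:
# TypeError: __getitem__ arguments invalid: [('Dest', 'Carrier')]
```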
eland/index.py (new file, 46 lines)
@@ -0,0 +1,46 @@
"""
class Index

The index for an eland.DataFrame.

Currently, the index is a field that exists in every document in an Elasticsearch index.
For slicing and sorting operations it must be a docvalues field. By default _id is used,
which can't be used for range queries and is inefficient for sorting:

https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-id-field.html
(The value of the _id field is also accessible in aggregations or for sorting,
but doing so is discouraged as it requires loading a lot of data in memory.
In case sorting or aggregating on the _id field is required, it is advised to duplicate
the content of the _id field in another field that has doc_values enabled.)

"""
class Index:
    ID_INDEX_FIELD = '_id'
    ID_SORT_FIELD = '_doc'  # if index field is _id, sort by _doc

    def __init__(self, index_field=None):
        # Calls setter
        self.index_field = index_field

    @property
    def sort_field(self):
        if self._index_field == self.ID_INDEX_FIELD:
            return self.ID_SORT_FIELD
        return self._index_field

    @property
    def is_source_field(self):
        return self._is_source_field

    @property
    def index_field(self):
        return self._index_field

    @index_field.setter
    def index_field(self, index_field):
        if index_field == None:
            self._index_field = Index.ID_INDEX_FIELD
            self._is_source_field = False
        else:
            self._index_field = index_field
            self._is_source_field = True
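A quick illustration of the class above. The `_doc` fallback keeps default sorting cheap (sorting on `_id` itself is what the quoted documentation discourages), and `DataFrame.set_index` simply swaps a new `Index` instance into a copy:

```python
# Default: index by the Elasticsearch _id field, sort by _doc.
idx = Index()
assert idx.index_field == '_id'
assert idx.sort_field == '_doc'
assert idx.is_source_field is False

# Index by a document field: sorting then uses that field's doc_values.
idx = Index('timestamp')
assert idx.sort_field == 'timestamp'
assert idx.is_source_field is True
```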
eland/mappings.py

@@ -64,18 +64,22 @@ class Mappings():

            # Populate capability matrix of fields
            # field_name, es_dtype, pd_dtype, is_searchable, is_aggregatable, is_source
            self.mappings_capabilities = Mappings._create_capability_matrix(all_fields, source_fields, all_fields_caps)
            self._mappings_capabilities = Mappings._create_capability_matrix(all_fields, source_fields, all_fields_caps)
        else:
            # Reference object and restrict mapping columns
            self.mappings_capabilities = mappings.mappings_capabilities.loc[columns]
            if columns is not None:
                # Reference object and restrict mapping columns
                self._mappings_capabilities = mappings._mappings_capabilities.loc[columns]
            else:
                # straight copy
                self._mappings_capabilities = mappings._mappings_capabilities.copy()

        # Cache source field types for efficient lookup
        # (this massively improves performance of DataFrame.flatten)
        self.source_field_pd_dtypes = {}
        self._source_field_pd_dtypes = {}

        for field_name in self.mappings_capabilities[self.mappings_capabilities._source == True].index:
            pd_dtype = self.mappings_capabilities.loc[field_name]['pd_dtype']
            self.source_field_pd_dtypes[field_name] = pd_dtype
        for field_name in self._mappings_capabilities[self._mappings_capabilities._source == True].index:
            pd_dtype = self._mappings_capabilities.loc[field_name]['pd_dtype']
            self._source_field_pd_dtypes[field_name] = pd_dtype

    def _extract_fields_from_mapping(mappings, source_only=False):
        """
@@ -262,24 +266,29 @@ class Mappings():
        all_fields: list
            All typed fields in the index mapping
        """
        return self.mappings_capabilities.index.tolist()
        return self._mappings_capabilities.index.tolist()

    def field_capabilities(self, field_name):
        """
        Parameters
        ----------
        field_name: str

        """

    def pd_dtypes_groupby_source_fields(self):
        """
        Returns
        -------
        groups: dict
            Calls pandas.core.groupby.GroupBy.groups for _source fields
            E.g.
            {
            'bool': Index(['Cancelled', 'FlightDelay'], dtype='object'),
            'datetime64[ns]': Index(['timestamp'], dtype='object'),
            'float64': Index(['AvgTicketPrice', 'DistanceKilometers', 'DistanceMiles',...
            }
        """
        return self.mappings_capabilities[self.mappings_capabilities._source == True].groupby('pd_dtype').groups

    def pd_dtype
        """
        mappings_capabilities: pd.Series with index values:
            _source: bool
                Is this field name a top-level source field?
            es_dtype: str
                The Elasticsearch data type
            pd_dtype: str
                The pandas data type
            searchable: bool
                Is the field searchable in Elasticsearch?
            aggregatable: bool
                Is the field aggregatable in Elasticsearch?
        """
        return self._mappings_capabilities.loc[field_name]

    def source_field_pd_dtype(self, field_name):
        """
@@ -297,9 +306,9 @@ class Mappings():
        pd_dtype = 'object'
        is_source_field = False

        if field_name in self.source_field_pd_dtypes:
        if field_name in self._source_field_pd_dtypes:
            is_source_field = True
            pd_dtype = self.source_field_pd_dtypes[field_name]
            pd_dtype = self._source_field_pd_dtypes[field_name]

        return is_source_field, pd_dtype

@@ -316,7 +325,7 @@ class Mappings():
        """
        is_source_field = False

        if field_name in self.source_field_pd_dtypes:
        if field_name in self._source_field_pd_dtypes:
            is_source_field = True

        return is_source_field
@@ -328,9 +337,9 @@ class Mappings():
        numeric_source_fields: list of str
            List of source fields where pd_dtype == (int64 or float64)
        """
        return self.mappings_capabilities[(self.mappings_capabilities._source == True) &
                                          ((self.mappings_capabilities.pd_dtype == 'int64') |
                                           (self.mappings_capabilities.pd_dtype == 'float64'))].index.tolist()
        return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
                                           ((self._mappings_capabilities.pd_dtype == 'int64') |
                                            (self._mappings_capabilities.pd_dtype == 'float64'))].index.tolist()

    def source_fields(self):
        """
@@ -339,7 +348,7 @@ class Mappings():
        source_fields: list of str
            List of source fields
        """
        return self.source_field_pd_dtypes.keys()
        return self._source_field_pd_dtypes.keys()

    def count_source_fields(self):
        """
@@ -357,7 +366,7 @@ class Mappings():
        dtypes: pd.Series
            Source field name + pd_dtype
        """
        return pd.Series(self.source_field_pd_dtypes)
        return pd.Series(self._source_field_pd_dtypes)

    def get_dtype_counts(self):
        """
@@ -368,5 +377,5 @@ class Mappings():
        get_dtype_counts : Series
            Series with the count of columns with each dtype.
        """
        return pd.Series(self.mappings_capabilities[self.mappings_capabilities._source == True].groupby('pd_dtype')[
        return pd.Series(self._mappings_capabilities[self._mappings_capabilities._source == True].groupby('pd_dtype')[
            '_source'].count().to_dict())
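For orientation, the capability matrix cached above is a pandas DataFrame with one row per mapped field; `field_capabilities` returns a single row as a Series. An illustrative excerpt, with values matching the `city`/`city.raw` assertions in the test below:

```python
# _mappings_capabilities (excerpt):
#
#            _source  es_dtype  pd_dtype  searchable  aggregatable
# city          True      text    object        True         False
# city.raw     False   keyword    object        True          True
caps = mappings.field_capabilities('city')
assert caps['es_dtype'] == 'text' and not caps['aggregatable']
```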
@@ -69,3 +69,22 @@ class TestMapping(TestData):
        expected_get_dtype_counts = pd.Series({'datetime64[ns]': 1, 'float64': 1, 'int64': 5, 'object': 11})

        assert_series_equal(expected_get_dtype_counts, mappings.get_dtype_counts())

    def test_mapping_capabilities(self):
        mappings = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)

        field_capabilities = mappings.field_capabilities('city')

        assert True == field_capabilities['_source']
        assert 'text' == field_capabilities['es_dtype']
        assert 'object' == field_capabilities['pd_dtype']
        assert True == field_capabilities['searchable']
        assert False == field_capabilities['aggregatable']

        field_capabilities = mappings.field_capabilities('city.raw')

        assert False == field_capabilities['_source']
        assert 'keyword' == field_capabilities['es_dtype']
        assert 'object' == field_capabilities['pd_dtype']
        assert True == field_capabilities['searchable']
        assert True == field_capabilities['aggregatable']
eland/tests/common.py

@@ -16,6 +16,7 @@ from eland.tests import FLIGHTS_DF_FILE_NAME, FLIGHTS_INDEX_NAME,\
_pd_flights = pd.read_json(FLIGHTS_DF_FILE_NAME).sort_index()
_pd_flights['timestamp'] = \
    pd.to_datetime(_pd_flights['timestamp'])
_pd_flights.index = _pd_flights.index.map(str)  # make index 'object' not int
_ed_flights = ed.read_es(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME)

_pd_ecommerce = pd.read_json(ECOMMERCE_DF_FILE_NAME).sort_index()
@@ -24,6 +25,7 @@ _pd_ecommerce['order_date'] = \
_pd_ecommerce['products.created_on'] = \
    _pd_ecommerce['products.created_on'].apply(lambda x: pd.to_datetime(x))
_pd_ecommerce.insert(2, 'customer_birth_date', None)
_pd_ecommerce.index = _pd_ecommerce.index.map(str)  # make index 'object' not int
_pd_ecommerce['customer_birth_date'].astype('datetime64')
_ed_ecommerce = ed.read_es(ELASTICSEARCH_HOST, ECOMMERCE_INDEX_NAME)
eland/tests/dataframe/test_getitem_pytest.py (new file, 63 lines)

@@ -0,0 +1,63 @@
# File called _pytest for PyCharm compatibility
from eland.tests.common import TestData

import pandas as pd
import io

from pandas.util.testing import (
    assert_series_equal, assert_frame_equal)

class TestDataFrameGetItem(TestData):

    def test_getitem_basic(self):
        # Test 1 attribute
        pd_carrier = self.pd_flights()['Carrier']
        ed_carrier = self.ed_flights()['Carrier']

        # pandas returns a Series here
        assert_frame_equal(pd.DataFrame(pd_carrier.head(100)), ed_carrier.head(100))

        pd_3_items = self.pd_flights()[['Dest', 'Carrier', 'FlightDelay']]
        ed_3_items = self.ed_flights()[['Dest', 'Carrier', 'FlightDelay']]

        assert_frame_equal(pd_3_items.head(100), ed_3_items.head(100))

        # Test numerics
        numerics = ['DistanceMiles', 'AvgTicketPrice', 'FlightTimeMin']
        ed_numerics = self.ed_flights()[numerics]
        pd_numerics = self.pd_flights()[numerics]

        assert_frame_equal(pd_numerics.head(100), ed_numerics.head(100))

        # just test headers
        ed_numerics_describe = ed_numerics.describe()
        assert ed_numerics_describe.columns.tolist() == numerics

    def test_getattr_basic(self):
        # Test 1 attribute
        pd_carrier = self.pd_flights().Carrier
        # ed_carrier = self.ed_flights().Carrier

        print(type(pd_carrier))
        print(pd_carrier)

    def test_boolean(self):
        # Test 1 attribute
        pd_carrier = self.pd_flights()['Carrier == "Kibana Airlines"']
        # ed_carrier = self.ed_flights().Carrier

        print(type(pd_carrier))
        print(pd_carrier)

    def test_loc(self):
        pd = self.pd_flights().loc[10:15, ['Dest', 'Carrier']]

        print(type(pd))
        print(pd)

        pd = self.pd_flights().loc[10]

        print(type(pd))
        print(pd)
@@ -10,8 +10,8 @@ from pandas.util.testing import (
class TestDataFrameIndexing(TestData):

    def test_mapping(self):
        ed_flights_mappings = pd.DataFrame(self.ed_flights().mappings.mappings_capabilities
                                           [self.ed_flights().mappings.mappings_capabilities._source==True]
        ed_flights_mappings = pd.DataFrame(self.ed_flights()._mappings._mappings_capabilities
                                           [self.ed_flights()._mappings._mappings_capabilities._source==True]
                                           ['pd_dtype'])
        pd_flights_mappings = pd.DataFrame(self.pd_flights().dtypes, columns=['pd_dtype'])

@@ -25,6 +25,8 @@ class TestDataFrameIndexing(TestData):
        pd_flights_head = self.pd_flights().head()
        ed_flights_head = self.ed_flights().head()

        print(ed_flights_head)

        assert_frame_equal(pd_flights_head, ed_flights_head)

        pd_ecommerce_head = self.pd_ecommerce().head()
@@ -32,10 +34,25 @@ class TestDataFrameIndexing(TestData):

        assert_frame_equal(pd_ecommerce_head, ed_ecommerce_head)

    def test_tail(self):
        pd_flights_tail = self.pd_flights().tail()
        ed_flights_tail = self.ed_flights().tail()

        print(ed_flights_tail)

        assert_frame_equal(pd_flights_tail, ed_flights_tail)

        pd_ecommerce_tail = self.pd_ecommerce().tail()
        ed_ecommerce_tail = self.ed_ecommerce().tail()

        assert_frame_equal(pd_ecommerce_tail, ed_ecommerce_tail)

    def test_describe(self):
        pd_flights_describe = self.pd_flights().describe()
        ed_flights_describe = self.ed_flights().describe()

        print(ed_flights_describe)

        # TODO - this fails now as ES aggregations are approximate
        # if ES percentile agg uses
        # "hdr": {
@@ -47,6 +64,8 @@ class TestDataFrameIndexing(TestData):
        pd_ecommerce_describe = self.pd_ecommerce().describe()
        ed_ecommerce_describe = self.ed_ecommerce().describe()

        print(ed_ecommerce_describe)

        # We don't compare ecommerce here as the default dtypes in pandas from read_json
        # don't match the mapping types. This is mainly because the products field is
        # nested and so can be treated as a multi-field in ES, but not in pandas
@@ -57,52 +76,7 @@ class TestDataFrameIndexing(TestData):

    def test_to_string(self):
        print(self.ed_flights())

    def test_getitem(self):
        # Test 1 attribute
        ed_carrier = self.ed_flights()['Carrier']

        carrier_head = ed_carrier.head(5)

        carrier_head_expected = pd.DataFrame(
            {'Carrier': [
                'Kibana Airlines',
                'Logstash Airways',
                'Logstash Airways',
                'Kibana Airlines',
                'Kibana Airlines'
            ]})

        assert_frame_equal(carrier_head_expected, carrier_head)

        # carrier_to_string = ed_carrier.to_string()
        # print(carrier_to_string)

        # Test multiple attributes (out of order)
        ed_3_items = self.ed_flights()['Dest', 'Carrier', 'FlightDelay']

        ed_3_items_head = ed_3_items.head(5)

        ed_3_items_expected = pd.DataFrame(dict(
            Dest={0: 'Sydney Kingsford Smith International Airport', 1: 'Venice Marco Polo Airport',
                  2: 'Venice Marco Polo Airport', 3: "Treviso-Sant'Angelo Airport",
                  4: "Xi'an Xianyang International Airport"},
            Carrier={0: 'Kibana Airlines', 1: 'Logstash Airways', 2: 'Logstash Airways', 3: 'Kibana Airlines',
                     4: 'Kibana Airlines'},
            FlightDelay={0: False, 1: False, 2: False, 3: True, 4: False}))

        assert_frame_equal(ed_3_items_expected, ed_3_items_head)

        # ed_3_items_to_string = ed_3_items.to_string()
        # print(ed_3_items_to_string)

        # Test numerics
        numerics = ['DistanceMiles', 'AvgTicketPrice', 'FlightTimeMin']
        ed_numerics = self.ed_flights()[numerics]

        # just test headers
        ed_numerics_describe = ed_numerics.describe()
        assert ed_numerics_describe.columns.tolist() == numerics
        print(self.ed_flights().to_string())

    def test_info(self):
        ed_flights_info_buf = io.StringIO()
@@ -111,6 +85,8 @@ class TestDataFrameIndexing(TestData):
        self.ed_flights().info(buf=ed_flights_info_buf)
        self.pd_flights().info(buf=pd_flights_info_buf)

        print(ed_flights_info_buf.getvalue())

        ed_flights_info = (ed_flights_info_buf.getvalue().splitlines())
        pd_flights_info = (pd_flights_info_buf.getvalue().splitlines())

@@ -148,7 +124,7 @@ class TestDataFrameIndexing(TestData):

        assert_series_equal(pd_flights_get_dtype_counts, ed_flights_get_dtype_counts)

    def test_properties(self):
    def test_get_properties(self):
        pd_flights_shape = self.pd_flights().shape
        ed_flights_shape = self.ed_flights().shape

@@ -164,3 +140,16 @@ class TestDataFrameIndexing(TestData):

        assert_series_equal(pd_flights_dtypes, ed_flights_dtypes)

    def test_index(self):
        pd_flights = self.pd_flights()
        pd_flights_timestamp = pd_flights.set_index('timestamp')
        pd_flights.info()
        pd_flights_timestamp.info()
        pd_flights.info()

        ed_flights = self.ed_flights()
        ed_flights_timestamp = ed_flights.set_index('timestamp')
        ed_flights.info()
        ed_flights_timestamp.info()
        ed_flights.info()
@@ -33,7 +33,8 @@ def _setup_data(es):
        # make timestamp datetime 2018-01-01T12:09:35
        # values['timestamp'] = datetime.strptime(values['timestamp'], '%Y-%m-%dT%H:%M:%S')

        action = {'_index': index_name, '_source': values}
        # Use integer as id field for repeatable results
        action = {'_index': index_name, '_source': values, '_id': str(n)}

        actions.append(action)
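For context, the actions built in `_setup_data` follow the standard `elasticsearch.helpers.bulk` action format; supplying a deterministic `_id` means re-running the setup overwrites documents instead of duplicating them. A minimal sketch (host and index name are placeholders):

```python
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch('localhost')

# Explicit _id values make the load repeatable: the same documents are
# overwritten on every run.
actions = [
    {'_index': 'test_index', '_source': {'city': 'Amsterdam'}, '_id': '0'},
    {'_index': 'test_index', '_source': {'city': 'London'}, '_id': '1'},
]
bulk(es, actions)
```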
File diff suppressed because it is too large.