Adding __repr__ method based on SparseDataFrames

This commit is contained in:
Stephen Dodson 2019-06-24 13:34:06 +00:00
parent c723633526
commit d4250640f1
7 changed files with 107 additions and 4 deletions

View File

@ -21,3 +21,8 @@ class Client():
def field_caps(self, **kwargs):
    """
    Forward a field-capabilities request to the wrapped Elasticsearch client.

    Parameters
    ----------
    **kwargs
        Passed through unchanged to ``self.es.field_caps``.

    Returns
    -------
    Whatever ``self.es.field_caps`` returns.
    """
    es_client = self.es
    response = es_client.field_caps(**kwargs)
    return response
def count(self, **kwargs):
    """
    Run a count request and return only the document count.

    Parameters
    ----------
    **kwargs
        Passed through unchanged to ``self.es.count``.

    Returns
    -------
    The value stored under the ``'count'`` key of the response.
    """
    return self.es.count(**kwargs)['count']

View File

@ -30,6 +30,8 @@ from elasticsearch_dsl import Search
import pandas as pd
from pandas.core.arrays.sparse import BlockIndex
class DataFrame():
"""
pandas.DataFrame like API that proxies into Elasticsearch index(es).
@ -251,3 +253,71 @@ class DataFrame():
df = pd.DataFrame(data=results, index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
return df
@property
def shape(self):
    """
    Return a tuple representing the dimensionality of the DataFrame.

    Returns
    -------
    shape: tuple
        0 - number of rows
        1 - number of columns
    """
    num_rows = len(self)
    # ``self.columns`` is the collection of column names; the shape needs
    # how many there are, not the names themselves.
    num_columns = len(self.columns)

    return num_rows, num_columns
@property
def columns(self):
    """
    Names of the columns exposed by this DataFrame.

    Returns
    -------
    The result of ``self.mappings.source_fields()``.
    """
    source_field_names = self.mappings.source_fields()
    return source_field_names
def __getitem__(self, item):
    """
    Select a column by name, or apply a boolean filter to this DataFrame.

    Parameters
    ----------
    item: str or BooleanFilter
        - str: name of the column to select
        - BooleanFilter: filter expression to apply

    Returns
    -------
    Column
        When ``item`` is a column name that is a source field.
    DataFrame
        ``self``, with ``_filter`` set to the built filter, when ``item``
        is a BooleanFilter.

    Raises
    ------
    TypeError
        If the named column is not a source field, or ``item`` is neither
        a str nor a BooleanFilter.
    """
    if isinstance(item, str):
        # NOTE(review): is_source_field is treated as a predicate returning
        # a truth value for the given field name - confirm against Mappings.
        if not self.mappings.is_source_field(item):
            raise TypeError('Column does not exist: [{0}]'.format(item))
        return Column(item)

    if isinstance(item, BooleanFilter):
        # The built filter is stored on this instance and the same
        # instance is returned (no copy is made).
        self._filter = item.build()
        return self

    raise TypeError('Unsupported expr: [{0}]'.format(item))
def __len__(self):
    """
    Return the number of rows.

    The value is whatever ``self.client.count`` reports for
    ``self.index_pattern``.
    """
    row_count = self.client.count(index=self.index_pattern)
    return row_count
# ----------------------------------------------------------------------
# Rendering Methods
def __repr__(self):
    """
    Render the DataFrame as a string showing at most ``max_rows`` rows.

    Returns
    -------
    str
        ``to_string`` output of the first rows when the frame has no more
        than ``max_rows`` rows; otherwise ``to_string`` output of a sparse
        stand-in frame built from those rows.
    """
    # TODO - don't hard code max_rows - use pandas default/ES default
    # (the pandas equivalent is display.options.max_rows)
    max_rows = 60
    # Floor division: these are row counts used as BlockIndex block
    # locations/lengths, so they must be integers (and must sum to max_rows
    # even when max_rows is odd).
    head_rows = max_rows // 2
    tail_rows = max_rows - head_rows

    head = self.head(max_rows)

    num_rows = len(self)

    if num_rows > max_rows:
        # If we have a lot of rows, create a sparse DataFrame and use
        # pandas to_string logic
        # NOTE: this sparse DataFrame can't be used as the middle
        # section is all NaNs. However, it gives us potentially a nice way
        # to use the pandas IO methods.
        # NOTE(review): the values placed in the trailing block come from
        # self.head(max_rows), not from the actual tail - confirm intended.
        # The same block layout applies to every column, so build it once.
        sparse_index = BlockIndex(num_rows,
                                  [0, num_rows - tail_rows],
                                  [head_rows, tail_rows])
        sdf = pd.DataFrame({item: pd.SparseArray(data=head[item],
                                                 sparse_index=sparse_index)
                            for item in self.columns})

        return sdf.to_string(max_rows=max_rows)

    return head.to_string(max_rows=max_rows)

View File

@ -298,3 +298,21 @@ class Mappings():
((self.mappings_capabilities.pd_dtype == 'int64') |
(self.mappings_capabilities.pd_dtype == 'float64'))].index.tolist()
def source_fields(self):
    """
    List the fields flagged as source fields in the mapping capabilities.

    Returns
    -------
    source_fields: list of str
        Index labels of the ``mappings_capabilities`` rows whose
        ``_source`` value equals True
    """
    capabilities = self.mappings_capabilities
    # Element-wise comparison producing a boolean mask over the rows
    is_source = capabilities._source == True
    return capabilities[is_source].index.tolist()
def count_source_fields(self):
    """
    Count the fields flagged as source fields in the mapping capabilities.

    Returns
    -------
    count_source_fields: int
        Number of ``mappings_capabilities`` rows whose ``_source`` value
        equals True
    """
    capabilities = self.mappings_capabilities
    # Element-wise comparison producing a boolean mask over the rows
    source_rows = capabilities[capabilities._source == True]
    return len(source_rows.index)

View File

@ -438,6 +438,7 @@ TEST_MAPPING1_EXPECTED = {
}
# TEST_MAPPING1_EXPECTED as a DataFrame: one row per dict key (orient='index')
# and a single column named 'es_dtype'.
TEST_MAPPING1_EXPECTED_DF = pd.DataFrame.from_dict(data=TEST_MAPPING1_EXPECTED, orient='index', columns=['es_dtype'])
# Expected source-field count: the number of rows above minus 4.
# NOTE(review): presumably 4 of the expected fields are not source fields
# (e.g. multi-fields) - confirm against TEST_MAPPING1_EXPECTED.
TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT = len(TEST_MAPPING1_EXPECTED_DF.index) - 4
# Index name that accompanies TEST_NESTED_USER_GROUP_MAPPING.
TEST_NESTED_USER_GROUP_INDEX_NAME = 'nested_user_group'
TEST_NESTED_USER_GROUP_MAPPING = {

View File

@ -12,11 +12,13 @@ class TestMapping():
# Requires 'setup_tests.py' to be run prior to this
def test_mapping(self):
    """
    Check the Mappings built for TEST_MAPPING1_INDEX_NAME.

    Verifies the list of all fields, the ``es_dtype`` column of the
    mapping capabilities, and the number of source fields against the
    TEST_MAPPING1_EXPECTED_* constants.
    """
    # Build the Mappings once and run every check against it.
    mappings = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)

    assert mappings.all_fields() == TEST_MAPPING1_EXPECTED_DF.index.tolist()

    assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, pd.DataFrame(mappings.mappings_capabilities['es_dtype']))

    assert mappings.count_source_fields() == TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT

View File

@ -52,3 +52,10 @@ class TestDataFrameIndexing(TestData):
# don't match the mapping types. This is mainly because the products field is
# nested and so can be treated as a multi-field in ES, but not in pandas
def test_size(self):
    """
    The Elasticsearch-backed flights frame must report the same shape and
    length as the pandas flights frame.
    """
    expected_shape = self.pd_flights().shape
    actual_shape = self.ed_flights().shape
    assert expected_shape == actual_shape

    expected_len = len(self.pd_flights())
    actual_len = len(self.ed_flights())
    assert expected_len == actual_len
def test_to_string(self):
    """
    Smoke test: printing the Elasticsearch-backed flights frame must not raise.
    """
    flights = self.ed_flights()
    print(flights)