mirror of https://github.com/elastic/eland.git
Adding __repr__ method based on SparseDataFrames
commit d4250640f1
parent c723633526
@@ -21,3 +21,8 @@ class Client():
    def field_caps(self, **kwargs):
        return self.es.field_caps(**kwargs)

    def count(self, **kwargs):
        count_json = self.es.count(**kwargs)
        return count_json['count']
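For context, a minimal usage sketch of what these thin wrappers delegate to in elasticsearch-py. This is not part of the commit; the host URL and the 'flights' index name are illustrative assumptions.

# Illustrative only: the underlying elasticsearch-py calls that
# Client.field_caps() and Client.count() proxy to.
from elasticsearch import Elasticsearch

es = Elasticsearch('http://localhost:9200')  # assumed local cluster

# field_caps(**kwargs) passes straight through to the client method
caps = es.field_caps(index='flights', fields='*')

# count(**kwargs) returns only the 'count' value from the JSON response
total = es.count(index='flights')['count']
print(total)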
@@ -30,6 +30,8 @@ from elasticsearch_dsl import Search

import pandas as pd

from pandas.core.arrays.sparse import BlockIndex

class DataFrame():
    """
    pandas.DataFrame like API that proxies into Elasticsearch index(es).
@@ -251,3 +253,71 @@ class DataFrame():
        df = pd.DataFrame(data=results, index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])

        return df

    @property
    def shape(self):
        """
        Return a tuple representing the dimensionality of the DataFrame.

        Returns
        -------
        shape: tuple
            0 - number of rows
            1 - number of columns
        """
        num_rows = len(self)
        num_columns = len(self.columns)

        return num_rows, num_columns

    @property
    def columns(self):
        return self.mappings.source_fields()

    def __getitem__(self, item):
        if isinstance(item, str):
            if not self.mappings.is_source_field(item):
                raise TypeError('Column does not exist: [{0}]'.format(item))
            return Column(item)
        elif isinstance(item, BooleanFilter):
            self._filter = item.build()
            return self
        else:
            raise TypeError('Unsupported expr: [{0}]'.format(item))

    def __len__(self):
        """
        Returns length of info axis, but here we use the index.
        """
        return self.client.count(index=self.index_pattern)

    # ----------------------------------------------------------------------
    # Rendering Methods

    def __repr__(self):
        # The default for this should come from pd.options.display.max_rows
        max_rows = 60
        head_rows = max_rows // 2
        tail_rows = max_rows - head_rows

        head = self.head(max_rows)

        num_rows = len(self)

        if num_rows > max_rows:
            # If we have a lot of rows, create a SparseDataFrame and use
            # pandas to_string logic.
            # NOTE: this sparse DataFrame can't be used as the middle
            # section is all NaNs. However, it potentially gives us a nice
            # way to use the pandas IO methods.
            sdf = pd.DataFrame({item: pd.SparseArray(data=head[item],
                                                     sparse_index=BlockIndex(
                                                         num_rows,
                                                         [0, num_rows - tail_rows],
                                                         [head_rows, tail_rows]))
                                for item in self.columns})

            # TODO - don't hard code max_rows - use pandas default/ES default
            return sdf.to_string(max_rows=max_rows)

        return head.to_string(max_rows=max_rows)
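The BlockIndex trick above is easier to see in isolation. Below is a small self-contained sketch, not part of the commit, with made-up row counts, assuming a pandas version where BlockIndex is importable from pandas.core.arrays.sparse as in this diff.

# Illustration of the head/tail rendering trick: two dense blocks at the
# start and end of the column, NaN in between.
import numpy as np
import pandas as pd
from pandas.core.arrays.sparse import BlockIndex

num_rows, head_rows, tail_rows = 10, 3, 3

# Dense blocks cover rows [0, 3) and [7, 10); everything in between takes
# the fill value (NaN for float data).
index = BlockIndex(num_rows, [0, num_rows - tail_rows], [head_rows, tail_rows])

# Only head_rows + tail_rows values need to be supplied for the 10-row column.
values = np.arange(head_rows + tail_rows, dtype=float)
sdf = pd.DataFrame({'a': pd.SparseArray(data=values, sparse_index=index)})

# to_string(max_rows=...) elides the all-NaN middle, printing a head ... tail view.
print(sdf.to_string(max_rows=6))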
@@ -298,3 +298,21 @@ class Mappings():
                                           ((self.mappings_capabilities.pd_dtype == 'int64') |
                                            (self.mappings_capabilities.pd_dtype == 'float64'))].index.tolist()

    def source_fields(self):
        """
        Returns
        -------
        source_fields: list of str
            List of source fields
        """
        return self.mappings_capabilities[self.mappings_capabilities._source == True].index.tolist()

    def count_source_fields(self):
        """
        Returns
        -------
        count_source_fields: int
            Number of source fields in mapping
        """
        return len(self.mappings_capabilities[self.mappings_capabilities._source == True].index)
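As a reading aid, here is a tiny sketch (not from the repository) of the boolean-mask pattern both helpers rely on; the field names, dtypes and _source flags below are invented.

import pandas as pd

# Hypothetical mappings_capabilities frame: one row per field, indexed by field name.
mappings_capabilities = pd.DataFrame(
    {'_source': [True, True, False],
     'pd_dtype': ['int64', 'object', 'object']},
    index=['total_quantity', 'customer_full_name', 'customer_full_name.keyword'])

# source_fields(): index labels of rows where _source is True
print(mappings_capabilities[mappings_capabilities._source == True].index.tolist())

# count_source_fields(): the length of that same filtered index
print(len(mappings_capabilities[mappings_capabilities._source == True].index))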
@@ -438,6 +438,7 @@ TEST_MAPPING1_EXPECTED = {
}

TEST_MAPPING1_EXPECTED_DF = pd.DataFrame.from_dict(data=TEST_MAPPING1_EXPECTED, orient='index', columns=['es_dtype'])
TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT = len(TEST_MAPPING1_EXPECTED_DF.index) - 4

TEST_NESTED_USER_GROUP_INDEX_NAME = 'nested_user_group'
TEST_NESTED_USER_GROUP_MAPPING = {
@@ -12,11 +12,13 @@ class TestMapping():

    # Requires 'setup_tests.py' to be run prior to this
    def test_mapping(self):
        mapping = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)
        mappings = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)

        assert mapping.all_fields() == TEST_MAPPING1_EXPECTED_DF.index.tolist()
        assert mappings.all_fields() == TEST_MAPPING1_EXPECTED_DF.index.tolist()

        assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, pd.DataFrame(mapping.mappings_capabilities['es_dtype']))
        assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, pd.DataFrame(mappings.mappings_capabilities['es_dtype']))

        assert mappings.count_source_fields() == TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT
@@ -52,3 +52,10 @@ class TestDataFrameIndexing(TestData):
    # don't match the mapping types. This is mainly because the products field is
    # nested and so can be treated as a multi-field in ES, but not in pandas

    def test_size(self):
        assert self.pd_flights().shape == self.ed_flights().shape
        assert len(self.pd_flights()) == len(self.ed_flights())

    def test_to_string(self):
        print(self.ed_flights())