mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Adding __repr__ method based on SparseDataFrames
This commit is contained in:
parent
c723633526
commit
d4250640f1
@ -1,4 +1,4 @@
|
|||||||
from .utils import *
|
from .utils import *
|
||||||
from .frame import *
|
from .frame import *
|
||||||
from .client import *
|
from .client import *
|
||||||
from .mappings import *
|
from .mappings import *
|
||||||
|
@ -21,3 +21,8 @@ class Client():
|
|||||||
|
|
||||||
def field_caps(self, **kwargs):
|
def field_caps(self, **kwargs):
|
||||||
return self.es.field_caps(**kwargs)
|
return self.es.field_caps(**kwargs)
|
||||||
|
|
||||||
|
def count(self, **kwargs):
    """
    Return the document count reported by the wrapped Elasticsearch client.

    Parameters
    ----------
    **kwargs
        Forwarded unchanged to ``self.es.count``.

    Returns
    -------
    count
        The value stored under the ``'count'`` key of the response.
    """
    return self.es.count(**kwargs)['count']
|
||||||
|
|
||||||
|
@ -30,6 +30,8 @@ from elasticsearch_dsl import Search
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
from pandas.core.arrays.sparse import BlockIndex
|
||||||
|
|
||||||
class DataFrame():
|
class DataFrame():
|
||||||
"""
|
"""
|
||||||
pandas.DataFrame like API that proxies into Elasticsearch index(es).
|
pandas.DataFrame like API that proxies into Elasticsearch index(es).
|
||||||
@ -251,3 +253,71 @@ class DataFrame():
|
|||||||
df = pd.DataFrame(data=results, index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
|
df = pd.DataFrame(data=results, index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
|
||||||
|
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
@property
def shape(self):
    """
    Return a tuple representing the dimensionality of the DataFrame.

    Returns
    -------
    shape: tuple
        0 - number of rows
        1 - number of columns
    """
    num_rows = len(self)
    # self.columns holds the column names; the shape needs how many there
    # are, not the names themselves.
    num_columns = len(self.columns)

    return num_rows, num_columns
|
||||||
|
|
||||||
|
@property
def columns(self):
    """
    Column names of this DataFrame.

    Returns
    -------
    columns
        Whatever ``self.mappings.source_fields()`` returns.
    """
    source_fields = self.mappings.source_fields()
    return source_fields
|
||||||
|
|
||||||
|
def __getitem__(self, item):
    """
    Select a column by name, or apply a boolean filter to this DataFrame.

    Parameters
    ----------
    item: str or BooleanFilter
        A column name, or a filter expression.

    Returns
    -------
    Column
        When ``item`` is a str naming an existing column.
    DataFrame
        ``self``, when ``item`` is a BooleanFilter; the built filter is
        stored in ``self._filter`` (the DataFrame is modified in place).

    Raises
    ------
    TypeError
        If ``item`` is a str that is not one of ``self.columns``, or if
        ``item`` is neither a str nor a BooleanFilter.
    """
    if isinstance(item, str):
        # Check the name against the known columns. The previous code tested
        # membership of the name in the *result* of is_source_field(name),
        # which is not a collection of column names.
        if item not in self.columns:
            raise TypeError('Column does not exist: [{0}]'.format(item))
        return Column(item)

    if isinstance(item, BooleanFilter):
        self._filter = item.build()
        return self

    raise TypeError('Unsupported expr: [{0}]'.format(item))
|
||||||
|
|
||||||
|
def __len__(self):
    """
    Return the number of rows.

    The value is whatever ``self.client.count`` reports for
    ``self.index_pattern``.
    """
    row_count = self.client.count(index=self.index_pattern)
    return row_count
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Rendering Methods
|
||||||
|
|
||||||
|
def __repr__(self):
    """
    Render the DataFrame as a string using pandas' ``to_string``.

    At most ``max_rows`` rows are fetched via ``self.head``. If the
    DataFrame has no more rows than that, the fetched frame is rendered
    directly. Otherwise a sparse pandas DataFrame with ``len(self)`` rows
    is built, populated only in a leading and a trailing block, so that
    pandas produces its usual truncated layout.

    Returns
    -------
    str
    """
    # TODO - don't hard code max_rows - use pandas default/ES default
    # (the pandas equivalent is display.options.max_rows)
    max_rows = 60
    # Integer division: these are row counts used as block positions and
    # lengths, so they must stay ints on Python 3.
    head_rows = max_rows // 2
    tail_rows = max_rows - head_rows

    head = self.head(max_rows)

    num_rows = len(self)

    if num_rows <= max_rows:
        return head.to_string(max_rows=max_rows)

    # If we have a lot of rows, create a sparse DataFrame and use pandas
    # to_string logic.
    # NOTE: this sparse DataFrame can't be used as the middle section is all
    # fill values. However, it gives us potentially a nice way to use the
    # pandas IO methods.
    # NOTE(review): the trailing block is filled from `head` as well, so the
    # values printed for the last rows are rows head_rows..max_rows of
    # `head`, not data fetched from the end of the index - confirm intended.
    sparse_index = BlockIndex(num_rows,
                              [0, num_rows - tail_rows],
                              [head_rows, tail_rows])
    sdf = pd.DataFrame({item: pd.arrays.SparseArray(data=head[item],
                                                    sparse_index=sparse_index)
                        for item in self.columns})

    return sdf.to_string(max_rows=max_rows)
|
||||||
|
|
||||||
|
@ -298,3 +298,21 @@ class Mappings():
|
|||||||
((self.mappings_capabilities.pd_dtype == 'int64') |
|
((self.mappings_capabilities.pd_dtype == 'int64') |
|
||||||
(self.mappings_capabilities.pd_dtype == 'float64'))].index.tolist()
|
(self.mappings_capabilities.pd_dtype == 'float64'))].index.tolist()
|
||||||
|
|
||||||
|
def source_fields(self):
    """
    List the fields flagged as source fields in the mapping capabilities.

    Returns
    -------
    source_fields: list of str
        Index labels of the rows of ``self.mappings_capabilities`` whose
        ``_source`` value equals True
    """
    capabilities = self.mappings_capabilities
    is_source = capabilities._source == True
    return capabilities[is_source].index.tolist()
|
||||||
|
|
||||||
|
def count_source_fields(self):
    """
    Count the fields flagged as source fields in the mapping capabilities.

    Returns
    -------
    count_source_fields: int
        Number of rows of ``self.mappings_capabilities`` whose ``_source``
        value equals True
    """
    capabilities = self.mappings_capabilities
    source_rows = capabilities[capabilities._source == True]
    return len(source_rows.index)
|
||||||
|
|
||||||
|
@ -438,6 +438,7 @@ TEST_MAPPING1_EXPECTED = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
TEST_MAPPING1_EXPECTED_DF = pd.DataFrame.from_dict(data=TEST_MAPPING1_EXPECTED, orient='index', columns=['es_dtype'])
|
TEST_MAPPING1_EXPECTED_DF = pd.DataFrame.from_dict(data=TEST_MAPPING1_EXPECTED, orient='index', columns=['es_dtype'])
|
||||||
|
# Expected source-field count: the number of rows in TEST_MAPPING1_EXPECTED_DF minus 4.
# NOTE(review): the 4 presumably accounts for mapped entries that are not
# source fields - confirm against TEST_MAPPING1_EXPECTED.
TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT = len(TEST_MAPPING1_EXPECTED_DF.index) - 4
|
||||||
|
|
||||||
TEST_NESTED_USER_GROUP_INDEX_NAME = 'nested_user_group'
|
TEST_NESTED_USER_GROUP_INDEX_NAME = 'nested_user_group'
|
||||||
TEST_NESTED_USER_GROUP_MAPPING = {
|
TEST_NESTED_USER_GROUP_MAPPING = {
|
||||||
|
@ -12,11 +12,13 @@ class TestMapping():
|
|||||||
|
|
||||||
# Requires 'setup_tests.py' to be run prior to this
|
# Requires 'setup_tests.py' to be run prior to this
|
||||||
def test_mapping(self):
|
def test_mapping(self):
|
||||||
mapping = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)
|
mappings = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)
|
||||||
|
|
||||||
assert mapping.all_fields() == TEST_MAPPING1_EXPECTED_DF.index.tolist()
|
assert mappings.all_fields() == TEST_MAPPING1_EXPECTED_DF.index.tolist()
|
||||||
|
|
||||||
assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, pd.DataFrame(mapping.mappings_capabilities['es_dtype']))
|
assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, pd.DataFrame(mappings.mappings_capabilities['es_dtype']))
|
||||||
|
|
||||||
|
assert mappings.count_source_fields() == TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -52,3 +52,10 @@ class TestDataFrameIndexing(TestData):
|
|||||||
# don't match the mapping types. This is mainly because the products field is
|
# don't match the mapping types. This is mainly because the products field is
|
||||||
# nested and so can be treated as a multi-field in ES, but not in pandas
|
# nested and so can be treated as a multi-field in ES, but not in pandas
|
||||||
|
|
||||||
|
# Asserts that self.pd_flights() and self.ed_flights() agree on `.shape`
# and on `len()`.
def test_size(self):
|
||||||
|
assert self.pd_flights().shape == self.ed_flights().shape
|
||||||
|
assert len(self.pd_flights()) == len(self.ed_flights())
|
||||||
|
|
||||||
|
# Prints self.ed_flights(). There is no assertion, so this test only fails
# if printing raises.
def test_to_string(self):
|
||||||
|
print(self.ed_flights())
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user