Adding __repr__ method based on SparseDataFrames

2025-07-11 00:02:14 +08:00 · 2019-06-24 13:34:06 +00:00 · 2019-06-24 13:34:06 +00:00 · d4250640f1
commit d4250640f1
parent c723633526
7 changed files with 107 additions and 4 deletions
--- a/eland/init.py
+++ b/eland/init.py
@ -1,4 +1,4 @@
 from .utils import *
 from .frame import *
 from .client import *
-from .mappings import *
+from .mappings import *
--- a/eland/client.py
+++ b/eland/client.py
@ -21,3 +21,8 @@ class Client():
    def field_caps(self, **kwargs):
        """
        Forward a field capabilities request to the wrapped client.

        Parameters
        ----------
        **kwargs
            Passed unchanged to ``self.es.field_caps``.

        Returns
        -------
        Whatever ``self.es.field_caps`` returns, unmodified.
        """
        capabilities = self.es.field_caps(**kwargs)
        return capabilities
    def count(self, **kwargs):
        """
        Run a count request and return only the number of matches.

        Parameters
        ----------
        **kwargs
            Passed unchanged to ``self.es.count``.

        Returns
        -------
        The value stored under the ``'count'`` key of the response.
        """
        # The response is a mapping; callers only need its 'count' entry.
        return self.es.count(**kwargs)['count']
--- a/eland/frame.py
+++ b/eland/frame.py
@ -30,6 +30,8 @@ from elasticsearch_dsl import Search
 import pandas as pd
 from pandas.core.arrays.sparse import BlockIndex
 class DataFrame():
    """
    pandas.DataFrame like API that proxies into Elasticsearch index(es).
@ -251,3 +253,71 @@ class DataFrame():
        df = pd.DataFrame(data=results, index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
        return df
    @property
    def shape(self):
        """
        Return a tuple representing the dimensionality of the DataFrame.
        Returns
        -------
        shape: tuple
            0 - number of rows
            1 - number of columns
        """
        num_rows = len(self)
        num_columns = self.columns
        return num_rows, num_columns
    @property
    def columns(self):
        return self.mappings.source_fields()
    def __getitem__(self, item):
        if isinstance(item, str):
            if item not in self.mappings.is_source_field(item):
                raise TypeError('Column does not exist: [{0}]'.format(item))
            return Column(item)
        elif isinstance(item, BooleanFilter):
            self._filter = item.build()
            return self
        else:
            raise TypeError('Unsupported expr: [{0}]'.format(item))
    def __len__(self):
        """
        Returns length of info axis, but here we use the index.
        """
        return self.client.count(index=self.index_pattern)
    # ----------------------------------------------------------------------
    # Rendering Methods
    def __repr__(self):
        # The return for this is display.options.max_rows
        max_rows = 60
        head_rows = max_rows / 2
        tail_rows = max_rows - head_rows
        head = self.head(max_rows)
        num_rows = len(self)
        if (num_rows > max_rows):
            # If we have a lot of rows, create a SparseDataFrame and use
            # pandas to_string logic
            # NOTE: this sparse DataFrame can't be used as the middle
            # section is all NaNs. However, it gives us potentially a nice way
            # to use the pandas IO methods.
            sdf = pd.DataFrame({item: pd.SparseArray(data=head[item],
                                                     sparse_index=
                                                     BlockIndex(
                                                         num_rows, [0, num_rows-tail_rows], [head_rows, tail_rows]))
                                for item in self.columns})
            # TODO - don't hard code max_rows - use pandas default/ES default
            return sdf.to_string(max_rows=max_rows)
        return head.to_string(max_rows=max_rows)
--- a/eland/mappings.py
+++ b/eland/mappings.py
@ -298,3 +298,21 @@ class Mappings():
                                          ((self.mappings_capabilities.pd_dtype == 'int64') |
                                           (self.mappings_capabilities.pd_dtype == 'float64'))].index.tolist()
    def source_fields(self):
        """
        List the fields whose ``_source`` capability flag is set.

        Returns
        -------
        source_fields: list of str
            Index labels of the ``mappings_capabilities`` rows where
            ``_source`` equals True
        """
        capabilities = self.mappings_capabilities
        is_source = capabilities._source == True
        source_rows = capabilities[is_source]

        return source_rows.index.tolist()
    def count_source_fields(self):
        """
        Count the fields whose ``_source`` capability flag is set.

        Returns
        -------
        count_source_fields: int
            Number of ``mappings_capabilities`` rows where ``_source``
            equals True
        """
        capabilities = self.mappings_capabilities
        is_source = capabilities._source == True
        source_index = capabilities[is_source].index

        return len(source_index)
--- a/eland/tests/init.py
+++ b/eland/tests/init.py
@ -438,6 +438,7 @@ TEST_MAPPING1_EXPECTED = {
 }
 # Expected mapping as a DataFrame: one row per key of TEST_MAPPING1_EXPECTED,
 # with a single 'es_dtype' column.
 TEST_MAPPING1_EXPECTED_DF = pd.DataFrame.from_dict(data=TEST_MAPPING1_EXPECTED, orient='index', columns=['es_dtype'])
# Expected number of source fields: the number of expected fields minus 4.
# NOTE(review): presumably 4 of the expected fields are not source fields -
# confirm against TEST_MAPPING1_EXPECTED.
TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT = len(TEST_MAPPING1_EXPECTED_DF.index) - 4
 TEST_NESTED_USER_GROUP_INDEX_NAME = 'nested_user_group'
 TEST_NESTED_USER_GROUP_MAPPING = {
--- a/eland/tests/client/test_mappings_pytest.py
+++ b/eland/tests/client/test_mappings_pytest.py
@ -12,11 +12,13 @@ class TestMapping():
    # Requires 'setup_tests.py' to be run prior to this
    def test_mapping(self):
-        mapping = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)
+        mappings = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)
-        assert mapping.all_fields() == TEST_MAPPING1_EXPECTED_DF.index.tolist()
+        assert mappings.all_fields() == TEST_MAPPING1_EXPECTED_DF.index.tolist()
-        assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, pd.DataFrame(mapping.mappings_capabilities['es_dtype']))
+        assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, pd.DataFrame(mappings.mappings_capabilities['es_dtype']))
        assert mappings.count_source_fields() == TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT
--- a/eland/tests/frame/test_indexing_pytest.py
+++ b/eland/tests/frame/test_indexing_pytest.py
@ -52,3 +52,10 @@ class TestDataFrameIndexing(TestData):
        # don't match the mapping types. This is mainly because the products field is
        # nested and so can be treated as a multi-field in ES, but not in pandas
    def test_size(self):
        """
        The frames returned by pd_flights() and ed_flights() must agree on
        both ``shape`` and ``len``.
        """
        pd_shape = self.pd_flights().shape
        ed_shape = self.ed_flights().shape
        assert pd_shape == ed_shape

        pd_length = len(self.pd_flights())
        ed_length = len(self.ed_flights())
        assert pd_length == ed_length
    def test_to_string(self):
        """
        Print the frame returned by ed_flights(); there are no assertions, so
        this only checks that rendering does not raise.
        """
        flights = self.ed_flights()
        print(flights)