Adding __repr__ method based on SparseDataFrames

2025-07-11 00:02:14 +08:00 · 2019-06-24 13:34:06 +00:00 · 2019-06-24 13:34:06 +00:00 · d4250640f1
commit d4250640f1
parent c723633526
7 changed files with 107 additions and 4 deletions
--- a/eland/client.py
+++ b/eland/client.py
@ -21,3 +21,8 @@ class Client():

    def field_caps(self, **kwargs):
        return self.es.field_caps(**kwargs)
+
+    def count(self, **kwargs):
+        """
+        Return the document count reported by Elasticsearch.
+
+        All keyword arguments are forwarded unchanged to the wrapped
+        client's ``count`` call; only the 'count' entry of the response
+        is returned.
+        """
+        return self.es.count(**kwargs)['count']
+
--- a/eland/frame.py
+++ b/eland/frame.py
@ -30,6 +30,8 @@ from elasticsearch_dsl import Search

 import pandas as pd

+from pandas.core.arrays.sparse import BlockIndex
+
 class DataFrame():
    """
    pandas.DataFrame like API that proxies into Elasticsearch index(es).
@ -251,3 +253,71 @@ class DataFrame():
        df = pd.DataFrame(data=results, index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
            
        return df
+
+    @property
+    def shape(self):
+        """
+        Return a tuple representing the dimensionality of the DataFrame.
+
+        Returns
+        -------
+        shape: tuple
+            0 - number of rows
+            1 - number of columns
+        """
+        num_rows = len(self)
+        # `columns` is the collection of column names, so its length is the
+        # column count promised above (and what pandas' `shape` reports).
+        num_columns = len(self.columns)
+
+        return num_rows, num_columns
+
+    @property
+    def columns(self):
+        """
+        Return the column names of this DataFrame.
+
+        Returns
+        -------
+        columns:
+            Whatever ``self.mappings.source_fields()`` returns, i.e. the
+            source fields known to the mappings
+        """
+        source_field_names = self.mappings.source_fields()
+        return source_field_names
+
+    def __getitem__(self, item):
+        """
+        Select a column by name, or apply a boolean filter to this frame.
+
+        Parameters
+        ----------
+        item: str or BooleanFilter
+            A column name, or a filter expression
+
+        Returns
+        -------
+        Column
+            When item is a str naming an existing column
+        DataFrame
+            This same object, with ``_filter`` replaced by the built
+            filter, when item is a BooleanFilter
+
+        Raises
+        ------
+        TypeError
+            If the named column does not exist, or item is of an
+            unsupported type
+        """
+        if isinstance(item, str):
+            # Membership has to be tested against the collection of column
+            # names, not against the result of a per-item predicate call.
+            if item not in self.columns:
+                # TypeError (rather than KeyError) is kept so existing
+                # callers that catch it keep working.
+                raise TypeError('Column does not exist: [{0}]'.format(item))
+            return Column(item)
+        elif isinstance(item, BooleanFilter):
+            # The filter is stored on this frame itself (no copy is made).
+            self._filter = item.build()
+            return self
+        else:
+            raise TypeError('Unsupported expr: [{0}]'.format(item))
+
+    def __len__(self):
+        """
+        Return the number of rows, taken as the document count the client
+        reports for this frame's index pattern.
+        """
+        num_docs = self.client.count(index=self.index_pattern)
+        return num_docs
+
+    # ----------------------------------------------------------------------
+    # Rendering Methods
+
+    def __repr__(self):
+        """
+        Render the frame as text via pandas' ``to_string``.
+
+        At most ``max_rows`` rows are fetched. When the index holds more
+        rows than that, the fetched values are laid out in a sparse frame
+        of the full length (a leading and a trailing block) so that pandas
+        prints a truncated view with the real row count.
+
+        Returns
+        -------
+        str
+            Text representation of the frame
+        """
+        # TODO - don't hard code max_rows - use pandas default/ES default
+        # (60 mirrors the default of the pandas `display.max_rows` option)
+        max_rows = 60
+        # Floor division keeps the row counts as ints on Python 3; they are
+        # used below as block offsets/lengths of the sparse index.
+        head_rows = max_rows // 2
+        tail_rows = max_rows - head_rows
+
+        head = self.head(max_rows)
+
+        num_rows = len(self)
+
+        if num_rows > max_rows:
+            # If we have a lot of rows, create a sparse DataFrame and use
+            # pandas to_string logic
+            # NOTE: this sparse DataFrame can't be used as the middle
+            # section is all NaNs. However, it gives us potentially a nice way
+            # to use the pandas IO methods.
+            # The index is the same for every column, so build it once:
+            # one block at the start and one block at the end of the frame.
+            sparse_index = BlockIndex(num_rows,
+                                      [0, num_rows - tail_rows],
+                                      [head_rows, tail_rows])
+            sdf = pd.DataFrame({item: pd.SparseArray(data=head[item],
+                                                     sparse_index=sparse_index)
+                                for item in self.columns})
+
+            return sdf.to_string(max_rows=max_rows)
+
+        return head.to_string(max_rows=max_rows)
+
--- a/eland/mappings.py
+++ b/eland/mappings.py
@ -298,3 +298,21 @@ class Mappings():
                                          ((self.mappings_capabilities.pd_dtype == 'int64') |
                                           (self.mappings_capabilities.pd_dtype == 'float64'))].index.tolist()

+    def source_fields(self):
+        """
+        List the fields that are flagged as _source in the mappings.
+
+        Returns
+        -------
+        source_fields: list of str
+            Index labels of the mappings_capabilities rows whose _source
+            value equals True
+        """
+        capabilities = self.mappings_capabilities
+        # Element-wise comparison producing a boolean row mask.
+        in_source = capabilities._source == True
+        return capabilities[in_source].index.tolist()
+
+    def count_source_fields(self):
+        """
+        Count the fields that are flagged as _source in the mappings.
+
+        Returns
+        -------
+        count_source_fields: int
+            Number of mappings_capabilities rows whose _source value
+            equals True
+        """
+        capabilities = self.mappings_capabilities
+        # Element-wise comparison producing a boolean row mask.
+        in_source = capabilities._source == True
+        selected = capabilities[in_source]
+        return len(selected.index)
+
--- a/eland/tests/init.py
+++ b/eland/tests/init.py
@ -438,6 +438,7 @@ TEST_MAPPING1_EXPECTED = {
 }

 TEST_MAPPING1_EXPECTED_DF = pd.DataFrame.from_dict(data=TEST_MAPPING1_EXPECTED, orient='index', columns=['es_dtype'])
+# NOTE(review): the 4 subtracted here is presumably the number of entries in
+# TEST_MAPPING1_EXPECTED that are not _source fields - confirm against the fixture.
+TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT = len(TEST_MAPPING1_EXPECTED_DF.index) - 4

 TEST_NESTED_USER_GROUP_INDEX_NAME = 'nested_user_group'
 TEST_NESTED_USER_GROUP_MAPPING = {
--- a/eland/tests/client/test_mappings_pytest.py
+++ b/eland/tests/client/test_mappings_pytest.py
@ -12,11 +12,13 @@ class TestMapping():

    # Requires 'setup_tests.py' to be run prior to this
    def test_mapping(self):
-        mapping = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)
+        mappings = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME)

-        assert mapping.all_fields() == TEST_MAPPING1_EXPECTED_DF.index.tolist()
+        assert mappings.all_fields() == TEST_MAPPING1_EXPECTED_DF.index.tolist()

-        assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, pd.DataFrame(mapping.mappings_capabilities['es_dtype']))
+        assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, pd.DataFrame(mappings.mappings_capabilities['es_dtype']))
+
+        # The number of _source fields must match the count expected for the fixture.
+        assert mappings.count_source_fields() == TEST_MAPPING1_EXPECTED_SOURCE_FIELD_COUNT



--- a/eland/tests/frame/test_indexing_pytest.py
+++ b/eland/tests/frame/test_indexing_pytest.py
@ -52,3 +52,10 @@ class TestDataFrameIndexing(TestData):
        # don't match the mapping types. This is mainly because the products field is
        # nested and so can be treated as a multi-field in ES, but not in pandas

+    def test_size(self):
+        """The eland flights frame reports the same shape and length as the pandas one."""
+        pd_shape = self.pd_flights().shape
+        ed_shape = self.ed_flights().shape
+        assert pd_shape == ed_shape
+
+        pd_len = len(self.pd_flights())
+        ed_len = len(self.ed_flights())
+        assert pd_len == ed_len
+
+    def test_to_string(self):
+        """Smoke test: printing the eland flights frame must not raise."""
+        flights = self.ed_flights()
+        print(flights)
+