Mirror of https://github.com/elastic/eland.git (synced 2025-07-11 00:02:14 +08:00)

Add Elasticsearch storage usage to df.info()

This commit is contained in:
parent 789f8959bc
commit 56f6ba6c8b
@@ -121,6 +121,7 @@ Data columns (total 27 columns):
  26  timestamp        13059 non-null  datetime64[ns]
 dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)
 memory usage: 80.0 bytes
+Elasticsearch storage usage: 5.043 MB
 
 # Filtering of rows using comparisons
 >>> df[(df.Carrier=="Kibana Airlines") & (df.AvgTicketPrice > 900.0) & (df.Cancelled == True)].head()
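For context, a minimal sketch of how the new line surfaces to users, assuming a local cluster with the Kibana sample flight data indexed as "flights" (host, index name, and the exact constructor signature vary by eland version):

import eland as ed

# Attach an eland DataFrame to an existing index; the data stays in Elasticsearch.
df = ed.DataFrame("http://localhost:9200", "flights")

# info() now reports the on-disk size of the backing index right after the
# approximate client-side memory usage line shown in the doc output above.
df.info()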
@@ -88,7 +88,7 @@
        "eland.dataframe.DataFrame"
       ]
      },
-     "execution_count": 1,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3122,7 +3122,8 @@
      " 25  dayOfWeek       13059 non-null  int64         \n",
      " 26  timestamp       13059 non-null  datetime64[ns]\n",
      "dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)\n",
-     "memory usage: 64.0 bytes\n"
+     "memory usage: 64.000 bytes\n",
+     "Elasticsearch storage usage: 5.043 MB\n"
     ]
    }
   ],
@@ -4065,7 +4066,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.6"
+   "version": "3.8.5"
   },
   "pycharm": {
    "stem_cell": {
@@ -831,6 +831,7 @@ class DataFrame(NDFrame):
          1   geoip.city_name  4094 non-null   object
         dtypes: object(2)
         memory usage: ...
+        Elasticsearch storage usage: ...
         """
         if buf is None: # pragma: no cover
             buf = sys.stdout
@@ -940,9 +941,9 @@ class DataFrame(NDFrame):
             # returns size in human readable format
             for x in ["bytes", "KB", "MB", "GB", "TB"]:
                 if num < 1024.0:
-                    return f"{num:3.1f}{size_qualifier} {x}"
+                    return f"{num:3.3f}{size_qualifier} {x}"
                 num /= 1024.0
-            return f"{num:3.1f}{size_qualifier} PB"
+            return f"{num:3.3f}{size_qualifier} PB"
 
         if verbose:
             _verbose_repr()
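The only functional change above is the format specifier (3.1f to 3.3f), which keeps three decimal places instead of one. A standalone sketch of the helper, outside of eland, reproduces the values that appear in the notebook diffs in this commit (the byte counts passed in are arbitrary example values):

def sizeof_fmt(num, size_qualifier=""):
    # Walk the unit ladder, dividing by 1024 until the value fits.
    for unit in ["bytes", "KB", "MB", "GB", "TB"]:
        if num < 1024.0:
            return f"{num:3.3f}{size_qualifier} {unit}"
        num /= 1024.0
    return f"{num:3.3f}{size_qualifier} PB"

print(sizeof_fmt(64))       # 64.000 bytes
print(sizeof_fmt(5288211))  # 5.043 MB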
@@ -972,7 +973,13 @@ class DataFrame(NDFrame):
             # TODO - this is different from pd.DataFrame as we shouldn't
             # really hold much in memory. For now just approximate with getsizeof + ignore deep
             mem_usage = sys.getsizeof(self)
-            lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n")
+            lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}")
+            storage_usage = self._query_compiler._client.indices.stats(
+                index=self._query_compiler._index_pattern, metric=["store"]
+            )["_all"]["total"]["store"]["size_in_bytes"]
+            lines.append(
+                f"Elasticsearch storage usage: {_sizeof_fmt(storage_usage,size_qualifier)}\n"
+            )
 
         fmt.buffer_put_lines(buf, lines)
 
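For reference, the stats lookup added above can be reproduced with the Elasticsearch Python client directly; a sketch assuming a local cluster and the same sample "flights" index (host and index name are assumptions, and the client constructor differs slightly between major versions):

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

# Request only the "store" metric; the combined on-disk size of the index's
# shards is reported under _all.total.store.size_in_bytes.
stats = es.indices.stats(index="flights", metric=["store"])
size_in_bytes = stats["_all"]["total"]["store"]["size_in_bytes"]
print(f"{size_in_bytes} bytes on disk")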
@@ -2870,7 +2870,8 @@
      " 25  dayOfWeek       13059 non-null  int64         \n",
      " 26  timestamp       13059 non-null  datetime64[ns]\n",
      "dtypes: bool(2), datetime64[ns](1), float64(5), int64(2), object(17)\n",
-     "memory usage: 64.0 bytes\n"
+     "memory usage: 64.000 bytes\n",
+     "Elasticsearch storage usage: 5.043 MB\n"
     ]
    }
   ],