Implement eland.DataFrame.to_json (#661)

Co-authored-by: Quentin Pradet <quentin.pradet@elastic.co>
This commit is contained in:
Bart Broere 2024-02-15 08:32:54 +01:00 committed by GitHub
parent 9d492b03aa
commit 33cf029efe
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 242 additions and 2 deletions

View File

@ -49,6 +49,7 @@
~DataFrame.tail
~DataFrame.to_csv
~DataFrame.to_html
~DataFrame.to_json
~DataFrame.to_numpy
~DataFrame.to_pandas
~DataFrame.to_string

View File

@ -0,0 +1,6 @@
eland.DataFrame.to\_json
========================
.. currentmodule:: eland
.. automethod:: DataFrame.to_json

View File

@ -140,5 +140,6 @@ Serialization / IO / Conversion
DataFrame.to_numpy
DataFrame.to_csv
DataFrame.to_html
DataFrame.to_json
DataFrame.to_string
DataFrame.to_pandas

View File

@ -395,7 +395,7 @@ script instead of being modified manually.
+---------------------------------------+------------+
| ``ed.DataFrame.to_html()`` | **Yes** |
+---------------------------------------+------------+
| ``ed.DataFrame.to_json()`` | No |
| ``ed.DataFrame.to_json()`` | **Yes** |
+---------------------------------------+------------+
| ``ed.DataFrame.to_latex()`` | No |
+---------------------------------------+------------+

View File

@ -1342,6 +1342,48 @@ class DataFrame(NDFrame):
}
return self._query_compiler.to_csv(**kwargs)
def to_json(
    self,
    path_or_buf=None,
    orient=None,
    date_format=None,
    double_precision=10,
    force_ascii=True,
    date_unit="ms",
    default_handler=None,
    lines=False,
    compression="infer",
    index=True,
    indent=None,
    storage_options=None,
):
    """Write Elasticsearch data to a json file.

    With ``orient='records'`` and ``lines=True`` the entire DataFrame is
    serialised in a streaming manner, so it never has to fit in memory at
    once.  That output format is known as JSON lines and conventionally
    uses the ``.jsonl`` file extension.

    See Also
    --------
    :pandas_api_docs:`pandas.DataFrame.to_json`
    """
    # Forward every pandas-compatible option to the query compiler,
    # which picks between the streaming and the materialising path.
    return self._query_compiler.to_json(
        path_or_buf=path_or_buf,
        orient=orient,
        date_format=date_format,
        double_precision=double_precision,
        force_ascii=force_ascii,
        date_unit=date_unit,
        default_handler=default_handler,
        lines=lines,
        compression=compression,
        index=index,
        indent=indent,
        storage_options=storage_options,
    )
def to_pandas(self, show_progress: bool = False) -> pd.DataFrame:
"""
Utility method to convert eland.Dataframe to pandas.Dataframe

View File

@ -16,6 +16,7 @@
# under the License.
import copy
import os
import warnings
from collections import defaultdict
from datetime import datetime
@ -1250,6 +1251,46 @@ class Operations:
if path_or_buf is None:
return "".join(result)
def to_json(  # type: ignore
    self,
    query_compiler: "QueryCompiler",
    path_or_buf=None,
    orient=None,
    lines=False,
    **kwargs,
):
    """Serialize the Elasticsearch data to JSON.

    When ``orient == "records"`` and ``lines is True`` the result is
    written batch-by-batch as the scan yields pandas DataFrames (JSON
    lines), so the full frame is never held in memory.  Any other
    combination materialises the whole frame via ``to_pandas`` and
    delegates to pandas.

    Parameters
    ----------
    query_compiler:
        Query compiler describing the Elasticsearch query to serialize.
    path_or_buf:
        ``None`` (return a string), a path (``str`` or ``os.PathLike``),
        or a writable buffer.
    orient, lines, **kwargs:
        Passed through to :meth:`pandas.DataFrame.to_json`.

    Returns
    -------
    The JSON string when ``path_or_buf`` is None, otherwise None.
    """
    if orient == "records" and lines is True:
        result: List[str] = []
        # Treat string paths and path-like objects identically.
        if isinstance(path_or_buf, (os.PathLike, str)):
            buf = open(path_or_buf, "w")
            our_filehandle = True
        else:
            buf = path_or_buf
            our_filehandle = False
        try:
            for df in self.search_yield_pandas_dataframes(
                query_compiler=query_compiler
            ):
                output = df.to_json(
                    orient=orient,
                    lines=lines,
                    **kwargs,
                )
                if buf is None:
                    result.append(output)
                else:
                    buf.write(output)
        finally:
            # If we opened the file ourselves, close it — even when
            # serialization raises midway, so the handle never leaks.
            if our_filehandle:
                buf.close()
        return "".join(result) or None
    else:
        return self.to_pandas(query_compiler=query_compiler).to_json(
            path_or_buf, orient=orient, lines=lines, **kwargs
        )
def to_pandas(
self, query_compiler: "QueryCompiler", show_progress: bool = False
) -> pd.DataFrame:

View File

@ -514,6 +514,14 @@ class QueryCompiler:
"""
return self._operations.to_csv(query_compiler=self, **kwargs)
def to_json(self, **kwargs) -> Optional[str]:
    """Serialises Eland Dataframe to JSON

    Returns:
        If path_or_buf is None, returns the resulting json as a string.
    """
    # Delegate to the operations layer, which streams batches when
    # orient="records"/lines=True and otherwise materialises the frame.
    return self._operations.to_json(query_compiler=self, **kwargs)
def search_yield_pandas_dataframes(self) -> Generator["pd.DataFrame", None, None]:
    # Stream the query results as pandas DataFrames by delegating to the
    # operations layer; callers (e.g. to_json) consume this lazily so the
    # full result set is never materialised at once.
    return self._operations.search_yield_pandas_dataframes(self)

View File

@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.
# File called _pytest for PyCharm compatability
# File called _pytest for PyCharm compatibility
import ast
import time

View File

@ -0,0 +1,141 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# File called _pytest for PyCharm compatibility
from io import StringIO
from pathlib import Path
import pandas
from pandas.testing import assert_frame_equal
from tests.common import ROOT_DIR, TestData
class TestDataFrameToJSON(TestData):
    """Integration tests comparing ``eland.DataFrame.to_json`` against
    ``pandas.DataFrame.to_json`` on the flights test dataset.

    NOTE(review): these tests require the live test fixtures provided by
    ``TestData`` and write result files under ``<ROOT_DIR>/dataframe/results``;
    they cannot run in isolation.
    """

    def test_to_json_default_arguments(self):
        # Default arguments: eland and pandas output must round-trip to
        # equal DataFrames via pandas.read_json.
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()

        ed_flights.to_json(ROOT_DIR + "/dataframe/results/eland_to_json.jsonl")
        pd_flights.to_json(ROOT_DIR + "/dataframe/results/pandas_to_json.jsonl")

        assert_frame_equal(
            pandas.read_json(ROOT_DIR + "/dataframe/results/eland_to_json.jsonl"),
            pandas.read_json(ROOT_DIR + "/dataframe/results/pandas_to_json.jsonl"),
        )

    def test_to_json_streaming_mode(self):
        # lines=True + orient="records" exercises eland's streaming
        # (JSON lines) code path; output must match pandas'.
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()

        ed_flights.to_json(
            ROOT_DIR + "/dataframe/results/streaming_eland_to_json.jsonl",
            lines=True,
            orient="records",
        )
        pd_flights.to_json(
            ROOT_DIR + "/dataframe/results/streaming_pandas_to_json.jsonl",
            lines=True,
            orient="records",
        )

        assert_frame_equal(
            pandas.read_json(
                ROOT_DIR + "/dataframe/results/streaming_eland_to_json.jsonl",
                lines=True,
                orient="records",
            ),
            pandas.read_json(
                ROOT_DIR + "/dataframe/results/streaming_pandas_to_json.jsonl",
                lines=True,
                orient="records",
            ),
        )

    def test_to_json_streaming_mode_pathlib(self):
        # Same streaming comparison, but passing pathlib.Path objects
        # instead of string paths.
        root_dir = Path(ROOT_DIR)
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()

        ed_flights.to_json(
            root_dir / "dataframe" / "results" / "pathlib_eland_to_json.jsonl",
            lines=True,
            orient="records",
        )
        pd_flights.to_json(
            root_dir / "dataframe" / "results" / "pathlib_pandas_to_json.jsonl",
            lines=True,
            orient="records",
        )

        assert_frame_equal(
            pandas.read_json(
                root_dir / "dataframe" / "results" / "pathlib_eland_to_json.jsonl",
                lines=True,
                orient="records",
            ),
            pandas.read_json(
                root_dir / "dataframe" / "results" / "pathlib_pandas_to_json.jsonl",
                lines=True,
                orient="records",
            ),
        )

    def test_to_json_with_other_buffer(self):
        # Streaming into an in-memory buffer: eland writes to the StringIO,
        # pandas returns a string; both must parse to equal DataFrames.
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()
        output_buffer = StringIO()
        ed_flights.to_json(output_buffer, lines=True, orient="records")
        output_string = pd_flights.to_json(lines=True, orient="records")

        output_buffer.seek(0)  # rewind our StringIO object

        assert_frame_equal(
            pandas.read_json(output_buffer, lines=True, orient="records"),
            pandas.read_json(
                StringIO(output_string),
                lines=True,
                orient="records",
            ),
        )

    def test_to_json_with_file_handle(self):
        # Passing an already-open file handle: eland must write to it
        # without closing or reopening it.
        # NOTE(review): both files are written with default arguments (not
        # JSON lines) but read back with lines=True/orient="records" —
        # the comparison is symmetric so it passes, but confirm the read
        # arguments are intended.
        root_dir = Path(ROOT_DIR)
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()
        with open(
            root_dir / "dataframe" / "results" / "fh_eland_to_json.jsonl", "w"
        ) as w:
            ed_flights.to_json(w)
        pd_flights.to_json(
            root_dir / "dataframe" / "results" / "check_pandas_to_json.jsonl"
        )

        assert_frame_equal(
            pandas.read_json(
                ROOT_DIR + "/dataframe/results/fh_eland_to_json.jsonl",
                lines=True,
                orient="records",
            ),
            pandas.read_json(
                ROOT_DIR + "/dataframe/results/check_pandas_to_json.jsonl",
                lines=True,
                orient="records",
            ),
        )