Implement eland.DataFrame.to_json (#661)

Co-authored-by: Quentin Pradet <quentin.pradet@elastic.co>
2025-07-11 00:02:14 +08:00 · 2024-02-15 08:32:54 +01:00 · 2024-02-15 08:32:54 +01:00 · 33cf029efe
commit 33cf029efe
parent 9d492b03aa
9 changed files with 242 additions and 2 deletions
--- a/docs/sphinx/reference/api/eland.DataFrame.rst
+++ b/docs/sphinx/reference/api/eland.DataFrame.rst
@ -49,6 +49,7 @@
      ~DataFrame.tail
      ~DataFrame.to_csv
      ~DataFrame.to_html
      ~DataFrame.to_json
      ~DataFrame.to_numpy
      ~DataFrame.to_pandas
      ~DataFrame.to_string
--- a/docs/sphinx/reference/api/eland.DataFrame.to_json.rst
+++ b/docs/sphinx/reference/api/eland.DataFrame.to_json.rst
@ -0,0 +1,6 @@
 eland.DataFrame.to\_json
 =======================
 .. currentmodule:: eland
 .. automethod:: DataFrame.to_json
--- a/docs/sphinx/reference/dataframe.rst
+++ b/docs/sphinx/reference/dataframe.rst
@ -140,5 +140,6 @@ Serialization / IO / Conversion
   DataFrame.to_numpy
   DataFrame.to_csv
   DataFrame.to_html
   DataFrame.to_json
   DataFrame.to_string
   DataFrame.to_pandas
--- a/docs/sphinx/reference/supported_apis.rst
+++ b/docs/sphinx/reference/supported_apis.rst
@ -395,7 +395,7 @@ script instead of being modified manually.
 +---------------------------------------+------------+
 | ``ed.DataFrame.to_html()``            | **Yes**    |
 +---------------------------------------+------------+
-| ``ed.DataFrame.to_json()``            | No         |
+| ``ed.DataFrame.to_json()``            | **Yes**    |
 +---------------------------------------+------------+
 | ``ed.DataFrame.to_latex()``           | No         |
 +---------------------------------------+------------+
--- a/eland/dataframe.py
+++ b/eland/dataframe.py
@ -1342,6 +1342,48 @@ class DataFrame(NDFrame):
        }
        return self._query_compiler.to_csv(**kwargs)
    def to_json(
        self,
        path_or_buf=None,
        orient=None,
        date_format=None,
        double_precision=10,
        force_ascii=True,
        date_unit="ms",
        default_handler=None,
        lines=False,
        compression="infer",
        index=True,
        indent=None,
        storage_options=None,
    ):
        """Write Elasticsearch data to a json file.

        Passing ``lines=True`` together with ``orient='records'`` lets the
        DataFrame be written in a streaming manner, so the whole DataFrame
        never has to be held in memory at once. The resulting format is
        known as JSON lines and can use the file extension ``.jsonl``.

        Every argument is forwarded by keyword, unchanged, to the query
        compiler's ``to_json``.

        Returns
        -------
        Whatever the query compiler's ``to_json`` returns.

        See Also
        --------
        :pandas_api_docs:`pandas.DataFrame.to_json`
        """
        # Hand the arguments straight to the query compiler; no validation
        # or defaulting happens at this level.
        return self._query_compiler.to_json(
            path_or_buf=path_or_buf,
            orient=orient,
            date_format=date_format,
            double_precision=double_precision,
            force_ascii=force_ascii,
            date_unit=date_unit,
            default_handler=default_handler,
            lines=lines,
            compression=compression,
            index=index,
            indent=indent,
            storage_options=storage_options,
        )
    def to_pandas(self, show_progress: bool = False) -> pd.DataFrame:
        """
        Utility method to convert eland.Dataframe to pandas.Dataframe
--- a/eland/operations.py
+++ b/eland/operations.py
@ -16,6 +16,7 @@
 #  under the License.
 import copy
 import os
 import warnings
 from collections import defaultdict
 from datetime import datetime
@ -1250,6 +1251,46 @@ class Operations:
        if path_or_buf is None:
            return "".join(result)
    def to_json(  # type: ignore
        self,
        query_compiler: "QueryCompiler",
        path_or_buf=None,
        orient=None,
        lines=False,
        **kwargs,
    ):
        """Serialise the results of ``query_compiler`` to JSON.

        When ``orient == "records"`` and ``lines is True`` the data is
        written chunk by chunk: each pandas DataFrame yielded by
        ``search_yield_pandas_dataframes`` is converted on its own and
        appended to the output. Every other combination is delegated to
        ``pandas.DataFrame.to_json`` on the frame returned by ``to_pandas``.

        Parameters
        ----------
        query_compiler:
            Passed through to ``search_yield_pandas_dataframes`` / ``to_pandas``.
        path_or_buf:
            ``str`` or ``os.PathLike`` (opened and closed here), an object
            with a ``write`` method (left open for the caller), or ``None``
            to get the JSON back as a string.
        orient, lines, **kwargs:
            Forwarded to ``pandas.DataFrame.to_json``.

        Returns
        -------
        In streaming mode: the JSON text when ``path_or_buf`` is ``None``,
        otherwise ``None``. In the non-streaming case, whatever
        ``pandas.DataFrame.to_json`` returns.
        """
        if not (orient == "records" and lines is True):
            # Not the JSON-lines layout: let pandas write the materialised frame.
            return self.to_pandas(query_compiler=query_compiler).to_json(
                path_or_buf, orient=orient, lines=lines, **kwargs
            )

        # Only paths are opened (and therefore closed) by us; any other
        # non-None value is treated as a caller-owned writable object.
        our_filehandle = isinstance(path_or_buf, (str, os.PathLike))
        if our_filehandle:
            # pandas writes JSON as UTF-8; do the same rather than relying
            # on the platform's locale encoding.
            buf = open(path_or_buf, "w", encoding="utf-8")
        else:
            buf = path_or_buf

        result: List[str] = []
        try:
            # NOTE(review): concatenating chunks relies on each chunk's output
            # ending with a newline - confirm for the supported pandas versions.
            # NOTE(review): the per-chunk call is given no path, so options that
            # only matter when pandas writes the file itself (e.g. compression)
            # presumably have no effect here - confirm.
            for df in self.search_yield_pandas_dataframes(
                query_compiler=query_compiler
            ):
                output = df.to_json(
                    orient=orient,
                    lines=lines,
                    **kwargs,
                )
                if buf is None:
                    result.append(output)
                else:
                    buf.write(output)
        finally:
            # If we opened the file ourselves, we should close it - also when
            # fetching or serialising a chunk raised.
            if our_filehandle:
                buf.close()

        # Same contract as to_csv: a string (possibly empty) when no target
        # was given, nothing otherwise.
        if path_or_buf is None:
            return "".join(result)
        return None
    def to_pandas(
        self, query_compiler: "QueryCompiler", show_progress: bool = False
    ) -> pd.DataFrame:
--- a/eland/query_compiler.py
+++ b/eland/query_compiler.py
@ -514,6 +514,14 @@ class QueryCompiler:
        """
        return self._operations.to_csv(query_compiler=self, **kwargs)
    def to_json(self, **kwargs) -> Optional[str]:
        """Serialises Eland Dataframe to JSON

        All keyword arguments are passed on unchanged, together with this
        query compiler, to ``self._operations.to_json``.

        Returns:
            If path_or_buf is None, returns the resulting json as a string.
        """
        return self._operations.to_json(query_compiler=self, **kwargs)
    def search_yield_pandas_dataframes(self) -> Generator["pd.DataFrame", None, None]:
        return self._operations.search_yield_pandas_dataframes(self)
--- a/tests/dataframe/test_to_csv_pytest.py
+++ b/tests/dataframe/test_to_csv_pytest.py
@ -15,7 +15,7 @@
 #  specific language governing permissions and limitations
 #  under the License.
-# File called _pytest for PyCharm compatability
+# File called _pytest for PyCharm compatibility
 import ast
 import time
--- a/tests/dataframe/test_to_json_pytest.py
+++ b/tests/dataframe/test_to_json_pytest.py
@ -0,0 +1,141 @@
 #  Licensed to Elasticsearch B.V. under one or more contributor
 #  license agreements. See the NOTICE file distributed with
 #  this work for additional information regarding copyright
 #  ownership. Elasticsearch B.V. licenses this file to you under
 #  the Apache License, Version 2.0 (the "License"); you may
 #  not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 # 	http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing,
 #  software distributed under the License is distributed on an
 #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 #  KIND, either express or implied.  See the License for the
 #  specific language governing permissions and limitations
 #  under the License.
 # File called _pytest for PyCharm compatibility
 from io import StringIO
 from pathlib import Path
 import pandas
 from pandas.testing import assert_frame_equal
 from tests.common import ROOT_DIR, TestData
 class TestDataFrameToJSON(TestData):
    """Check ``eland.DataFrame.to_json`` against ``pandas.DataFrame.to_json``.

    Each test serialises the flights data with both libraries, reads both
    outputs back with ``pandas.read_json`` and compares the resulting frames.
    """

    # All files written by these tests end up in this directory.
    RESULTS_DIR = Path(ROOT_DIR) / "dataframe" / "results"

    def test_to_json_default_arguments(self):
        """Default arguments, target given as a ``str`` path."""
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()
        ed_path = str(self.RESULTS_DIR / "eland_to_json.jsonl")
        pd_path = str(self.RESULTS_DIR / "pandas_to_json.jsonl")

        ed_flights.to_json(ed_path)
        pd_flights.to_json(pd_path)

        assert_frame_equal(pandas.read_json(ed_path), pandas.read_json(pd_path))

    def test_to_json_streaming_mode(self):
        """JSON-lines output, target given as a ``str`` path."""
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()
        ed_path = str(self.RESULTS_DIR / "streaming_eland_to_json.jsonl")
        pd_path = str(self.RESULTS_DIR / "streaming_pandas_to_json.jsonl")

        ed_flights.to_json(ed_path, lines=True, orient="records")
        pd_flights.to_json(pd_path, lines=True, orient="records")

        assert_frame_equal(
            pandas.read_json(ed_path, lines=True, orient="records"),
            pandas.read_json(pd_path, lines=True, orient="records"),
        )

    def test_to_json_streaming_mode_pathlib(self):
        """JSON-lines output, target given as a ``pathlib.Path``."""
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()
        ed_path = self.RESULTS_DIR / "pathlib_eland_to_json.jsonl"
        pd_path = self.RESULTS_DIR / "pathlib_pandas_to_json.jsonl"

        ed_flights.to_json(ed_path, lines=True, orient="records")
        pd_flights.to_json(pd_path, lines=True, orient="records")

        assert_frame_equal(
            pandas.read_json(ed_path, lines=True, orient="records"),
            pandas.read_json(pd_path, lines=True, orient="records"),
        )

    def test_to_json_with_other_buffer(self):
        """JSON-lines output written into an in-memory text buffer."""
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()

        output_buffer = StringIO()
        ed_flights.to_json(output_buffer, lines=True, orient="records")
        output_string = pd_flights.to_json(lines=True, orient="records")

        output_buffer.seek(0)  # rewind our StringIO object

        assert_frame_equal(
            pandas.read_json(output_buffer, lines=True, orient="records"),
            pandas.read_json(
                StringIO(output_string),
                lines=True,
                orient="records",
            ),
        )

    def test_to_json_with_file_handle(self):
        """JSON-lines output written through an already-open file handle."""
        ed_flights = self.ed_flights()
        pd_flights = self.pd_flights()
        ed_path = self.RESULTS_DIR / "fh_eland_to_json.jsonl"
        pd_path = self.RESULTS_DIR / "check_pandas_to_json.jsonl"

        # Write in the same JSON-lines layout that is read back below;
        # writing with the defaults and reading with lines=True would only
        # compare a single degenerate row.
        with open(ed_path, "w") as w:
            ed_flights.to_json(w, lines=True, orient="records")
        pd_flights.to_json(pd_path, lines=True, orient="records")

        assert_frame_equal(
            pandas.read_json(ed_path, lines=True, orient="records"),
            pandas.read_json(pd_path, lines=True, orient="records"),
        )