mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Added example notebooks + pytest for notebooks (#87)
* Added example notebooks + pytest for these notebooks
* Fixed paths
* Fixing link in docs
* Adding cleaner demo_notebook
This commit is contained in:
parent 206276c5fa
commit 133b227b93
@ -154,9 +154,9 @@ currently using a minimum version of PyCharm 2019.2.4.
- Setup Elasticsearch instance (assumes `localhost:9200`), and run
  `python -m eland.tests.setup_tests` to setup test environment -*note
  this modifies Elasticsearch indices*
- Run `pytest --doctest-modules` to validate install
- Run `pytest --nbval --doctest-modules` to validate install

### Documentation

- Install documentation requirements. Open terminal in virtual
  environment and run `pip install -r requirements-dev.txt`
  environment and run `pip install -r docs/requirements-docs.txt`
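For context on the changed command: `--doctest-modules` makes pytest collect and run the interactive examples embedded in module docstrings, while the new `--nbval` flag (provided by the `nbval` plugin added to requirements-dev.txt further down) re-executes the committed example notebooks and compares their stored outputs. A minimal sketch of the kind of docstring example that `pytest --doctest-modules` would pick up; the function is illustrative and not part of eland:

```python
def double(x):
    """Return twice the input.

    >>> double(21)
    42
    """
    return x * 2
```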
@ -4,3 +4,4 @@ matplotlib
pytest>=5.2.1
git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master
numpydoc==0.8
nbsphinx
@ -55,6 +55,7 @@ extensions = [
    'numpydoc',
    "matplotlib.sphinxext.plot_directive",
    "sphinx.ext.todo",
    "nbsphinx",
]

doctest_global_setup = '''
@ -91,7 +92,7 @@ templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []
exclude_patterns = ['**.ipynb_checkpoints']

# -- Options for HTML output -------------------------------------------------
BIN docs/source/examples/data/online-retail.csv.gz Normal file
Binary file not shown.
3583 docs/source/examples/demo_notebook.ipynb Normal file
File diff suppressed because one or more lines are too long
11 docs/source/examples/index.rst Normal file
@ -0,0 +1,11 @@
.. _examples:

========
Examples
========

.. toctree::
   :maxdepth: 2

   demo_notebook
   online_retail_analysis
1462 docs/source/examples/online_retail_analysis.ipynb Normal file
File diff suppressed because one or more lines are too long
@ -25,6 +25,8 @@ In general, the data resides in elasticsearch and not in memory, which allows el
   reference/index
   implementation/index
   development/index
   examples/index


* :doc:`reference/index`

@ -43,3 +45,6 @@ In general, the data resides in elasticsearch and not in memory, which allows el
* :doc:`development/index`

* :doc:`development/contributing`

* :doc:`examples/index`

@ -312,8 +312,7 @@ class DataFrame(NDFrame):
        max_rows = min_rows

        return self.to_html(max_rows=max_rows, max_cols=max_cols,
                            show_dimensions=show_dimensions, notebook=True,
                            bold_rows=False)  # set for consistency with pandas output
                            show_dimensions=show_dimensions, notebook=True)
    else:
        return None
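For readers unfamiliar with the hook being edited here: Jupyter renders whatever HTML string an object's `_repr_html_()` method returns, which is why eland's `DataFrame` delegates to `to_html(..., notebook=True)`. A minimal standalone sketch of that display protocol (the class and data are illustrative only, not eland code):

```python
class Rows:
    """Tiny object that renders as an HTML table in a Jupyter notebook."""

    def __init__(self, rows):
        self.rows = rows

    def _repr_html_(self):
        # IPython/Jupyter displays the returned HTML instead of the plain repr().
        body = "".join("<tr><td>{}</td></tr>".format(r) for r in self.rows)
        return "<table>{}</table>".format(body)


# In a notebook cell, evaluating `Rows([1, 2, 3])` as the last expression
# would display a three-row HTML table.
```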
@ -384,20 +383,35 @@ class DataFrame(NDFrame):
index_field: _id
is_source_field: False
Mappings:
capabilities: _source es_dtype pd_dtype searchable aggregatable
AvgTicketPrice True float float64 True True
Cancelled True boolean bool True True
Carrier True keyword object True True
Dest True keyword object True True
DestAirportID True keyword object True True
... ... ... ... ... ...
OriginLocation True geo_point object True True
OriginRegion True keyword object True True
OriginWeather True keyword object True True
dayOfWeek True integer int64 True True
timestamp True date datetime64[ns] True True
<BLANKLINE>
[27 rows x 5 columns]
capabilities: _source es_dtype pd_dtype searchable aggregatable
AvgTicketPrice True float float64 True True
Cancelled True boolean bool True True
Carrier True keyword object True True
Dest True keyword object True True
DestAirportID True keyword object True True
DestCityName True keyword object True True
DestCountry True keyword object True True
DestLocation True geo_point object True True
DestRegion True keyword object True True
DestWeather True keyword object True True
DistanceKilometers True float float64 True True
DistanceMiles True float float64 True True
FlightDelay True boolean bool True True
FlightDelayMin True integer int64 True True
FlightDelayType True keyword object True True
FlightNum True keyword object True True
FlightTimeHour True float float64 True True
FlightTimeMin True float float64 True True
Origin True keyword object True True
OriginAirportID True keyword object True True
OriginCityName True keyword object True True
OriginCountry True keyword object True True
OriginLocation True geo_point object True True
OriginRegion True keyword object True True
OriginWeather True keyword object True True
dayOfWeek True integer int64 True True
timestamp True date datetime64[ns] True True
date_fields_format: {}
Operations:
tasks: [('boolean_filter': ('boolean_filter': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}})), ('tail': ('sort_field': '_doc', 'count': 5))]
size: 5
@ -541,4 +541,5 @@ class Mappings:

    def info_es(self, buf):
        buf.write("Mappings:\n")
        buf.write(" capabilities: {0}\n".format(self._mappings_capabilities))
        buf.write(" capabilities: {0}\n".format(self._mappings_capabilities.to_string()))
        buf.write(" date_fields_format: {0}\n".format(self._date_fields_format))
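The switch to `.to_string()` above matters because interpolating a DataFrame with `format()` uses its default string form, which pandas truncates according to display options, whereas `to_string()` renders every row; that is why the `info_es` docstring earlier in this diff now shows all 27 capability rows. A small standalone illustration (not eland code):

```python
import pandas as pd

df = pd.DataFrame({"a": range(100)})
pd.set_option("display.max_rows", 10)

print("{0}".format(df))              # truncated: head and tail with an ellipsis row
print("{0}".format(df.to_string()))  # all 100 rows, as info_es now writes
```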
@ -564,7 +564,7 @@ class ElandQueryCompiler:
            raise ValueError(
                "Can not perform arithmetic operations on non aggregatable fields"
                "One of [{}, {}] is not aggregatable.".format(self_field, right_field)
            )
        )

    def arithmetic_op_fields(self, new_field_name, op, left_field, right_field, op_type=None):
        result = self.copy()
@ -667,6 +667,7 @@ class ElandQueryCompiler:
        buf.write("'field_to_display_names': {}\n".format(self._field_to_display_names))
        buf.write("'display_to_field_names': {}\n".format(self._display_to_field_names))


def elasticsearch_date_to_pandas_date(value: Union[int, str], date_format: str) -> pd.Timestamp:
    """
    Given a specific Elasticsearch format for a date datatype, returns the
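The new `elasticsearch_date_to_pandas_date` helper is only partially visible here (its docstring is cut off in the hunk). As a hedged sketch of the kind of conversion such a helper performs, assuming just two cases, epoch milliseconds and parseable date strings (the real eland function handles the full range of Elasticsearch date formats, and the name below is illustrative):

```python
from typing import Union

import pandas as pd


def es_date_to_timestamp(value: Union[int, str], date_format: str) -> pd.Timestamp:
    # Illustrative only; not the eland implementation.
    if date_format == "epoch_millis":
        return pd.to_datetime(value, unit="ms")
    # Fall back to pandas' parser for ISO-8601 style strings.
    return pd.to_datetime(value)
```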
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -280,6 +280,7 @@ def read_csv(filepath_or_buffer,
    kwds = dict()

    kwds.update(
        sep=sep,
        delimiter=delimiter,
        engine=engine,
        dialect=dialect,
File diff suppressed because one or more lines are too long
@ -1,17 +0,0 @@
# Example Walkthrough for eland

This example demonstrates the functionality of `eland` through a walkthrough of a simple analysis of the [Online Retail Dataset](https://archive.ics.uci.edu/ml/datasets/online+retail).

To run this example, make sure that you have an Elasticsearch cluster running on port 9200 and install the additional dependencies beyond `eland` itself:

```
pip install -r requirements-example.txt
```

Once these requirements are satisfied, load the data using the provided script:

```
python load.py
```

This will create an index called `online-retail` with a mapping defined in `load.py`.
@ -1,149 +0,0 @@
# Copyright 2019 Elasticsearch BV
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import csv

from elasticsearch import Elasticsearch, helpers
from elasticsearch.exceptions import TransportError


def create_index(es, index):
    mapping = {
        "mappings": {
            "properties": {
                "invoice_no": {"type": "keyword"},
                "stock_code": {"type": "keyword"},
                "description": {"type": "keyword"},
                "quantity": {"type": "integer"},
                "invoice_date": {"type": "date", "format": "MM/dd/yyyy HH:mm"},
                "unit_price": {"type": "float"},
                "customer_id": {"type": "keyword"},
                "country": {"type": "keyword"}
            }
        }
    }

    # create an empty index
    try:
        es.indices.create(index=index, body=mapping)
    except TransportError as e:
        # ignore already existing index
        if e.error == "resource_already_exists_exception":
            pass
        else:
            raise


def parse_date(date):
    """
    we need to convert dates to conform to the mapping in the following way:
    months: one or two digit ints -> MM
    days: one or two digit ints -> dd
    years: two digit ints -> yyyy
    times: {H}H:mm -> HH:mm
    """

    date = date.split("/")

    month = date[0] if len(date[0]) == 2 else "0{}".format(date[0])

    day = date[1] if len(date[1]) == 2 else "0{}".format(date[1])

    year = date[2].split(" ")[0]
    year = "20{}".format(year)

    time = date[2].split(" ")[1]
    time = time if len(time) == 5 else "0{}".format(time)

    date = "{}/{}/{} {}".format(month, day, year, time)

    return date


def parse_line(line):
    """
    creates the document to be indexed
    """
    obj = {
        "invoice_no": line[0],
        "stock_code": line[1],
        "description": line[2],
        "quantity": line[3],
        "invoice_date": parse_date(line[4]),
        "unit_price": line[5],
        "customer_id": line[6],
        "country": line[7].replace("\n", "")
    }

    return obj


def load_data(es):
    """
    generate one document per line of online-retail.csv
    read file line by line to avoid loading all data into memory
    """

    create_index(es, "online-retail")

    header = True
    with open("data/online-retail.csv", "r") as f:
        reader = csv.reader(f, quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL)
        for line in reader:
            if header:
                header = False
                continue
            doc = parse_line(line)

            yield doc


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-H",
        "--host",
        action="store",
        default="localhost:9200",
        help="The elasticsearch host you wish to connect to. (Default: localhost:9200)"
    )

    args = parser.parse_args()

    # create the elasticsearch client, pointing to the host parameter
    es = Elasticsearch(args.host)
    index = 'online-retail'

    # load data from online retail csv in data directory
    stream = load_data(es)
    for ok, result in helpers.streaming_bulk(
            es,
            actions=stream,
            index=index,
            chunk_size=1000
    ):
        action, result = result.popitem()
        doc_id = "/{}/doc/{}".format(index, result['_id'])

        if not ok:
            print("Failed to {} document {} {}".format(action, doc_id, result))
        else:
            print(doc_id)

    # make docs available for searches
    es.indices.refresh(index=index)

    # notify user of number of documents indexed
    print(es.count(index=index)["count"], "documents in index")
@ -1,80 +0,0 @@
alabaster==0.7.12
appnope==0.1.0
atomicwrites==1.3.0
attrs==19.3.0
Babel==2.7.0
backcall==0.1.0
bleach==3.1.0
certifi==2019.9.11
chardet==3.0.4
cycler==0.10.0
decorator==4.4.1
defusedxml==0.6.0
docutils==0.15.2
eland==0.1
elasticsearch==7.1.0
entrypoints==0.3
idna==2.8
imagesize==1.1.0
importlib-metadata==0.23
ipykernel==5.1.3
ipython==7.9.0
ipython-genutils==0.2.0
ipywidgets==7.5.1
jedi==0.15.1
Jinja2==2.10.3
jsonschema==3.1.1
jupyter==1.0.0
jupyter-client==5.3.4
jupyter-console==6.0.0
jupyter-core==4.6.1
kiwisolver==1.1.0
MarkupSafe==1.1.1
matplotlib==3.1.1
mistune==0.8.4
more-itertools==7.2.0
nbconvert==5.6.1
nbformat==4.4.0
notebook==6.0.2
numpy==1.17.4
numpydoc==0.8.0
packaging==19.2
pandas==0.25.1
pandocfilters==1.4.2
parso==0.5.1
pexpect==4.7.0
pickleshare==0.7.5
pluggy==0.13.0
prometheus-client==0.7.1
prompt-toolkit==2.0.10
ptyprocess==0.6.0
py==1.8.0
Pygments==2.4.2
pyparsing==2.4.5
pyrsistent==0.15.5
pytest==5.2.2
python-dateutil==2.8.1
pytz==2019.3
pyzmq==18.1.1
qtconsole==4.5.5
requests==2.22.0
Send2Trash==1.5.0
six==1.13.0
snowballstemmer==2.0.0
Sphinx==2.2.1
sphinx-rtd-theme==0.4.3
sphinxcontrib-applehelp==1.0.1
sphinxcontrib-devhelp==1.0.1
sphinxcontrib-htmlhelp==1.0.2
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.2
sphinxcontrib-serializinghtml==1.1.3
terminado==0.8.3
testpath==0.4.4
tornado==6.0.3
traitlets==4.3.3
urllib3==1.25.7
wcwidth==0.1.7
webencodings==0.5.1
widgetsnbextension==3.5.1
zipp==0.6.0
@ -2,6 +2,9 @@

python setup.py install

jupyter nbconvert --to notebook --inplace --execute docs/source/examples/demo_notebook.ipynb
jupyter nbconvert --to notebook --inplace --execute docs/source/examples/online_retail_analysis.ipynb

cd docs

make clean
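The two `jupyter nbconvert --to notebook --inplace --execute` calls added above re-run the example notebooks in place so their committed outputs stay current before the docs are built. The same step can be expressed through nbconvert's Python API; a rough equivalent for one of the notebooks (paths taken from the script above):

```python
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor

path = "docs/source/examples/demo_notebook.ipynb"
nb = nbformat.read(path, as_version=4)

# Execute every cell, resolving relative paths against the notebook's directory.
ExecutePreprocessor(timeout=600).preprocess(
    nb, {"metadata": {"path": "docs/source/examples"}}
)

nbformat.write(nb, path)  # overwrite in place, like --inplace
```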
@ -2,4 +2,5 @@ elasticsearch>=7.0.5
pandas==0.25.1
matplotlib
pytest>=5.2.1
nbval
numpydoc==0.8