Added example notebooks + pytest for notebooks (#87)

* Added example notebooks + pytest for these notebooks * Fixed paths * Fixing link in docs * Adding cleaner demo_notebook
2025-07-11 00:02:14 +08:00 · 2019-12-10 15:27:13 +01:00 · 2019-12-10 15:27:13 +01:00 · 133b227b93
commit 133b227b93
parent 206276c5fa
20 changed files with 5105 additions and 12376 deletions
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -154,9 +154,9 @@ currently using a minimum version of PyCharm 2019.2.4.
 -   Setup Elasticsearch instance (assumes `localhost:9200`), and run
    `python -m eland.tests.setup_tests` to setup test environment -*note
    this modifies Elasticsearch indices*
-   Run `pytest --doctest-modules` to validate install
+-   Run `pytest --nbval --doctest-modules` to validate install
 ### Documentation
 -   Install documentation requirements. Open terminal in virtual
-    environment and run `pip install -r requirements-dev.txt`
+    environment and run `pip install -r docs/requirements-docs.txt`
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@ -4,3 +4,4 @@ matplotlib
 pytest>=5.2.1
 git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master
 numpydoc==0.8
 nbsphinx
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -55,6 +55,7 @@ extensions = [
    'numpydoc',
    "matplotlib.sphinxext.plot_directive",
    "sphinx.ext.todo",
    "nbsphinx",
 ]
 doctest_global_setup = '''
@ -91,7 +92,7 @@ templates_path = ['_templates']
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = []
+exclude_patterns = ['**.ipynb_checkpoints']
 # -- Options for HTML output -------------------------------------------------
--- a/docs/source/examples/data/online-retail.csv.gz
+++ b/docs/source/examples/data/online-retail.csv.gz
--- a/docs/source/examples/demo_notebook.ipynb
+++ b/docs/source/examples/demo_notebook.ipynb
--- a/docs/source/examples/index.rst
+++ b/docs/source/examples/index.rst
@ -0,0 +1,11 @@
 .. _examples:
 ========
 Examples
 ========
 .. toctree::
   :maxdepth: 2
   demo_notebook
   online_retail_analysis
--- a/docs/source/examples/online_retail_analysis.ipynb
+++ b/docs/source/examples/online_retail_analysis.ipynb
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@ -25,6 +25,8 @@ In general, the data resides in elasticsearch and not in memory, which allows el
   reference/index
   implementation/index
   development/index
   examples/index
 * :doc:`reference/index`
@ -43,3 +45,6 @@ In general, the data resides in elasticsearch and not in memory, which allows el
 * :doc:`development/index`
  * :doc:`development/contributing`
 * :doc:`examples/index`
--- a/eland/dataframe.py
+++ b/eland/dataframe.py
@ -312,8 +312,7 @@ class DataFrame(NDFrame):
                max_rows = min_rows
            return self.to_html(max_rows=max_rows, max_cols=max_cols,
-                                show_dimensions=show_dimensions, notebook=True,
+                                show_dimensions=show_dimensions, notebook=True)
                                bold_rows=False)  # set for consistency with pandas output
        else:
            return None
@ -384,20 +383,35 @@ class DataFrame(NDFrame):
         index_field: _id
         is_source_field: False
        Mappings:
-         capabilities:                 _source   es_dtype        pd_dtype  searchable  aggregatable
+         capabilities:                     _source   es_dtype        pd_dtype  searchable  aggregatable
-        AvgTicketPrice     True      float         float64        True          True
+        AvgTicketPrice         True      float         float64        True          True
-        Cancelled          True    boolean            bool        True          True
+        Cancelled              True    boolean            bool        True          True
-        Carrier            True    keyword          object        True          True
+        Carrier                True    keyword          object        True          True
-        Dest               True    keyword          object        True          True
+        Dest                   True    keyword          object        True          True
-        DestAirportID      True    keyword          object        True          True
+        DestAirportID          True    keyword          object        True          True
-        ...                 ...        ...             ...         ...           ...
+        DestCityName           True    keyword          object        True          True
-        OriginLocation     True  geo_point          object        True          True
+        DestCountry            True    keyword          object        True          True
-        OriginRegion       True    keyword          object        True          True
+        DestLocation           True  geo_point          object        True          True
-        OriginWeather      True    keyword          object        True          True
+        DestRegion             True    keyword          object        True          True
-        dayOfWeek          True    integer           int64        True          True
+        DestWeather            True    keyword          object        True          True
-        timestamp          True       date  datetime64[ns]        True          True
+        DistanceKilometers     True      float         float64        True          True
-        <BLANKLINE>
+        DistanceMiles          True      float         float64        True          True
-        [27 rows x 5 columns]
+        FlightDelay            True    boolean            bool        True          True
        FlightDelayMin         True    integer           int64        True          True
        FlightDelayType        True    keyword          object        True          True
        FlightNum              True    keyword          object        True          True
        FlightTimeHour         True      float         float64        True          True
        FlightTimeMin          True      float         float64        True          True
        Origin                 True    keyword          object        True          True
        OriginAirportID        True    keyword          object        True          True
        OriginCityName         True    keyword          object        True          True
        OriginCountry          True    keyword          object        True          True
        OriginLocation         True  geo_point          object        True          True
        OriginRegion           True    keyword          object        True          True
        OriginWeather          True    keyword          object        True          True
        dayOfWeek              True    integer           int64        True          True
        timestamp              True       date  datetime64[ns]        True          True
         date_fields_format: {}
        Operations:
         tasks: [('boolean_filter': ('boolean_filter': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}})), ('tail': ('sort_field': '_doc', 'count': 5))]
         size: 5
--- a/eland/mappings.py
+++ b/eland/mappings.py
@ -541,4 +541,5 @@ class Mappings:
    def info_es(self, buf):
        buf.write("Mappings:\n")
-        buf.write(" capabilities: {0}\n".format(self._mappings_capabilities))
+        buf.write(" capabilities: {0}\n".format(self._mappings_capabilities.to_string()))
        buf.write(" date_fields_format: {0}\n".format(self._date_fields_format))
--- a/eland/query_compiler.py
+++ b/eland/query_compiler.py
@ -564,7 +564,7 @@ class ElandQueryCompiler:
            raise ValueError(
                "Can not perform arithmetic operations on non aggregatable fields"
                "One of [{}, {}] is not aggregatable.".format(self_field, right_field)
-        )
+            )
    def arithmetic_op_fields(self, new_field_name, op, left_field, right_field, op_type=None):
        result = self.copy()
@ -667,6 +667,7 @@ class ElandQueryCompiler:
            buf.write("'field_to_display_names': {}\n".format(self._field_to_display_names))
            buf.write("'display_to_field_names': {}\n".format(self._display_to_field_names))
 def elasticsearch_date_to_pandas_date(value: Union[int, str], date_format: str) -> pd.Timestamp:
    """
    Given a specific Elasticsearch format for a date datatype, returns the
--- a/eland/tests/Eland
+++ b/eland/tests/Eland
--- a/eland/tests/plotting/test_dataframe_hist_pytest.ipynb
+++ b/eland/tests/plotting/test_dataframe_hist_pytest.ipynb
--- a/eland/utils.py
+++ b/eland/utils.py
@ -280,6 +280,7 @@ def read_csv(filepath_or_buffer,
    kwds = dict()
    kwds.update(
        sep=sep,
        delimiter=delimiter,
        engine=engine,
        dialect=dialect,
--- a/Analysis.ipynb
+++ b/Analysis.ipynb
--- a/example/README.md
+++ b/example/README.md
@ -1,17 +0,0 @@
 # Example Walkthrough for eland
 This example demonstrates the functionality of `eland` through a walkthrough of a simple analysis of the [Online Retail Dataset](https://archive.ics.uci.edu/ml/datasets/online+retail).
 To run this example, make sure that you have an elasticsearch cluster running on port 9200, then install the extra dependencies the example needs on top of `eland`:
 ```
 pip install -r requirements-example.txt
 ```
 Once these requirements are satisfied, load the data using the provided script:
 ```
 python load_data.py
 ```
 This will create an index called `online-retail` with a mapping defined in `load_data.py`.
--- a/example/load_data.py
+++ b/example/load_data.py
@ -1,149 +0,0 @@
 #  Copyright 2019 Elasticsearch BV
 #
 #      Licensed under the Apache License, Version 2.0 (the "License");
 #      you may not use this file except in compliance with the License.
 #      You may obtain a copy of the License at
 #
 #          http://www.apache.org/licenses/LICENSE-2.0
 #
 #      Unless required by applicable law or agreed to in writing, software
 #      distributed under the License is distributed on an "AS IS" BASIS,
 #      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #      See the License for the specific language governing permissions and
 #      limitations under the License.
 import argparse
 import csv
 from elasticsearch import Elasticsearch, helpers
 from elasticsearch.exceptions import TransportError
 def create_index(es, index):
     """
     Create ``index`` through ``es.indices.create`` with a fixed field mapping.

     The mapping declares the invoice fields (keywords, an integer quantity,
     a float unit price and a ``MM/dd/yyyy HH:mm`` formatted date).

     A ``TransportError`` whose ``error`` attribute is
     ``resource_already_exists_exception`` is swallowed, so calling this for
     an index that is already present is harmless; every other
     ``TransportError`` is re-raised unchanged.
     """
     field_types = {
         "invoice_no": {"type": "keyword"},
         "stock_code": {"type": "keyword"},
         "description": {"type": "keyword"},
         "quantity": {"type": "integer"},
         "invoice_date": {"type": "date", "format": "MM/dd/yyyy HH:mm"},
         "unit_price": {"type": "float"},
         "customer_id": {"type": "keyword"},
         "country": {"type": "keyword"}
     }
     body = {"mappings": {"properties": field_types}}

     # the index is created empty; documents are added by the caller
     try:
         es.indices.create(index=index, body=body)
     except TransportError as err:
         # only the "index is already there" failure is tolerated
         if err.error != "resource_already_exists_exception":
             raise
 def parse_date(date):
     """
     Normalise a ``M/D/YY H:mm`` timestamp to ``MM/dd/yyyy HH:mm``.

     The result conforms to the ``invoice_date`` mapping format:
         months: one or two digit ints    -> MM
         days:   one or two digit ints    -> dd
         years:  two or four digit ints   -> yyyy
         times:  {H}H:mm                  -> HH:mm

     Years are expanded by prefixing "20"; a year that is already four
     digits long is kept as is (it used to become e.g. "202010").

     Parameters
     ----------
     date: str
         Timestamp of the form ``month/day/year time``.

     Returns
     -------
     str
         The zero-padded timestamp.

     Raises
     ------
     IndexError
         If ``date`` has fewer than three "/"-separated parts or no
         space-separated time after the year.
     """
     parts = date.split("/")
     month = parts[0].zfill(2)
     day = parts[1].zfill(2)

     year_and_time = parts[2].split(" ")
     year = year_and_time[0]
     if len(year) != 4:
         year = "20{}".format(year)

     # pad the hour only, so the rest of the time is passed through untouched
     hour, colon, rest = year_and_time[1].partition(":")
     time = "{}{}{}".format(hour.zfill(2), colon, rest)

     return "{}/{}/{} {}".format(month, day, year, time)
 def parse_line(line):
     """
     Turn one parsed CSV row into the document that gets indexed.

     The first eight columns of ``line`` are mapped, in order, to
     ``invoice_no``, ``stock_code``, ``description``, ``quantity``,
     ``invoice_date``, ``unit_price``, ``customer_id`` and ``country``.
     The date column goes through ``parse_date`` and newline characters
     are removed from the country column.
     """
     return {
         "invoice_no": line[0],
         "stock_code": line[1],
         "description": line[2],
         "quantity": line[3],
         "invoice_date": parse_date(line[4]),
         "unit_price": line[5],
         "customer_id": line[6],
         "country": line[7].replace("\n", ""),
     }
 def load_data(es):
     """
     Yield one document per record of ``data/online-retail.csv``.

     This is a generator: on first iteration the ``online-retail`` index is
     created through ``create_index``, then the file is read record by record
     so the whole data set is never loaded into memory. The first row is
     treated as the header and skipped.

     Parameters
     ----------
     es:
         Client object handed to ``create_index``.

     Yields
     ------
     dict
         The document built by ``parse_line`` for each data row.
     """
     create_index(es, "online-retail")
     # newline="" leaves newline handling to the csv module, as its
     # documentation requires for quoted fields with embedded newlines
     with open("data/online-retail.csv", "r", newline="") as f:
         reader = csv.reader(f, quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL)
         next(reader, None)  # skip the header row; no-op on an empty file
         for line in reader:
             yield parse_line(line)
 if __name__ == "__main__":
     # command line: only the elasticsearch host is configurable
     arg_parser = argparse.ArgumentParser()
     arg_parser.add_argument(
         "-H",
         "--host",
         action="store",
         default="localhost:9200",
         help="The elasticsearch host you wish to connect to. (Default: localhost:9200)"
     )
     host = arg_parser.parse_args().host

     # client pointing to the requested host
     es = Elasticsearch(host)
     index = 'online-retail'

     # documents are produced lazily from the csv in the data directory
     documents = load_data(es)
     bulk_results = helpers.streaming_bulk(
         es,
         actions=documents,
         index=index,
         chunk_size=1000
     )
     for ok, item in bulk_results:
         # each item holds a single {action: details} pair
         action, details = item.popitem()
         doc_id = "/{}/doc/{}".format(index, details['_id'])
         if ok:
             print(doc_id)
         else:
             print("Failed to {} document {} {}".format(action, doc_id, details))

     # make docs available for searches
     es.indices.refresh(index=index)

     # notify user of number of documents indexed
     print(es.count(index=index)["count"], "documents in index")
--- a/example/requirements-example.txt
+++ b/example/requirements-example.txt
@ -1,80 +0,0 @@
 alabaster==0.7.12
 appnope==0.1.0
 atomicwrites==1.3.0
 attrs==19.3.0
 Babel==2.7.0
 backcall==0.1.0
 bleach==3.1.0
 certifi==2019.9.11
 chardet==3.0.4
 cycler==0.10.0
 decorator==4.4.1
 defusedxml==0.6.0
 docutils==0.15.2
 eland==0.1
 elasticsearch==7.1.0
 entrypoints==0.3
 idna==2.8
 imagesize==1.1.0
 importlib-metadata==0.23
 ipykernel==5.1.3
 ipython==7.9.0
 ipython-genutils==0.2.0
 ipywidgets==7.5.1
 jedi==0.15.1
 Jinja2==2.10.3
 jsonschema==3.1.1
 jupyter==1.0.0
 jupyter-client==5.3.4
 jupyter-console==6.0.0
 jupyter-core==4.6.1
 kiwisolver==1.1.0
 MarkupSafe==1.1.1
 matplotlib==3.1.1
 mistune==0.8.4
 more-itertools==7.2.0
 nbconvert==5.6.1
 nbformat==4.4.0
 notebook==6.0.2
 numpy==1.17.4
 numpydoc==0.8.0
 packaging==19.2
 pandas==0.25.1
 pandocfilters==1.4.2
 parso==0.5.1
 pexpect==4.7.0
 pickleshare==0.7.5
 pluggy==0.13.0
 prometheus-client==0.7.1
 prompt-toolkit==2.0.10
 ptyprocess==0.6.0
 py==1.8.0
 Pygments==2.4.2
 pyparsing==2.4.5
 pyrsistent==0.15.5
 pytest==5.2.2
 python-dateutil==2.8.1
 pytz==2019.3
 pyzmq==18.1.1
 qtconsole==4.5.5
 requests==2.22.0
 Send2Trash==1.5.0
 six==1.13.0
 snowballstemmer==2.0.0
 Sphinx==2.2.1
 sphinx-rtd-theme==0.4.3
 sphinxcontrib-applehelp==1.0.1
 sphinxcontrib-devhelp==1.0.1
 sphinxcontrib-htmlhelp==1.0.2
 sphinxcontrib-jsmath==1.0.1
 sphinxcontrib-qthelp==1.0.2
 sphinxcontrib-serializinghtml==1.1.3
 terminado==0.8.3
 testpath==0.4.4
 tornado==6.0.3
 traitlets==4.3.3
 urllib3==1.25.7
 wcwidth==0.1.7
 webencodings==0.5.1
 widgetsnbextension==3.5.1
 zipp==0.6.0
--- a/make_docs.sh
+++ b/make_docs.sh
@ -2,6 +2,9 @@
 python setup.py install
 jupyter nbconvert --to notebook --inplace --execute docs/source/examples/demo_notebook.ipynb
 jupyter nbconvert --to notebook --inplace --execute docs/source/examples/online_retail_analysis.ipynb 
 cd docs
 make clean
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@ -2,4 +2,5 @@ elasticsearch>=7.0.5
 pandas==0.25.1
 matplotlib
 pytest>=5.2.1
 nbval
 numpydoc==0.8