mirror of
https://github.com/elastic/eland.git
synced 2025-07-24 00:00:39 +08:00
Merge pull request #42 from blaklaybul/example
Example Analysis of Online Retail
This commit is contained in:
commit
8e72a97dc5
1105
example/Online Retail Analysis.ipynb
Normal file
File diff suppressed because one or more lines are too long
17
example/README.md
Normal file
@@ -0,0 +1,17 @@
# Example Walkthrough for eland

This example demonstrates the functionality of `eland` through a walkthrough of a simple analysis of the [Online Retail Dataset](https://archive.ics.uci.edu/ml/datasets/online+retail).

To run this example, make sure you have an Elasticsearch cluster running on port 9200, then install the additional dependencies needed beyond `eland` itself:

```
pip install -r requirements-example.txt
```

Once these requirements are satisfied, load the data using the provided script:

```
python load_data.py
```

This will create an index called `online-retail` with the mapping defined in `load_data.py`.
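Once the data is indexed, the accompanying notebook explores it through eland's pandas-like API. As a rough sketch of the kind of access the walkthrough demonstrates (the exact `DataFrame` constructor signature in eland 0.1 may differ, so treat this as an illustration rather than the notebook's verbatim code):

```
import eland as ed

# Attach a DataFrame-like view to the index; operations on it are
# translated into Elasticsearch queries, so the data stays in the cluster.
df = ed.DataFrame("localhost:9200", "online-retail")
print(df.head())
```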
131
example/load_data.py
Normal file
@@ -0,0 +1,131 @@
```
import argparse
import csv

from elasticsearch import Elasticsearch, helpers
from elasticsearch.exceptions import TransportError


def create_index(es, index):
    mapping = {
        "mappings": {
            "properties": {
                "invoice_no": {"type": "keyword"},
                "stock_code": {"type": "keyword"},
                "description": {"type": "keyword"},
                "quantity": {"type": "integer"},
                "invoice_date": {"type": "date", "format": "MM/dd/yyyy HH:mm"},
                "unit_price": {"type": "float"},
                "customer_id": {"type": "keyword"},
                "country": {"type": "keyword"}
            }
        }
    }

    # create an empty index
    try:
        es.indices.create(index=index, body=mapping)
    except TransportError as e:
        # ignore an already existing index
        if e.error == "resource_already_exists_exception":
            pass
        else:
            raise


def parse_date(date):
    """
    Convert dates to conform to the mapping in the following way:
      months: one or two digit ints -> MM
      days:   one or two digit ints -> dd
      years:  two digit ints        -> yyyy
      times:  {H}H:mm               -> HH:mm
    """
    date = date.split("/")

    month = date[0] if len(date[0]) == 2 else "0{}".format(date[0])
    day = date[1] if len(date[1]) == 2 else "0{}".format(date[1])

    year = date[2].split(" ")[0]
    year = "20{}".format(year)

    time = date[2].split(" ")[1]
    time = time if len(time) == 5 else "0{}".format(time)

    date = "{}/{}/{} {}".format(month, day, year, time)

    return date
```
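To make the conversion concrete, here is what `parse_date` produces for a typical raw timestamp (the sample value is hypothetical but follows the dataset's `M/d/yy H:mm` style):

```
parse_date("1/4/11 8:26")
# -> "01/04/2011 08:26": month, day, and hour zero-padded, century prepended
```

The remainder of `load_data.py` builds one document per CSV row and bulk-indexes the stream: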
```
def parse_line(line):
    """
    creates the document to be indexed
    """
    obj = {
        "invoice_no": line[0],
        "stock_code": line[1],
        "description": line[2],
        "quantity": line[3],
        "invoice_date": parse_date(line[4]),
        "unit_price": line[5],
        "customer_id": line[6],
        "country": line[7].replace("\n", "")
    }

    return obj


def load_data(es):
    """
    generate one document per line of online-retail.csv;
    read the file line by line to avoid loading all data into memory
    """
    create_index(es, "online-retail")

    header = True
    with open("data/online-retail.csv", "r") as f:
        reader = csv.reader(f, quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL)
        for line in reader:
            if header:
                header = False
                continue
            doc = parse_line(line)
            yield doc


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-H",
        "--host",
        action="store",
        default="localhost:9200",
        help="The elasticsearch host you wish to connect to. (Default: localhost:9200)"
    )

    args = parser.parse_args()

    # create the elasticsearch client, pointing to the host parameter
    es = Elasticsearch(args.host)
    index = 'online-retail'

    # load data from the online retail csv in the data directory
    stream = load_data(es)
    for ok, result in helpers.streaming_bulk(
        es,
        actions=stream,
        index=index,
        chunk_size=1000
    ):
        action, result = result.popitem()
        doc_id = "/{}/doc/{}".format(index, result['_id'])

        if not ok:
            print("Failed to {} document {} {}".format(action, doc_id, result))
        else:
            print(doc_id)

    # make docs available for searches
    es.indices.refresh(index=index)

    # notify user of number of documents indexed
    print(es.count(index=index)["count"], "documents in index")
```
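After the loader finishes, a quick sanity check can confirm the index looks as expected. A minimal sketch, assuming a local cluster on the default port:

```
from elasticsearch import Elasticsearch

es = Elasticsearch("localhost:9200")

# Document count should equal the number of CSV rows indexed.
print(es.count(index="online-retail")["count"])

# The mapping should match the one defined in create_index().
print(es.indices.get_mapping(index="online-retail"))
```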
80
example/requirements-example.txt
Normal file
@@ -0,0 +1,80 @@
```
alabaster==0.7.12
appnope==0.1.0
atomicwrites==1.3.0
attrs==19.3.0
Babel==2.7.0
backcall==0.1.0
bleach==3.1.0
certifi==2019.9.11
chardet==3.0.4
cycler==0.10.0
decorator==4.4.1
defusedxml==0.6.0
docutils==0.15.2
eland==0.1
elasticsearch==7.1.0
entrypoints==0.3
idna==2.8
imagesize==1.1.0
importlib-metadata==0.23
ipykernel==5.1.3
ipython==7.9.0
ipython-genutils==0.2.0
ipywidgets==7.5.1
jedi==0.15.1
Jinja2==2.10.3
jsonschema==3.1.1
jupyter==1.0.0
jupyter-client==5.3.4
jupyter-console==6.0.0
jupyter-core==4.6.1
kiwisolver==1.1.0
MarkupSafe==1.1.1
matplotlib==3.1.1
mistune==0.8.4
more-itertools==7.2.0
nbconvert==5.6.1
nbformat==4.4.0
notebook==6.0.2
numpy==1.17.4
numpydoc==0.8.0
packaging==19.2
pandas==0.25.1
pandocfilters==1.4.2
parso==0.5.1
pexpect==4.7.0
pickleshare==0.7.5
pluggy==0.13.0
prometheus-client==0.7.1
prompt-toolkit==2.0.10
ptyprocess==0.6.0
py==1.8.0
Pygments==2.4.2
pyparsing==2.4.5
pyrsistent==0.15.5
pytest==5.2.2
python-dateutil==2.8.1
pytz==2019.3
pyzmq==18.1.1
qtconsole==4.5.5
requests==2.22.0
Send2Trash==1.5.0
six==1.13.0
snowballstemmer==2.0.0
Sphinx==2.2.1
sphinx-rtd-theme==0.4.3
sphinxcontrib-applehelp==1.0.1
sphinxcontrib-devhelp==1.0.1
sphinxcontrib-htmlhelp==1.0.2
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.2
sphinxcontrib-serializinghtml==1.1.3
terminado==0.8.3
testpath==0.4.4
tornado==6.0.3
traitlets==4.3.3
urllib3==1.25.7
wcwidth==0.1.7
webencodings==0.5.1
widgetsnbextension==3.5.1
zipp==0.6.0
```