mirror of
https://github.com/elastic/eland.git
synced 2025-07-24 00:00:39 +08:00
Merge pull request #42 from blaklaybul/example
Example Analysis of Online Retail
This commit is contained in:
commit
8e72a97dc5
1105
example/Online Retail Analysis.ipynb
Normal file
File diff suppressed because one or more lines are too long
17
example/README.md
Normal file
@@ -0,0 +1,17 @@
# Example Walkthrough for eland

This example demonstrates the functionality of `eland` through a walkthrough of a simple analysis of the [Online Retail Dataset](https://archive.ics.uci.edu/ml/datasets/online+retail).

To run this example, make sure you have an Elasticsearch cluster running on port 9200, then install the additional dependencies needed beyond `eland` itself:

```
pip install -r requirements-example.txt
```

Once these requirements are satisfied, load the data using the provided script:

```
python load_data.py
```

This will create an index called `online-retail` with the mapping defined in `load_data.py`.
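Once the data is indexed, the accompanying notebook explores it through eland's pandas-like API. As a rough sketch of the kind of access the walkthrough demonstrates (the exact `DataFrame` constructor signature in eland 0.1 may differ, so treat this as an illustration rather than the notebook's verbatim code):

```
import eland as ed

# Attach a DataFrame-like view to the index; operations on it are
# translated into Elasticsearch queries, so the data stays in the cluster.
df = ed.DataFrame("localhost:9200", "online-retail")
print(df.head())
```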
131
example/load_data.py
Normal file
@@ -0,0 +1,131 @@
```
import argparse
import csv

from elasticsearch import Elasticsearch, helpers
from elasticsearch.exceptions import TransportError


def create_index(es, index):
    mapping = {
        "mappings": {
            "properties": {
                "invoice_no": {"type": "keyword"},
                "stock_code": {"type": "keyword"},
                "description": {"type": "keyword"},
                "quantity": {"type": "integer"},
                "invoice_date": {"type": "date", "format": "MM/dd/yyyy HH:mm"},
                "unit_price": {"type": "float"},
                "customer_id": {"type": "keyword"},
                "country": {"type": "keyword"}
            }
        }
    }

    # create an empty index
    try:
        es.indices.create(index=index, body=mapping)
    except TransportError as e:
        # ignore an already existing index
        if e.error == "resource_already_exists_exception":
            pass
        else:
            raise


def parse_date(date):
    """
    Convert dates to conform to the mapping in the following way:
      months: one or two digit ints -> MM
      days:   one or two digit ints -> dd
      years:  two digit ints        -> yyyy
      times:  {H}H:mm               -> HH:mm
    """
    date = date.split("/")

    month = date[0] if len(date[0]) == 2 else "0{}".format(date[0])
    day = date[1] if len(date[1]) == 2 else "0{}".format(date[1])

    year = date[2].split(" ")[0]
    year = "20{}".format(year)

    time = date[2].split(" ")[1]
    time = time if len(time) == 5 else "0{}".format(time)

    date = "{}/{}/{} {}".format(month, day, year, time)

    return date
```
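To make the conversion concrete, here is what `parse_date` produces for a typical raw timestamp (the sample value is hypothetical but follows the dataset's `M/d/yy H:mm` style):

```
parse_date("1/4/11 8:26")
# -> "01/04/2011 08:26": month, day, and hour zero-padded, century prepended
```

The remainder of `load_data.py` builds one document per CSV row and bulk-indexes the stream: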
```
def parse_line(line):
    """
    creates the document to be indexed
    """
    obj = {
        "invoice_no": line[0],
        "stock_code": line[1],
        "description": line[2],
        "quantity": line[3],
        "invoice_date": parse_date(line[4]),
        "unit_price": line[5],
        "customer_id": line[6],
        "country": line[7].replace("\n", "")
    }

    return obj


def load_data(es):
    """
    generate one document per line of online-retail.csv;
    read the file line by line to avoid loading all data into memory
    """
    create_index(es, "online-retail")

    header = True
    with open("data/online-retail.csv", "r") as f:
        reader = csv.reader(f, quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL)
        for line in reader:
            if header:
                header = False
                continue
            doc = parse_line(line)
            yield doc


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-H",
        "--host",
        action="store",
        default="localhost:9200",
        help="The elasticsearch host you wish to connect to. (Default: localhost:9200)"
    )

    args = parser.parse_args()

    # create the elasticsearch client, pointing to the host parameter
    es = Elasticsearch(args.host)
    index = 'online-retail'

    # load data from the online retail csv in the data directory
    stream = load_data(es)
    for ok, result in helpers.streaming_bulk(
        es,
        actions=stream,
        index=index,
        chunk_size=1000
    ):
        action, result = result.popitem()
        doc_id = "/{}/doc/{}".format(index, result['_id'])

        if not ok:
            print("Failed to {} document {} {}".format(action, doc_id, result))
        else:
            print(doc_id)

    # make docs available for searches
    es.indices.refresh(index=index)

    # notify user of number of documents indexed
    print(es.count(index=index)["count"], "documents in index")
```
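After the loader finishes, a quick sanity check can confirm the index looks as expected. A minimal sketch, assuming a local cluster on the default port:

```
from elasticsearch import Elasticsearch

es = Elasticsearch("localhost:9200")

# Document count should equal the number of CSV rows indexed.
print(es.count(index="online-retail")["count"])

# The mapping should match the one defined in create_index().
print(es.indices.get_mapping(index="online-retail"))
```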
80
example/requirements-example.txt
Normal file
@@ -0,0 +1,80 @@
```
alabaster==0.7.12
appnope==0.1.0
atomicwrites==1.3.0
attrs==19.3.0
Babel==2.7.0
backcall==0.1.0
bleach==3.1.0
certifi==2019.9.11
chardet==3.0.4
cycler==0.10.0
decorator==4.4.1
defusedxml==0.6.0
docutils==0.15.2
eland==0.1
elasticsearch==7.1.0
entrypoints==0.3
idna==2.8
imagesize==1.1.0
importlib-metadata==0.23
ipykernel==5.1.3
ipython==7.9.0
ipython-genutils==0.2.0
ipywidgets==7.5.1
jedi==0.15.1
Jinja2==2.10.3
jsonschema==3.1.1
jupyter==1.0.0
jupyter-client==5.3.4
jupyter-console==6.0.0
jupyter-core==4.6.1
kiwisolver==1.1.0
MarkupSafe==1.1.1
matplotlib==3.1.1
mistune==0.8.4
more-itertools==7.2.0
nbconvert==5.6.1
nbformat==4.4.0
notebook==6.0.2
numpy==1.17.4
numpydoc==0.8.0
packaging==19.2
pandas==0.25.1
pandocfilters==1.4.2
parso==0.5.1
pexpect==4.7.0
pickleshare==0.7.5
pluggy==0.13.0
prometheus-client==0.7.1
prompt-toolkit==2.0.10
ptyprocess==0.6.0
py==1.8.0
Pygments==2.4.2
pyparsing==2.4.5
pyrsistent==0.15.5
pytest==5.2.2
python-dateutil==2.8.1
pytz==2019.3
pyzmq==18.1.1
qtconsole==4.5.5
requests==2.22.0
Send2Trash==1.5.0
six==1.13.0
snowballstemmer==2.0.0
Sphinx==2.2.1
sphinx-rtd-theme==0.4.3
sphinxcontrib-applehelp==1.0.1
sphinxcontrib-devhelp==1.0.1
sphinxcontrib-htmlhelp==1.0.2
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.2
sphinxcontrib-serializinghtml==1.1.3
terminado==0.8.3
testpath==0.4.4
tornado==6.0.3
traitlets==4.3.3
urllib3==1.25.7
wcwidth==0.1.7
webencodings==0.5.1
widgetsnbextension==3.5.1
zipp==0.6.0
```