mirror of
https://github.com/elastic/eland.git
synced 2025-07-11 00:02:14 +08:00
Added example notebooks + pytest for notebooks (#87)
* Added example notebooks + pytest for these notebooks * Fixed paths * Fixing link in docs * Adding cleaner demo_notebook
This commit is contained in:
parent
206276c5fa
commit
133b227b93
@ -154,9 +154,9 @@ currently using a minimum version of PyCharm 2019.2.4.
|
|||||||
- Setup Elasticsearch instance (assumes `localhost:9200`), and run
|
- Setup Elasticsearch instance (assumes `localhost:9200`), and run
|
||||||
`python -m eland.tests.setup_tests` to setup test environment -*note
|
`python -m eland.tests.setup_tests` to setup test environment -*note
|
||||||
this modifies Elasticsearch indices*
|
this modifies Elasticsearch indices*
|
||||||
- Run `pytest --doctest-modules` to validate install
|
- Run `pytest --nbval --doctest-modules` to validate install
|
||||||
|
|
||||||
### Documentation
|
### Documentation
|
||||||
|
|
||||||
- Install documentation requirements. Open terminal in virtual
|
- Install documentation requirements. Open terminal in virtual
|
||||||
environment and run `pip install -r requirements-dev.txt`
|
environment and run `pip install -r docs/requirements-docs.txt`
|
||||||
|
@ -4,3 +4,4 @@ matplotlib
|
|||||||
pytest>=5.2.1
|
pytest>=5.2.1
|
||||||
git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master
|
git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master
|
||||||
numpydoc==0.8
|
numpydoc==0.8
|
||||||
|
nbsphinx
|
||||||
|
@ -55,6 +55,7 @@ extensions = [
|
|||||||
'numpydoc',
|
'numpydoc',
|
||||||
"matplotlib.sphinxext.plot_directive",
|
"matplotlib.sphinxext.plot_directive",
|
||||||
"sphinx.ext.todo",
|
"sphinx.ext.todo",
|
||||||
|
"nbsphinx",
|
||||||
]
|
]
|
||||||
|
|
||||||
doctest_global_setup = '''
|
doctest_global_setup = '''
|
||||||
@ -91,7 +92,7 @@ templates_path = ['_templates']
|
|||||||
# List of patterns, relative to source directory, that match files and
|
# List of patterns, relative to source directory, that match files and
|
||||||
# directories to ignore when looking for source files.
|
# directories to ignore when looking for source files.
|
||||||
# This pattern also affects html_static_path and html_extra_path.
|
# This pattern also affects html_static_path and html_extra_path.
|
||||||
exclude_patterns = []
|
exclude_patterns = ['**.ipynb_checkpoints']
|
||||||
|
|
||||||
# -- Options for HTML output -------------------------------------------------
|
# -- Options for HTML output -------------------------------------------------
|
||||||
|
|
||||||
|
BIN
docs/source/examples/data/online-retail.csv.gz
Normal file
BIN
docs/source/examples/data/online-retail.csv.gz
Normal file
Binary file not shown.
3583
docs/source/examples/demo_notebook.ipynb
Normal file
3583
docs/source/examples/demo_notebook.ipynb
Normal file
File diff suppressed because one or more lines are too long
11
docs/source/examples/index.rst
Normal file
11
docs/source/examples/index.rst
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
.. _examples:
|
||||||
|
|
||||||
|
========
|
||||||
|
Examples
|
||||||
|
========
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 2
|
||||||
|
|
||||||
|
demo_notebook
|
||||||
|
online_retail_analysis
|
1462
docs/source/examples/online_retail_analysis.ipynb
Normal file
1462
docs/source/examples/online_retail_analysis.ipynb
Normal file
File diff suppressed because one or more lines are too long
@ -25,6 +25,8 @@ In general, the data resides in elasticsearch and not in memory, which allows el
|
|||||||
reference/index
|
reference/index
|
||||||
implementation/index
|
implementation/index
|
||||||
development/index
|
development/index
|
||||||
|
examples/index
|
||||||
|
|
||||||
|
|
||||||
* :doc:`reference/index`
|
* :doc:`reference/index`
|
||||||
|
|
||||||
@ -43,3 +45,6 @@ In general, the data resides in elasticsearch and not in memory, which allows el
|
|||||||
* :doc:`development/index`
|
* :doc:`development/index`
|
||||||
|
|
||||||
* :doc:`development/contributing`
|
* :doc:`development/contributing`
|
||||||
|
|
||||||
|
* :doc:`examples/index`
|
||||||
|
|
||||||
|
@ -312,8 +312,7 @@ class DataFrame(NDFrame):
|
|||||||
max_rows = min_rows
|
max_rows = min_rows
|
||||||
|
|
||||||
return self.to_html(max_rows=max_rows, max_cols=max_cols,
|
return self.to_html(max_rows=max_rows, max_cols=max_cols,
|
||||||
show_dimensions=show_dimensions, notebook=True,
|
show_dimensions=show_dimensions, notebook=True)
|
||||||
bold_rows=False) # set for consistency with pandas output
|
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -384,20 +383,35 @@ class DataFrame(NDFrame):
|
|||||||
index_field: _id
|
index_field: _id
|
||||||
is_source_field: False
|
is_source_field: False
|
||||||
Mappings:
|
Mappings:
|
||||||
capabilities: _source es_dtype pd_dtype searchable aggregatable
|
capabilities: _source es_dtype pd_dtype searchable aggregatable
|
||||||
AvgTicketPrice True float float64 True True
|
AvgTicketPrice True float float64 True True
|
||||||
Cancelled True boolean bool True True
|
Cancelled True boolean bool True True
|
||||||
Carrier True keyword object True True
|
Carrier True keyword object True True
|
||||||
Dest True keyword object True True
|
Dest True keyword object True True
|
||||||
DestAirportID True keyword object True True
|
DestAirportID True keyword object True True
|
||||||
... ... ... ... ... ...
|
DestCityName True keyword object True True
|
||||||
OriginLocation True geo_point object True True
|
DestCountry True keyword object True True
|
||||||
OriginRegion True keyword object True True
|
DestLocation True geo_point object True True
|
||||||
OriginWeather True keyword object True True
|
DestRegion True keyword object True True
|
||||||
dayOfWeek True integer int64 True True
|
DestWeather True keyword object True True
|
||||||
timestamp True date datetime64[ns] True True
|
DistanceKilometers True float float64 True True
|
||||||
<BLANKLINE>
|
DistanceMiles True float float64 True True
|
||||||
[27 rows x 5 columns]
|
FlightDelay True boolean bool True True
|
||||||
|
FlightDelayMin True integer int64 True True
|
||||||
|
FlightDelayType True keyword object True True
|
||||||
|
FlightNum True keyword object True True
|
||||||
|
FlightTimeHour True float float64 True True
|
||||||
|
FlightTimeMin True float float64 True True
|
||||||
|
Origin True keyword object True True
|
||||||
|
OriginAirportID True keyword object True True
|
||||||
|
OriginCityName True keyword object True True
|
||||||
|
OriginCountry True keyword object True True
|
||||||
|
OriginLocation True geo_point object True True
|
||||||
|
OriginRegion True keyword object True True
|
||||||
|
OriginWeather True keyword object True True
|
||||||
|
dayOfWeek True integer int64 True True
|
||||||
|
timestamp True date datetime64[ns] True True
|
||||||
|
date_fields_format: {}
|
||||||
Operations:
|
Operations:
|
||||||
tasks: [('boolean_filter': ('boolean_filter': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}})), ('tail': ('sort_field': '_doc', 'count': 5))]
|
tasks: [('boolean_filter': ('boolean_filter': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}})), ('tail': ('sort_field': '_doc', 'count': 5))]
|
||||||
size: 5
|
size: 5
|
||||||
|
@ -541,4 +541,5 @@ class Mappings:
|
|||||||
|
|
||||||
def info_es(self, buf):
|
def info_es(self, buf):
|
||||||
buf.write("Mappings:\n")
|
buf.write("Mappings:\n")
|
||||||
buf.write(" capabilities: {0}\n".format(self._mappings_capabilities))
|
buf.write(" capabilities: {0}\n".format(self._mappings_capabilities.to_string()))
|
||||||
|
buf.write(" date_fields_format: {0}\n".format(self._date_fields_format))
|
||||||
|
@ -564,7 +564,7 @@ class ElandQueryCompiler:
|
|||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Can not perform arithmetic operations on non aggregatable fields"
|
"Can not perform arithmetic operations on non aggregatable fields"
|
||||||
"One of [{}, {}] is not aggregatable.".format(self_field, right_field)
|
"One of [{}, {}] is not aggregatable.".format(self_field, right_field)
|
||||||
)
|
)
|
||||||
|
|
||||||
def arithmetic_op_fields(self, new_field_name, op, left_field, right_field, op_type=None):
|
def arithmetic_op_fields(self, new_field_name, op, left_field, right_field, op_type=None):
|
||||||
result = self.copy()
|
result = self.copy()
|
||||||
@ -667,6 +667,7 @@ class ElandQueryCompiler:
|
|||||||
buf.write("'field_to_display_names': {}\n".format(self._field_to_display_names))
|
buf.write("'field_to_display_names': {}\n".format(self._field_to_display_names))
|
||||||
buf.write("'display_to_field_names': {}\n".format(self._display_to_field_names))
|
buf.write("'display_to_field_names': {}\n".format(self._display_to_field_names))
|
||||||
|
|
||||||
|
|
||||||
def elasticsearch_date_to_pandas_date(value: Union[int, str], date_format: str) -> pd.Timestamp:
|
def elasticsearch_date_to_pandas_date(value: Union[int, str], date_format: str) -> pd.Timestamp:
|
||||||
"""
|
"""
|
||||||
Given a specific Elasticsearch format for a date datatype, returns the
|
Given a specific Elasticsearch format for a date datatype, returns the
|
||||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -280,6 +280,7 @@ def read_csv(filepath_or_buffer,
|
|||||||
kwds = dict()
|
kwds = dict()
|
||||||
|
|
||||||
kwds.update(
|
kwds.update(
|
||||||
|
sep=sep,
|
||||||
delimiter=delimiter,
|
delimiter=delimiter,
|
||||||
engine=engine,
|
engine=engine,
|
||||||
dialect=dialect,
|
dialect=dialect,
|
||||||
|
File diff suppressed because one or more lines are too long
@ -1,17 +0,0 @@
|
|||||||
# Example Walkthrough for eland
|
|
||||||
|
|
||||||
This example demonstrates the functionality of `eland` through a walkthrough of a simple analysis of the [Online Retail Dataset](https://archive.ics.uci.edu/ml/datasets/online+retail).
|
|
||||||
|
|
||||||
To run this example, make sure that you have an Elasticsearch cluster running on port 9200, and install the additional dependencies required on top of `eland`:
|
|
||||||
|
|
||||||
```
|
|
||||||
pip install -r requirements-example.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
Once these requirements are satisfied, load the data using the provided script:
|
|
||||||
|
|
||||||
```
|
|
||||||
python load.py
|
|
||||||
```
|
|
||||||
|
|
||||||
This will create an index called `online-retail` with a mapping defined in `load.py`.
|
|
@ -1,149 +0,0 @@
|
|||||||
# Copyright 2019 Elasticsearch BV
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import csv
|
|
||||||
|
|
||||||
from elasticsearch import Elasticsearch, helpers
|
|
||||||
from elasticsearch.exceptions import TransportError
|
|
||||||
|
|
||||||
|
|
||||||
def create_index(es, index):
|
|
||||||
mapping = {
|
|
||||||
"mappings": {
|
|
||||||
"properties": {
|
|
||||||
"invoice_no": {"type": "keyword"},
|
|
||||||
"stock_code": {"type": "keyword"},
|
|
||||||
"description": {"type": "keyword"},
|
|
||||||
"quantity": {"type": "integer"},
|
|
||||||
"invoice_date": {"type": "date", "format": "MM/dd/yyyy HH:mm"},
|
|
||||||
"unit_price": {"type": "float"},
|
|
||||||
"customer_id": {"type": "keyword"},
|
|
||||||
"country": {"type": "keyword"}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
# create an empty index
|
|
||||||
try:
|
|
||||||
es.indices.create(index=index, body=mapping)
|
|
||||||
except TransportError as e:
|
|
||||||
# ignore already existing index
|
|
||||||
if e.error == "resource_already_exists_exception":
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
raise
|
|
||||||
|
|
||||||
|
|
||||||
def parse_date(date):
|
|
||||||
"""
|
|
||||||
we need to convert dates to conform to the mapping in the following way:
|
|
||||||
months: one or two digit ints -> MM
|
|
||||||
days: one or two digit ints -> dd
|
|
||||||
years: two digit ints -> yyyy
|
|
||||||
times: {H}H:mm -> HH:mm
|
|
||||||
"""
|
|
||||||
|
|
||||||
date = date.split("/")
|
|
||||||
|
|
||||||
month = date[0] if len(date[0]) == 2 else "0{}".format(date[0])
|
|
||||||
|
|
||||||
day = date[1] if len(date[1]) == 2 else "0{}".format(date[1])
|
|
||||||
|
|
||||||
year = date[2].split(" ")[0]
|
|
||||||
year = "20{}".format(year)
|
|
||||||
|
|
||||||
time = date[2].split(" ")[1]
|
|
||||||
time = time if len(time) == 5 else "0{}".format(time)
|
|
||||||
|
|
||||||
date = "{}/{}/{} {}".format(month, day, year, time)
|
|
||||||
|
|
||||||
return date
|
|
||||||
|
|
||||||
|
|
||||||
def parse_line(line):
|
|
||||||
"""
|
|
||||||
creates the document to be indexed
|
|
||||||
"""
|
|
||||||
obj = {
|
|
||||||
"invoice_no": line[0],
|
|
||||||
"stock_code": line[1],
|
|
||||||
"description": line[2],
|
|
||||||
"quantity": line[3],
|
|
||||||
"invoice_date": parse_date(line[4]),
|
|
||||||
"unit_price": line[5],
|
|
||||||
"customer_id": line[6],
|
|
||||||
"country": line[7].replace("\n", "")
|
|
||||||
}
|
|
||||||
|
|
||||||
return obj
|
|
||||||
|
|
||||||
|
|
||||||
def load_data(es):
|
|
||||||
"""
|
|
||||||
generate one document per line of online-retail.csv
|
|
||||||
read file line by line to avoid loading all data into memory
|
|
||||||
"""
|
|
||||||
|
|
||||||
create_index(es, "online-retail")
|
|
||||||
|
|
||||||
header = True
|
|
||||||
with open("data/online-retail.csv", "r") as f:
|
|
||||||
reader = csv.reader(f, quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL)
|
|
||||||
for line in reader:
|
|
||||||
if header:
|
|
||||||
header = False
|
|
||||||
continue
|
|
||||||
doc = parse_line(line)
|
|
||||||
|
|
||||||
yield doc
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"-H",
|
|
||||||
"--host",
|
|
||||||
action="store",
|
|
||||||
default="localhost:9200",
|
|
||||||
help="The elasticsearch host you wish to connect to. (Default: localhost:9200)"
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
# create the elasticsearch client, pointing to the host parameter
|
|
||||||
es = Elasticsearch(args.host)
|
|
||||||
index = 'online-retail'
|
|
||||||
|
|
||||||
# load data from online retail csv in data directory
|
|
||||||
stream = load_data(es)
|
|
||||||
for ok, result in helpers.streaming_bulk(
|
|
||||||
es,
|
|
||||||
actions=stream,
|
|
||||||
index=index,
|
|
||||||
chunk_size=1000
|
|
||||||
):
|
|
||||||
action, result = result.popitem()
|
|
||||||
doc_id = "/{}/doc/{}".format(index, result['_id'])
|
|
||||||
|
|
||||||
if not ok:
|
|
||||||
print("Failed to {} document {} {}".format(action, doc_id, result))
|
|
||||||
else:
|
|
||||||
print(doc_id)
|
|
||||||
|
|
||||||
# make docs available for searches
|
|
||||||
es.indices.refresh(index=index)
|
|
||||||
|
|
||||||
# notify user of number of documents indexed
|
|
||||||
print(es.count(index=index)["count"], "documents in index")
|
|
@ -1,80 +0,0 @@
|
|||||||
alabaster==0.7.12
|
|
||||||
appnope==0.1.0
|
|
||||||
atomicwrites==1.3.0
|
|
||||||
attrs==19.3.0
|
|
||||||
Babel==2.7.0
|
|
||||||
backcall==0.1.0
|
|
||||||
bleach==3.1.0
|
|
||||||
certifi==2019.9.11
|
|
||||||
chardet==3.0.4
|
|
||||||
cycler==0.10.0
|
|
||||||
decorator==4.4.1
|
|
||||||
defusedxml==0.6.0
|
|
||||||
docutils==0.15.2
|
|
||||||
eland==0.1
|
|
||||||
elasticsearch==7.1.0
|
|
||||||
entrypoints==0.3
|
|
||||||
idna==2.8
|
|
||||||
imagesize==1.1.0
|
|
||||||
importlib-metadata==0.23
|
|
||||||
ipykernel==5.1.3
|
|
||||||
ipython==7.9.0
|
|
||||||
ipython-genutils==0.2.0
|
|
||||||
ipywidgets==7.5.1
|
|
||||||
jedi==0.15.1
|
|
||||||
Jinja2==2.10.3
|
|
||||||
jsonschema==3.1.1
|
|
||||||
jupyter==1.0.0
|
|
||||||
jupyter-client==5.3.4
|
|
||||||
jupyter-console==6.0.0
|
|
||||||
jupyter-core==4.6.1
|
|
||||||
kiwisolver==1.1.0
|
|
||||||
MarkupSafe==1.1.1
|
|
||||||
matplotlib==3.1.1
|
|
||||||
mistune==0.8.4
|
|
||||||
more-itertools==7.2.0
|
|
||||||
nbconvert==5.6.1
|
|
||||||
nbformat==4.4.0
|
|
||||||
notebook==6.0.2
|
|
||||||
numpy==1.17.4
|
|
||||||
numpydoc==0.8.0
|
|
||||||
packaging==19.2
|
|
||||||
pandas==0.25.1
|
|
||||||
pandocfilters==1.4.2
|
|
||||||
parso==0.5.1
|
|
||||||
pexpect==4.7.0
|
|
||||||
pickleshare==0.7.5
|
|
||||||
pluggy==0.13.0
|
|
||||||
prometheus-client==0.7.1
|
|
||||||
prompt-toolkit==2.0.10
|
|
||||||
ptyprocess==0.6.0
|
|
||||||
py==1.8.0
|
|
||||||
Pygments==2.4.2
|
|
||||||
pyparsing==2.4.5
|
|
||||||
pyrsistent==0.15.5
|
|
||||||
pytest==5.2.2
|
|
||||||
python-dateutil==2.8.1
|
|
||||||
pytz==2019.3
|
|
||||||
pyzmq==18.1.1
|
|
||||||
qtconsole==4.5.5
|
|
||||||
requests==2.22.0
|
|
||||||
Send2Trash==1.5.0
|
|
||||||
six==1.13.0
|
|
||||||
snowballstemmer==2.0.0
|
|
||||||
Sphinx==2.2.1
|
|
||||||
sphinx-rtd-theme==0.4.3
|
|
||||||
sphinxcontrib-applehelp==1.0.1
|
|
||||||
sphinxcontrib-devhelp==1.0.1
|
|
||||||
sphinxcontrib-htmlhelp==1.0.2
|
|
||||||
sphinxcontrib-jsmath==1.0.1
|
|
||||||
sphinxcontrib-qthelp==1.0.2
|
|
||||||
sphinxcontrib-serializinghtml==1.1.3
|
|
||||||
terminado==0.8.3
|
|
||||||
testpath==0.4.4
|
|
||||||
tornado==6.0.3
|
|
||||||
traitlets==4.3.3
|
|
||||||
urllib3==1.25.7
|
|
||||||
wcwidth==0.1.7
|
|
||||||
webencodings==0.5.1
|
|
||||||
widgetsnbextension==3.5.1
|
|
||||||
zipp==0.6.0
|
|
@ -2,6 +2,9 @@
|
|||||||
|
|
||||||
python setup.py install
|
python setup.py install
|
||||||
|
|
||||||
|
jupyter nbconvert --to notebook --inplace --execute docs/source/examples/demo_notebook.ipynb
|
||||||
|
jupyter nbconvert --to notebook --inplace --execute docs/source/examples/online_retail_analysis.ipynb
|
||||||
|
|
||||||
cd docs
|
cd docs
|
||||||
|
|
||||||
make clean
|
make clean
|
||||||
|
@ -2,4 +2,5 @@ elasticsearch>=7.0.5
|
|||||||
pandas==0.25.1
|
pandas==0.25.1
|
||||||
matplotlib
|
matplotlib
|
||||||
pytest>=5.2.1
|
pytest>=5.2.1
|
||||||
|
nbval
|
||||||
numpydoc==0.8
|
numpydoc==0.8
|
||||||
|
Loading…
x
Reference in New Issue
Block a user