Added example notebooks + pytest for notebooks (#87)

* Added example notebooks + pytest for these notebooks

* Fixed paths

* Fixed link in docs

* Added cleaner demo_notebook
stevedodson 2019-12-10 15:27:13 +01:00 committed by GitHub
parent 206276c5fa
commit 133b227b93
20 changed files with 5105 additions and 12376 deletions

View File

@@ -154,9 +154,9 @@ currently using a minimum version of PyCharm 2019.2.4.
 - Setup Elasticsearch instance (assumes `localhost:9200`), and run
   `python -m eland.tests.setup_tests` to setup test environment -*note
   this modifies Elasticsearch indices*
-- Run `pytest --doctest-modules` to validate install
+- Run `pytest --nbval --doctest-modules` to validate install

 ### Documentation

 - Install documentation requirements. Open terminal in virtual
-  environment and run `pip install -r requirements-dev.txt`
+  environment and run `pip install -r docs/requirements-docs.txt`
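For context, `--nbval` re-executes the committed example notebooks under pytest and compares each cell's output against what is stored in the `.ipynb` file, while `--doctest-modules` collects examples embedded in module docstrings (such as the `info_es` doctest further down in this diff). A minimal sketch of the docstring style the latter validates, using a hypothetical function that is not part of this commit:

```python
def squared(n):
    """
    Square an integer.

    Examples
    --------
    >>> squared(4)
    16
    """
    return n * n
```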

View File

@@ -4,3 +4,4 @@ matplotlib
 pytest>=5.2.1
 git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master
 numpydoc==0.8
+nbsphinx

View File

@@ -55,6 +55,7 @@ extensions = [
     'numpydoc',
     "matplotlib.sphinxext.plot_directive",
     "sphinx.ext.todo",
+    "nbsphinx",
 ]

 doctest_global_setup = '''

@@ -91,7 +92,7 @@ templates_path = ['_templates']

 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = []
+exclude_patterns = ['**.ipynb_checkpoints']

 # -- Options for HTML output -------------------------------------------------
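Taken together, these two `conf.py` changes register `nbsphinx` so Sphinx renders the new `.ipynb` examples as documentation pages, and exclude Jupyter's autosave directories so stale checkpoint copies are not built as duplicate documents. A minimal sketch of just the notebook-related settings (all other extensions and options elided):

```python
# conf.py -- notebook-related settings only (sketch, not the full file)
extensions = [
    "nbsphinx",  # render .ipynb files as documentation pages
]

# ignore Jupyter autosave copies such as
# examples/.ipynb_checkpoints/demo_notebook-checkpoint.ipynb
exclude_patterns = ['**.ipynb_checkpoints']
```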

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,11 @@
+.. _examples:
+
+========
+Examples
+========
+
+.. toctree::
+   :maxdepth: 2
+
+   demo_notebook
+   online_retail_analysis

File diff suppressed because one or more lines are too long

View File

@@ -25,6 +25,8 @@ In general, the data resides in elasticsearch and not in memory, which allows el
    reference/index
    implementation/index
    development/index
+   examples/index

 * :doc:`reference/index`

@@ -43,3 +45,6 @@ In general, the data resides in elasticsearch and not in memory, which allows el
 * :doc:`development/index`

   * :doc:`development/contributing`

+* :doc:`examples/index`

View File

@@ -312,8 +312,7 @@ class DataFrame(NDFrame):
             max_rows = min_rows
             return self.to_html(max_rows=max_rows, max_cols=max_cols,
-                                show_dimensions=show_dimensions, notebook=True,
-                                bold_rows=False)  # set for consistency with pandas output
+                                show_dimensions=show_dimensions, notebook=True)
         else:
             return None
@@ -384,20 +383,35 @@ class DataFrame(NDFrame):
 index_field: _id
 is_source_field: False
 Mappings:
-capabilities: _source es_dtype pd_dtype searchable aggregatable
-AvgTicketPrice True float float64 True True
-Cancelled True boolean bool True True
-Carrier True keyword object True True
-Dest True keyword object True True
-DestAirportID True keyword object True True
-... ... ... ... ... ...
-OriginLocation True geo_point object True True
-OriginRegion True keyword object True True
-OriginWeather True keyword object True True
-dayOfWeek True integer int64 True True
-timestamp True date datetime64[ns] True True
-<BLANKLINE>
-[27 rows x 5 columns]
+capabilities: _source es_dtype pd_dtype searchable aggregatable
+AvgTicketPrice True float float64 True True
+Cancelled True boolean bool True True
+Carrier True keyword object True True
+Dest True keyword object True True
+DestAirportID True keyword object True True
+DestCityName True keyword object True True
+DestCountry True keyword object True True
+DestLocation True geo_point object True True
+DestRegion True keyword object True True
+DestWeather True keyword object True True
+DistanceKilometers True float float64 True True
+DistanceMiles True float float64 True True
+FlightDelay True boolean bool True True
+FlightDelayMin True integer int64 True True
+FlightDelayType True keyword object True True
+FlightNum True keyword object True True
+FlightTimeHour True float float64 True True
+FlightTimeMin True float float64 True True
+Origin True keyword object True True
+OriginAirportID True keyword object True True
+OriginCityName True keyword object True True
+OriginCountry True keyword object True True
+OriginLocation True geo_point object True True
+OriginRegion True keyword object True True
+OriginWeather True keyword object True True
+dayOfWeek True integer int64 True True
+timestamp True date datetime64[ns] True True
+date_fields_format: {}
 Operations:
 tasks: [('boolean_filter': ('boolean_filter': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}})), ('tail': ('sort_field': '_doc', 'count': 5))]
 size: 5
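For orientation, the doctest output above corresponds to a session like the following sketch; the `localhost` host and `flights` index are assumed from eland's test setup, and the filter mirrors the `boolean_filter` shown in the `tasks` line:

```python
import eland as ed

# attach an eland DataFrame to the flights demo index (host/index assumed)
df = ed.DataFrame('localhost', 'flights')

# delayed flights departing AMS, last five documents, matching the tasks above
df = df[(df.OriginAirportID == 'AMS') & (df.FlightDelayMin > 60)].tail(5)

# info_es() reports the Elasticsearch-side state: index pattern, mappings, operations
print(df.info_es())
```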

View File

@@ -541,4 +541,5 @@ class Mappings:
     def info_es(self, buf):
         buf.write("Mappings:\n")
-        buf.write(" capabilities: {0}\n".format(self._mappings_capabilities))
+        buf.write(" capabilities: {0}\n".format(self._mappings_capabilities.to_string()))
+        buf.write(" date_fields_format: {0}\n".format(self._date_fields_format))

View File

@@ -564,7 +564,7 @@ class ElandQueryCompiler:
             raise ValueError(
                 "Can not perform arithmetic operations on non aggregatable fields. "
                 "One of [{}, {}] is not aggregatable.".format(self_field, right_field)
-        )
+            )

     def arithmetic_op_fields(self, new_field_name, op, left_field, right_field, op_type=None):
         result = self.copy()

@@ -667,6 +667,7 @@ class ElandQueryCompiler:
         buf.write("'field_to_display_names': {}\n".format(self._field_to_display_names))
         buf.write("'display_to_field_names': {}\n".format(self._display_to_field_names))


 def elasticsearch_date_to_pandas_date(value: Union[int, str], date_format: str) -> pd.Timestamp:
     """
     Given a specific Elasticsearch format for a date datatype, returns the
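For context on the error path above: eland evaluates Series arithmetic inside Elasticsearch, which, as the message says, only works on aggregatable fields (fields backed by doc values). A hedged sketch of the behaviour, assuming the flights demo index; whether a given field is aggregatable depends entirely on its mapping:

```python
import eland as ed

df = ed.DataFrame('localhost', 'flights')

# both operands aggregatable (numeric doc values): fine
total_delay = df['FlightTimeMin'] + df['FlightDelayMin']

# if either operand were an analysed 'text' field (not aggregatable),
# the check above would raise:
#   ValueError: Can not perform arithmetic operations on non aggregatable fields. ...
```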

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -280,6 +280,7 @@ def read_csv(filepath_or_buffer,
     kwds = dict()
     kwds.update(
         sep=sep,
+        delimiter=delimiter,
         engine=engine,
         dialect=dialect,
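The forwarded `delimiter=` kwarg matters because pandas treats `delimiter` as an alias for `sep` in `read_csv`; before this change, a `delimiter` argument passed to `eland.read_csv` never reached the underlying parser. A small pandas-only demonstration of the aliasing that is now passed through:

```python
import pandas as pd
from io import StringIO

tsv = StringIO("invoice_no\tquantity\n536365\t6\n")

# delimiter is an alias for sep; eland.read_csv now forwards it unchanged
print(pd.read_csv(tsv, delimiter="\t"))
```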

File diff suppressed because one or more lines are too long

View File

@@ -1,17 +0,0 @@
# Example Walkthrough for eland

This example demonstrates the functionality of `eland` through a walkthrough of a simple analysis of the [Online Retail Dataset](https://archive.ics.uci.edu/ml/datasets/online+retail).

To run this example, make sure you have an Elasticsearch cluster running on port 9200, and install the additional dependencies required on top of `eland`:

```
pip install -r requirements-example.txt
```

Once these requirements are satisfied, load the data using the provided script:

```
python load.py
```

This will create an index called `online-retail` with a mapping defined in `load.py`.

View File

@@ -1,149 +0,0 @@
# Copyright 2019 Elasticsearch BV
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import csv
from elasticsearch import Elasticsearch, helpers
from elasticsearch.exceptions import TransportError
def create_index(es, index):
mapping = {
"mappings": {
"properties": {
"invoice_no": {"type": "keyword"},
"stock_code": {"type": "keyword"},
"description": {"type": "keyword"},
"quantity": {"type": "integer"},
"invoice_date": {"type": "date", "format": "MM/dd/yyyy HH:mm"},
"unit_price": {"type": "float"},
"customer_id": {"type": "keyword"},
"country": {"type": "keyword"}
}
}
}
# create an empty index
try:
es.indices.create(index=index, body=mapping)
except TransportError as e:
# ignore already existing index
if e.error == "resource_already_exists_exception":
pass
else:
raise
def parse_date(date):
"""
we need to convert dates to conform to the mapping in the following way:
months: one or two digit ints -> MM
days: one or two digit ints -> dd
years: two digit ints -> yyyy
times: {H}H:mm -> HH:mm
"""
date = date.split("/")
month = date[0] if len(date[0]) == 2 else "0{}".format(date[0])
day = date[1] if len(date[1]) == 2 else "0{}".format(date[1])
year = date[2].split(" ")[0]
year = "20{}".format(year)
time = date[2].split(" ")[1]
time = time if len(time) == 5 else "0{}".format(time)
date = "{}/{}/{} {}".format(month, day, year, time)
return date
def parse_line(line):
"""
creates the document to be indexed
"""
obj = {
"invoice_no": line[0],
"stock_code": line[1],
"description": line[2],
"quantity": line[3],
"invoice_date": parse_date(line[4]),
"unit_price": line[5],
"customer_id": line[6],
"country": line[7].replace("\n", "")
}
return obj
def load_data(es):
"""
generate one document per line of online-retail.csv
read file line by line to avoid loading all data into memory
"""
create_index(es, "online-retail")
header = True
with open("data/online-retail.csv", "r") as f:
reader = csv.reader(f, quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL)
for line in reader:
if header:
header = False
continue
doc = parse_line(line)
yield doc
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"-H",
"--host",
action="store",
default="localhost:9200",
help="The elasticsearch host you wish to connect to. (Default: localhost:9200)"
)
args = parser.parse_args()
# create the elasticsearch client, pointing to the host parameter
es = Elasticsearch(args.host)
index = 'online-retail'
# load data from online retail csv in data directory
stream = load_data(es)
for ok, result in helpers.streaming_bulk(
es,
actions=stream,
index=index,
chunk_size=1000
):
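        # each result is a single-key dict like {"index": {"_id": ..., "status": ...}};
        # popitem() unpacks the action name and its per-document metadata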
action, result = result.popitem()
doc_id = "/{}/doc/{}".format(index, result['_id'])
if not ok:
print("Failed to {} document {} {}".format(action, doc_id, result))
else:
print(doc_id)
# make docs available for searches
es.indices.refresh(index=index)
# notify user of number of documents indexed
print(es.count(index=index)["count"], "documents in index")

View File

@@ -1,80 +0,0 @@
alabaster==0.7.12
appnope==0.1.0
atomicwrites==1.3.0
attrs==19.3.0
Babel==2.7.0
backcall==0.1.0
bleach==3.1.0
certifi==2019.9.11
chardet==3.0.4
cycler==0.10.0
decorator==4.4.1
defusedxml==0.6.0
docutils==0.15.2
eland==0.1
elasticsearch==7.1.0
entrypoints==0.3
idna==2.8
imagesize==1.1.0
importlib-metadata==0.23
ipykernel==5.1.3
ipython==7.9.0
ipython-genutils==0.2.0
ipywidgets==7.5.1
jedi==0.15.1
Jinja2==2.10.3
jsonschema==3.1.1
jupyter==1.0.0
jupyter-client==5.3.4
jupyter-console==6.0.0
jupyter-core==4.6.1
kiwisolver==1.1.0
MarkupSafe==1.1.1
matplotlib==3.1.1
mistune==0.8.4
more-itertools==7.2.0
nbconvert==5.6.1
nbformat==4.4.0
notebook==6.0.2
numpy==1.17.4
numpydoc==0.8.0
packaging==19.2
pandas==0.25.1
pandocfilters==1.4.2
parso==0.5.1
pexpect==4.7.0
pickleshare==0.7.5
pluggy==0.13.0
prometheus-client==0.7.1
prompt-toolkit==2.0.10
ptyprocess==0.6.0
py==1.8.0
Pygments==2.4.2
pyparsing==2.4.5
pyrsistent==0.15.5
pytest==5.2.2
python-dateutil==2.8.1
pytz==2019.3
pyzmq==18.1.1
qtconsole==4.5.5
requests==2.22.0
Send2Trash==1.5.0
six==1.13.0
snowballstemmer==2.0.0
Sphinx==2.2.1
sphinx-rtd-theme==0.4.3
sphinxcontrib-applehelp==1.0.1
sphinxcontrib-devhelp==1.0.1
sphinxcontrib-htmlhelp==1.0.2
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.2
sphinxcontrib-serializinghtml==1.1.3
terminado==0.8.3
testpath==0.4.4
tornado==6.0.3
traitlets==4.3.3
urllib3==1.25.7
wcwidth==0.1.7
webencodings==0.5.1
widgetsnbextension==3.5.1
zipp==0.6.0

View File

@@ -2,6 +2,9 @@
 python setup.py install
+jupyter nbconvert --to notebook --inplace --execute docs/source/examples/demo_notebook.ipynb
+jupyter nbconvert --to notebook --inplace --execute docs/source/examples/online_retail_analysis.ipynb
 cd docs
 make clean
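These `nbconvert` invocations re-execute both example notebooks in place, so the outputs committed to the repo stay current before the docs build. For reference, a rough Python equivalent using nbconvert's `ExecutePreprocessor` (the 600-second timeout is an assumption, not taken from this commit):

```python
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor

def execute_inplace(path, workdir):
    """Re-run a notebook and write the refreshed outputs back to the same file."""
    nb = nbformat.read(path, as_version=4)
    ExecutePreprocessor(timeout=600).preprocess(nb, {"metadata": {"path": workdir}})
    nbformat.write(nb, path)

for name in ("demo_notebook", "online_retail_analysis"):
    execute_inplace("docs/source/examples/{}.ipynb".format(name), "docs/source/examples")
```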

View File

@@ -2,4 +2,5 @@ elasticsearch>=7.0.5
 pandas==0.25.1
 matplotlib
 pytest>=5.2.1
+nbval
 numpydoc==0.8