From 6564f2624577e81c444d22766b749253d8a486f4 Mon Sep 17 00:00:00 2001 From: Stephen Dodson Date: Wed, 20 Nov 2019 10:32:35 +0000 Subject: [PATCH] Adding 'development' section to docs Adding contributing section based on Elasticsearch/CONTRIBUTING.md TODO - add testing docs (based on CI)1 --- .gitignore | 6 + NOTES.md | 58 -------- docs/source/conf.py | 5 +- docs/source/development/contributing.rst | 167 +++++++++++++++++++++++ docs/source/development/index.rst | 10 ++ docs/source/implementation/details.rst | 2 +- docs/source/index.rst | 5 + eland/__version__.py | 4 +- setup.py | 2 +- 9 files changed, 194 insertions(+), 65 deletions(-) delete mode 100644 NOTES.md create mode 100644 docs/source/development/contributing.rst create mode 100644 docs/source/development/index.rst diff --git a/.gitignore b/.gitignore index 4de1325..1e640ca 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,12 @@ build/ # docs build folder docs/build/ +# pytest results +eland/tests/dataframe/results/ +eland/tests/dataframe/results/ +result_images/ + + # Python egg metadata, regenerated from source files by setuptools. /*.egg-info diff --git a/NOTES.md b/NOTES.md deleted file mode 100644 index 7fa3635..0000000 --- a/NOTES.md +++ /dev/null @@ -1,58 +0,0 @@ -# Implementation Notes - -The goal of an `eland.DataFrame` is to enable users who are familiar with `pandas.DataFrame` -to access, explore and manipulate data that resides in Elasticsearch. - -Ideally, all data should reside in Elasticsearch and not to reside in memory. -This restricts the API, but allows access to huge data sets that do not fit into memory, and allows -use of powerful Elasticsearch features such as aggrergations. - -## Implementation Details - -### 3rd Party System Access - -Generally, integrations with [3rd party storage systems](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html) -(SQL, Google Big Query etc.) 
involve accessing these systems and reading all external data into an -in-core pandas data structure. This also applies to [Apache Arrow](https://arrow.apache.org/docs/python/pandas.html) -structures. - -Whilst this provides access to data in these systems, for large datasets this can require significant -in-core memory, and for systems such as Elasticsearch, bulk export of data can be an inefficient way -of exploring the data. - -An alternative option is to create an API that proxies `pandas.DataFrame`-like calls to Elasticsearch -queries and operations. This could allow the Elasticsearch cluster to perform operations such as -aggregations rather than exporting all the data and performing this operation in-core. - -### Implementation Options - -An option would be to replace the `pandas.DataFrame` backend in-core memory structures with Elasticsearch -accessors. This would allow full access to the `pandas.DataFrame` APIs. However, this has issues: - -* If a `pandas.DataFrame` instance maps to an index, typical manipulation of a `pandas.DataFrame` -may involve creating many derived `pandas.DataFrame` instances. Constructing an index per -`pandas.DataFrame` may result in many Elasticsearch indexes and a significant load on Elasticsearch. -For example, `df_a = df['a']` should not require Elasticsearch indices `df` and `df_a` - -* Not all `pandas.DataFrame` APIs map to things we may want to do in Elasticsearch. In particular, -API calls that involve exporting all data from Elasticsearch into memory e.g. `df.to_dict()`. - -* The backend `pandas.DataFrame` structures are not easily abstractable and are deeply embedded in -the implementation. - -Another option is to create a `eland.DataFrame` API that mimics appropriate aspects of -the `pandas.DataFrame` API. 
This resolves some of the issues above as: - -* `df_a = df['a']` could be implemented as a change to the Elasticsearch query used, rather -than a new index - -* Instead of supporting the enitre `pandas.DataFrame` API we can support a subset appropriate for -Elasticsearch. If addition calls are required, we could to create a `eland.DataFrame._to_pandas()` -method which would explicitly export all data to a `pandas.DataFrame` - -* Creating a new `eland.DataFrame` API gives us full flexibility in terms of implementation. However, -it does create a large amount of work which may duplicate a lot of the `pandas` code - for example, -printing objects etc. - this creates maintenance issues etc. - - - diff --git a/docs/source/conf.py b/docs/source/conf.py index 31b417f..400504b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -25,8 +25,7 @@ sys.path.extend( # -- Project information ----------------------------------------------------- project = 'eland' -copyright = '2019, Stephen Dodson' -author = 'Stephen Dodson' +copyright = '2019, Elasticsearch B.V.' # The full version, including alpha/beta/rc tags release = '0.1' @@ -95,4 +94,4 @@ html_theme = "pandas_sphinx_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +#html_static_path = ['_static'] diff --git a/docs/source/development/contributing.rst b/docs/source/development/contributing.rst new file mode 100644 index 0000000..02dc723 --- /dev/null +++ b/docs/source/development/contributing.rst @@ -0,0 +1,167 @@ +===================== +Contributing to eland +===================== + +Eland is an open source project and we love to receive contributions +from our community — you! 
There are many ways to contribute, from +writing tutorials or blog posts, improving the documentation, submitting +bug reports and feature requests or writing code which can be +incorporated into eland itself. + +Bug reports +----------- + +If you think you have found a bug in eland, first make sure that you are +testing against the `latest version of +eland <https://github.com/elastic/eland>`__ - your issue may already +have been fixed. If not, search our `issues +list <https://github.com/elastic/eland/issues>`__ on GitHub in case a +similar issue has already been opened. + +It is very helpful if you can prepare a reproduction of the bug. In +other words, provide a small test case which we can run to confirm your +bug. It makes it easier to find the problem and to fix it. Test cases +should be provided as Python scripts, ideally with some details of your +Elasticsearch environment and index mappings, and (where appropriate) a +pandas example. + +Provide as much information as you can. You may think that the problem +lies with your query, when actually it depends on how your data is +indexed. The easier it is for us to recreate your problem, the faster it +is likely to be fixed. + +Feature requests +---------------- + +If you find yourself wishing for a feature that doesn't exist in eland, +you are probably not alone. There are bound to be others out there with +similar needs. Many of the features that eland has today have been added +because our users saw the need. Open an issue on our `issues +list <https://github.com/elastic/eland/issues>`__ on GitHub which +describes the feature you would like to see, why you need it, and how it +should work. + +Contributing code and documentation changes +------------------------------------------- + +If you have a bugfix or new feature that you would like to contribute to +eland, please find or open an issue about it first. Talk about what you +would like to do. It may be that somebody is already working on it, or +that there are particular issues that you should know about before +implementing the change. 
+ +We enjoy working with contributors to get their code accepted. There are +many approaches to fixing a problem and it is important to find the best +approach before writing too much code. + +Note that it is unlikely the project will merge refactors for the sake +of refactoring. These types of pull requests have a high cost to +maintainers in reviewing and testing with little to no tangible benefit. +This especially includes changes generated by tools. For example, +converting all generic interface instances to use the diamond operator. + +The process for contributing to any of the `Elastic +repositories <https://github.com/elastic/>`__ is similar. Details for +individual projects can be found below. + +Fork and clone the repository +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You will need to fork the main eland code or documentation repository +and clone it to your local machine. See `GitHub help +page <https://help.github.com/articles/fork-a-repo>`__ for help. + +Further instructions for specific projects are given below. + +Submitting your changes +~~~~~~~~~~~~~~~~~~~~~~~ + +Once your changes and tests are ready to submit for review: + +1. Test your changes + + Run the test suite to make sure that nothing is broken (TODO add link + to testing doc). + +2. Sign the Contributor License Agreement + + Please make sure you have signed our `Contributor License + Agreement <https://www.elastic.co/contributor-agreement/>`__. We are + not asking you to assign copyright to us, but to give us the right to + distribute your code without restriction. We ask this of all + contributors in order to assure our users of the origin and + continuing existence of the code. You only need to sign the CLA once. + +3. Rebase your changes + + Update your local repository with the most recent code from the main + eland repository, and rebase your branch on top of the latest master + branch. We prefer your initial changes to be squashed into a single + commit. Later, if we ask you to make changes, add them as separate + commits. This makes them easier to review. 
As a final step before + merging we will either ask you to squash all commits yourself or + we'll do it for you. + +4. Submit a pull request + + Push your local changes to your forked copy of the repository and + `submit a pull + request <https://help.github.com/articles/using-pull-requests>`__. In + the pull request, choose a title which sums up the changes that you + have made, and in the body provide more details about what your + changes do. Also mention the number of the issue where discussion has + taken place, e.g. “Closes #123”. + +Then sit back and wait. There will probably be discussion about the pull +request and, if any changes are needed, we would love to work with you +to get your pull request merged into eland. + +Please adhere to the general guideline that you should never force push +to a publicly shared branch. Once you have opened your pull request, you +should consider your branch publicly shared. Instead of force pushing +you can just add incremental commits; this is generally easier on your +reviewers. If you need to pick up changes from master, you can merge +master into your branch. A reviewer might ask you to rebase a +long-running pull request in which case force pushing is okay for that +request. Note that squashing at the end of the review process should +also not be done; that can be done when the pull request is `integrated +via GitHub <https://github.com/blog/2141-squash-your-commits>`__. + +Contributing to the eland codebase +---------------------------------- + +**Repository:** https://github.com/elastic/eland + +We internally develop using the PyCharm IDE. For PyCharm, we are +currently using a minimum version of PyCharm 2019.2.4. + +Configuring PyCharm And Running Tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +(All commands should be run from module root) + +- Create a new project via 'Check out from Version Control'->'Git' + on the "Welcome to PyCharm" page (or other) +- Enter the URL to your fork of eland + (e.g. 
``git@github.com:stevedodson/eland.git``) +- Click 'Yes' for 'Checkout from Version Control' +- Configure PyCharm environment: +- In 'Preferences' configure a 'Project: eland'->'Project Interpreter'. + Generally, we recommend creating a virtual environment (TODO link to + installing for python version support). +- In 'Preferences' set 'Tools'->'Python Integrated Tools'->'Default + test runner' to ``pytest`` +- In 'Preferences' set 'Tools'->'Python Integrated Tools'->'Docstring + format' to ``numpy`` +- Install development requirements. Open a terminal in the virtual + environment and run ``pip install -r requirements-dev.txt`` +- Set up an Elasticsearch instance (assumes ``localhost:9200``), and run + ``python -m eland.tests.setup_tests`` to set up the test environment - + *note this modifies Elasticsearch indices* +- Run ``pytest --doctest-modules`` to validate the install + +Documentation +~~~~~~~~~~~~~ + +- Install documentation requirements. Open a terminal in the virtual + environment and run ``pip install -r requirements-dev.txt`` diff --git a/docs/source/development/index.rst b/docs/source/development/index.rst new file mode 100644 index 0000000..6347536 --- /dev/null +++ b/docs/source/development/index.rst @@ -0,0 +1,10 @@ +.. _development: + +=========== +Development +=========== + +.. toctree:: + :maxdepth: 2 + + contributing.rst diff --git a/docs/source/implementation/details.rst b/docs/source/implementation/details.rst index 79c33d4..7149bfd 100644 --- a/docs/source/implementation/details.rst +++ b/docs/source/implementation/details.rst @@ -9,7 +9,7 @@ to access, explore and manipulate data that resides in Elasticsearch. Ideally, all data should reside in Elasticsearch and not to reside in memory. This restricts the API, but allows access to huge data sets that do not fit into memory, and allows -use of powerful Elasticsearch features such as aggrergations. +use of powerful Elasticsearch features such as aggregations. 
Pandas and 3rd Party Storage Systems diff --git a/docs/source/index.rst b/docs/source/index.rst index 578bc75..943eaee 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -24,6 +24,7 @@ In general, the data resides in elasticsearch and not in memory, which allows el reference/index implementation/index + development/index * :doc:`reference/index` @@ -38,3 +39,7 @@ In general, the data resides in elasticsearch and not in memory, which allows el * :doc:`implementation/details` * :doc:`implementation/dataframe_supported` + +* :doc:`development/index` + + * :doc:`development/contributing` diff --git a/eland/__version__.py b/eland/__version__.py index b3205f8..d124969 100644 --- a/eland/__version__.py +++ b/eland/__version__.py @@ -1,6 +1,6 @@ __title__ = 'eland' __description__ = 'Python elasticsearch client to analyse, explore and manipulate data that resides in elasticsearch.' -__url__ = 'https://github.com/elastic/app-search-python' -__version__ = '0.1' +__url__ = 'https://github.com/elastic/eland' +__version__ = '0.1a1' __maintainer__ = 'Elasticsearch B.V.' __maintainer_email__ = 'steve.dodson@elastic.co' diff --git a/setup.py b/setup.py index 8ea831e..899b814 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ setup( maintainer_email=about['__maintainer_email__'], license='Apache 2.0', classifiers=[ - 'Development Status :: 4 - Beta', + 'Development Status :: 3 - Alpha', 'Intended Audience :: Developers', 'License :: OSI Approved :: Apache Software License', 'Programming Language :: Python :: 3.7',