diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5091e95 --- /dev/null +++ b/.gitignore @@ -0,0 +1,17 @@ +# Compiled python modules. +*.pyc + +# Setuptools distribution folder. +/dist/ + +# Python egg metadata, regenerated from source files by setuptools. +/*.egg-info + +# PyCharm files +.idea/ + +# pytest files +.pytest_cache/ + +# Ignore MacOSX files +.DS_Store diff --git a/LICENSE b/LICENSE index 261eeb9..7376ffc 100644 --- a/LICENSE +++ b/LICENSE @@ -1,201 +1,223 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ +ELASTIC LICENSE AGREEMENT - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION +PLEASE READ CAREFULLY THIS ELASTIC LICENSE AGREEMENT (THIS "AGREEMENT"), WHICH +CONSTITUTES A LEGALLY BINDING AGREEMENT AND GOVERNS ALL OF YOUR USE OF ALL OF +THE ELASTIC SOFTWARE WITH WHICH THIS AGREEMENT IS INCLUDED ("ELASTIC SOFTWARE") +THAT IS PROVIDED IN OBJECT CODE FORMAT, AND, IN ACCORDANCE WITH SECTION 2 BELOW, +CERTAIN OF THE ELASTIC SOFTWARE THAT IS PROVIDED IN SOURCE CODE FORMAT. BY +INSTALLING OR USING ANY OF THE ELASTIC SOFTWARE GOVERNED BY THIS AGREEMENT, YOU +ARE ASSENTING TO THE TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE +WITH SUCH TERMS AND CONDITIONS, YOU MAY NOT INSTALL OR USE THE ELASTIC SOFTWARE +GOVERNED BY THIS AGREEMENT. IF YOU ARE INSTALLING OR USING THE SOFTWARE ON +BEHALF OF A LEGAL ENTITY, YOU REPRESENT AND WARRANT THAT YOU HAVE THE ACTUAL +AUTHORITY TO AGREE TO THE TERMS AND CONDITIONS OF THIS AGREEMENT ON BEHALF OF +SUCH ENTITY. - 1. Definitions. +Posted Date: April 20, 2018 - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. +This Agreement is entered into by and between Elasticsearch BV ("Elastic") and +You, or the legal entity on behalf of whom You are acting (as applicable, +"You"). - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. +1. OBJECT CODE END USER LICENSES, RESTRICTIONS AND THIRD PARTY OPEN SOURCE +SOFTWARE - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. + 1.1 Object Code End User License. Subject to the terms and conditions of + Section 1.2 of this Agreement, Elastic hereby grants to You, AT NO CHARGE and + for so long as you are not in breach of any provision of this Agreement, a + License to the Basic Features and Functions of the Elastic Software. - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. + 1.2 Reservation of Rights; Restrictions. As between Elastic and You, Elastic + and its licensors own all right, title and interest in and to the Elastic + Software, and except as expressly set forth in Sections 1.1, and 2.1 of this + Agreement, no other license to the Elastic Software is granted to You under + this Agreement, by implication, estoppel or otherwise. You agree not to: (i) + reverse engineer or decompile, decrypt, disassemble or otherwise reduce any + Elastic Software provided to You in Object Code, or any portion thereof, to + Source Code, except and only to the extent any such restriction is prohibited + by applicable law, (ii) except as expressly permitted in this Agreement, + prepare derivative works from, modify, copy or use the Elastic Software Object + Code or the Commercial Software Source Code in any manner; (iii) except as + expressly permitted in Section 1.1 above, transfer, sell, rent, lease, + distribute, sublicense, loan or otherwise transfer, Elastic Software Object + Code, in whole or in part, to any third party; (iv) use Elastic Software + Object Code for providing time-sharing services, any software-as-a-service, + service bureau services or as part of an application services provider or + other service offering (collectively, "SaaS Offering") where obtaining access + to the Elastic Software or the features and functions of the Elastic Software + is a primary reason or substantial motivation for users of the SaaS Offering + to access and/or use the SaaS Offering ("Prohibited SaaS Offering"); (v) + circumvent the limitations on use of Elastic Software provided to You in + Object Code format that are imposed or preserved by any License Key, or (vi) + alter or remove any Marks and Notices in the Elastic Software. If You have any + question as to whether a specific SaaS Offering constitutes a Prohibited SaaS + Offering, or are interested in obtaining Elastic's permission to engage in + commercial or non-commercial distribution of the Elastic Software, please + contact elastic_license@elastic.co. - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. + 1.3 Third Party Open Source Software. The Commercial Software may contain or + be provided with third party open source libraries, components, utilities and + other open source software (collectively, "Open Source Software"), which Open + Source Software may have applicable license terms as identified on a website + designated by Elastic. Notwithstanding anything to the contrary herein, use of + the Open Source Software shall be subject to the license terms and conditions + applicable to such Open Source Software, to the extent required by the + applicable licensor (which terms shall not restrict the license rights granted + to You hereunder, but may contain additional rights). To the extent any + condition of this Agreement conflicts with any license to the Open Source + Software, the Open Source Software license will govern with respect to such + Open Source Software only. Elastic may also separately provide you with + certain open source software that is licensed by Elastic. Your use of such + Elastic open source software will not be governed by this Agreement, but by + the applicable open source license terms. - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. +2. COMMERCIAL SOFTWARE SOURCE CODE - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). + 2.1 Limited License. Subject to the terms and conditions of Section 2.2 of + this Agreement, Elastic hereby grants to You, AT NO CHARGE and for so long as + you are not in breach of any provision of this Agreement, a limited, + non-exclusive, non-transferable, fully paid up royalty free right and license + to the Commercial Software in Source Code format, without the right to grant + or authorize sublicenses, to prepare Derivative Works of the Commercial + Software, provided You (i) do not hack the licensing mechanism, or otherwise + circumvent the intended limitations on the use of Elastic Software to enable + features other than Basic Features and Functions or those features You are + entitled to as part of a Subscription, and (ii) use the resulting object code + only for reasonable testing purposes. - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. + 2.2 Restrictions. Nothing in Section 2.1 grants You the right to (i) use the + Commercial Software Source Code other than in accordance with Section 2.1 + above, (ii) use a Derivative Work of the Commercial Software outside of a + Non-production Environment, in any production capacity, on a temporary or + permanent basis, or (iii) transfer, sell, rent, lease, distribute, sublicense, + loan or otherwise make available the Commercial Software Source Code, in whole + or in part, to any third party. Notwithstanding the foregoing, You may + maintain a copy of the repository in which the Source Code of the Commercial + Software resides and that copy may be publicly accessible, provided that you + include this Agreement with Your copy of the repository. - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." +3. TERMINATION - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. + 3.1 Termination. This Agreement will automatically terminate, whether or not + You receive notice of such Termination from Elastic, if You breach any of its + provisions. - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. + 3.2 Post Termination. Upon any termination of this Agreement, for any reason, + You shall promptly cease the use of the Elastic Software in Object Code format + and cease use of the Commercial Software in Source Code format. For the + avoidance of doubt, termination of this Agreement will not affect Your right + to use Elastic Software, in either Object Code or Source Code formats, made + available under the Apache License Version 2.0. - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. + 3.3 Survival. Sections 1.2, 2.2. 3.3, 4 and 5 shall survive any termination or + expiration of this Agreement. - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: +4. DISCLAIMER OF WARRANTIES AND LIMITATION OF LIABILITY - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and + 4.1 Disclaimer of Warranties. TO THE MAXIMUM EXTENT PERMITTED UNDER APPLICABLE + LAW, THE ELASTIC SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, + AND ELASTIC AND ITS LICENSORS MAKE NO WARRANTIES WHETHER EXPRESSED, IMPLIED OR + STATUTORY REGARDING OR RELATING TO THE ELASTIC SOFTWARE. TO THE MAXIMUM EXTENT + PERMITTED UNDER APPLICABLE LAW, ELASTIC AND ITS LICENSORS SPECIFICALLY + DISCLAIM ALL IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE AND NON-INFRINGEMENT WITH RESPECT TO THE ELASTIC SOFTWARE, AND WITH + RESPECT TO THE USE OF THE FOREGOING. FURTHER, ELASTIC DOES NOT WARRANT RESULTS + OF USE OR THAT THE ELASTIC SOFTWARE WILL BE ERROR FREE OR THAT THE USE OF THE + ELASTIC SOFTWARE WILL BE UNINTERRUPTED. - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and + 4.2 Limitation of Liability. IN NO EVENT SHALL ELASTIC OR ITS LICENSORS BE + LIABLE TO YOU OR ANY THIRD PARTY FOR ANY DIRECT OR INDIRECT DAMAGES, + INCLUDING, WITHOUT LIMITATION, FOR ANY LOSS OF PROFITS, LOSS OF USE, BUSINESS + INTERRUPTION, LOSS OF DATA, COST OF SUBSTITUTE GOODS OR SERVICES, OR FOR ANY + SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, IN CONNECTION WITH + OR ARISING OUT OF THE USE OR INABILITY TO USE THE ELASTIC SOFTWARE, OR THE + PERFORMANCE OF OR FAILURE TO PERFORM THIS AGREEMENT, WHETHER ALLEGED AS A + BREACH OF CONTRACT OR TORTIOUS CONDUCT, INCLUDING NEGLIGENCE, EVEN IF ELASTIC + HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and +5. MISCELLANEOUS - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. + This Agreement completely and exclusively states the entire agreement of the + parties regarding the subject matter herein, and it supersedes, and its terms + govern, all prior proposals, agreements, or other communications between the + parties, oral or written, regarding such subject matter. This Agreement may be + modified by Elastic from time to time, and any such modifications will be + effective upon the "Posted Date" set forth at the top of the modified + Agreement. If any provision hereof is held unenforceable, this Agreement will + continue without said provision and be interpreted to reflect the original + intent of the parties. This Agreement and any non-contractual obligation + arising out of or in connection with it, is governed exclusively by Dutch law. + This Agreement shall not be governed by the 1980 UN Convention on Contracts + for the International Sale of Goods. All disputes arising out of or in + connection with this Agreement, including its existence and validity, shall be + resolved by the courts with jurisdiction in Amsterdam, The Netherlands, except + where mandatory law provides for the courts at another location in The + Netherlands to have jurisdiction. The parties hereby irrevocably waive any and + all claims and defenses either might otherwise have in any such action or + proceeding in any of such courts based upon any alleged lack of personal + jurisdiction, improper venue, forum non conveniens or any similar claim or + defense. A breach or threatened breach, by You of Section 2 may cause + irreparable harm for which damages at law may not provide adequate relief, and + therefore Elastic shall be entitled to seek injunctive relief without being + required to post a bond. You may not assign this Agreement (including by + operation of law in connection with a merger or acquisition), in whole or in + part to any third party without the prior written consent of Elastic, which + may be withheld or granted by Elastic in its sole and absolute discretion. + Any assignment in violation of the preceding sentence is void. Notices to + Elastic may also be sent to legal@elastic.co. - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. +6. DEFINITIONS - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. + The following terms have the meanings ascribed: - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. + 6.1 "Affiliate" means, with respect to a party, any entity that controls, is + controlled by, or which is under common control with, such party, where + "control" means ownership of at least fifty percent (50%) of the outstanding + voting shares of the entity, or the contractual right to establish policy for, + and manage the operations of, the entity. - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. + 6.2 "Basic Features and Functions" means those features and functions of the + Elastic Software that are eligible for use under a Basic license, as set forth + at https://www.elastic.co/subscriptions, as may be modified by Elastic from + time to time. - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. + 6.3 "Commercial Software" means the Elastic Software Source Code in any file + containing a header stating the contents are subject to the Elastic License or + which is contained in the repository folder labeled "x-pack", unless a LICENSE + file present in the directory subtree declares a different license. - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. + 6.4 "Derivative Work of the Commercial Software" means, for purposes of this + Agreement, any modification(s) or enhancement(s) to the Commercial Software, + which represent, as a whole, an original work of authorship. - END OF TERMS AND CONDITIONS + 6.5 "License" means a limited, non-exclusive, non-transferable, fully paid up, + royalty free, right and license, without the right to grant or authorize + sublicenses, solely for Your internal business operations to (i) install and + use the applicable Features and Functions of the Elastic Software in Object + Code, and (ii) permit Contractors and Your Affiliates to use the Elastic + software as set forth in (i) above, provided that such use by Contractors must + be solely for Your benefit and/or the benefit of Your Affiliates, and You + shall be responsible for all acts and omissions of such Contractors and + Affiliates in connection with their use of the Elastic software that are + contrary to the terms and conditions of this Agreement. - APPENDIX: How to apply the Apache License to your work. + 6.6 "License Key" means a sequence of bytes, including but not limited to a + JSON blob, that is used to enable certain features and functions of the + Elastic Software. - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. + 6.7 "Marks and Notices" means all Elastic trademarks, trade names, logos and + notices present on the Documentation as originally provided by Elastic. - Copyright [yyyy] [name of copyright owner] + 6.8 "Non-production Environment" means an environment for development, testing + or quality assurance, where software is not used for production purposes. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at + 6.9 "Object Code" means any form resulting from mechanical transformation or + translation of Source Code form, including but not limited to compiled object + code, generated documentation, and conversions to other media types. - http://www.apache.org/licenses/LICENSE-2.0 + 6.10 "Source Code" means the preferred form of computer software for making + modifications, including but not limited to software source code, + documentation source, and configuration files. - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. + 6.11 "Subscription" means the right to receive Support Services and a License + to the Commercial Software. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..9561fb1 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include README.rst diff --git a/eland/__init__.py b/eland/__init__.py index ee04949..7b48ae3 100644 --- a/eland/__init__.py +++ b/eland/__init__.py @@ -1,3 +1,4 @@ from .utils import * -from .dataframe import * +from .frame import * from .client import * +from .mappings import * \ No newline at end of file diff --git a/eland/client.py b/eland/client.py index e951d30..a3207e2 100644 --- a/eland/client.py +++ b/eland/client.py @@ -1,8 +1,9 @@ from elasticsearch import Elasticsearch -# eland client - implement as facade to control access to Elasticsearch methods -class Client(object): - +class Client(): + """ + eland client - implemented as facade to control access to Elasticsearch methods + """ def __init__(self, es=None): if isinstance(es, Elasticsearch): self.es = es @@ -17,3 +18,6 @@ class Client(object): def search(self, **kwargs): return self.es.search(**kwargs) + + def field_caps(self, **kwargs): + return self.es.field_caps(**kwargs) diff --git a/eland/frame.py b/eland/frame.py new file mode 100644 index 0000000..47b331a --- /dev/null +++ b/eland/frame.py @@ -0,0 +1,253 @@ +""" +DataFrame +--------- +An efficient 2D container for potentially mixed-type time series or other +labeled data series. + +The underlying data resides in Elasticsearch and the API aligns as much as +possible with pandas.DataFrame API. + +This allows the eland.DataFrame to access large datasets stored in Elasticsearch, +without storing the dataset in local memory. + +Implementation Details +---------------------- + +Elasticsearch indexes can be configured in many different ways, and these indexes +utilise different data structures to pandas.DataFrame. + +eland.DataFrame operations that return individual rows (e.g. df.head()) return +_source data. If _source is not enabled, this data is not accessible. + +Similarly, only Elasticsearch searchable fields can be searched or filtered, and +only Elasticsearch aggregatable fields can be aggregated or grouped. + +""" +import eland as ed + +from elasticsearch import Elasticsearch +from elasticsearch_dsl import Search + +import pandas as pd + +class DataFrame(): + """ + pandas.DataFrame like API that proxies into Elasticsearch index(es). + + Parameters + ---------- + client : eland.Client + A reference to a Elasticsearch python client + + index_pattern : str + An Elasticsearch index pattern. This can contain wildcards (e.g. filebeat-*). + + See Also + -------- + + Examples + -------- + + import eland as ed + client = ed.Client(Elasticsearch()) + df = ed.DataFrame(client, 'reviews') + df.head() + reviewerId vendorId rating date + 0 0 0 5 2006-04-07 17:08 + 1 1 1 5 2006-05-04 12:16 + 2 2 2 4 2006-04-21 12:26 + 3 3 3 5 2006-04-18 15:48 + 4 3 4 5 2006-04-18 15:49 + + Notice that the types are based on Elasticsearch mappings + + Notes + ----- + If the Elasticsearch index is deleted or index mappings are changed after this + object is created, the object is not rebuilt and so inconsistencies can occur. + + """ + def __init__(self, client, index_pattern): + self.client = ed.Client(client) + self.index_pattern = index_pattern + + # Get and persist mappings, this allows us to correctly + # map returned types from Elasticsearch to pandas datatypes + self.mappings = ed.Mappings(self.client, self.index_pattern) + + def _es_results_to_pandas(self, results): + """ + Parameters + ---------- + results: dict + Elasticsearch results from self.client.search + + Returns + ------- + df: pandas.DataFrame + _source values extracted from results and mapped to pandas DataFrame + dtypes are mapped via Mapping object + + Notes + ----- + Fields containing lists in Elasticsearch don't map easily to pandas.DataFrame + For example, an index with mapping: + ``` + "mappings" : { + "properties" : { + "group" : { + "type" : "keyword" + }, + "user" : { + "type" : "nested", + "properties" : { + "first" : { + "type" : "keyword" + }, + "last" : { + "type" : "keyword" + } + } + } + } + } + ``` + Adding a document: + ``` + "_source" : { + "group" : "amsterdam", + "user" : [ + { + "first" : "John", + "last" : "Smith" + }, + { + "first" : "Alice", + "last" : "White" + } + ] + } + ``` + (https://www.elastic.co/guide/en/elasticsearch/reference/current/nested.html) + this would be transformed internally (in Elasticsearch) into a document that looks more like this: + ``` + { + "group" : "amsterdam", + "user.first" : [ "alice", "john" ], + "user.last" : [ "smith", "white" ] + } + ``` + When mapping this a pandas data frame we mimic this transformation. + + Similarly, if a list is added to Elasticsearch: + ``` + PUT my_index/_doc/1 + { + "list" : [ + 0, 1, 2 + ] + } + ``` + The mapping is: + ``` + "mappings" : { + "properties" : { + "user" : { + "type" : "long" + } + } + } + ``` + TODO - explain how lists are handled (https://www.elastic.co/guide/en/elasticsearch/reference/current/array.html) + TODO - an option here is to use Elasticsearch's multi-field matching instead of pandas treatment of lists (which isn't great) + NOTE - using this lists is generally not a good way to use this API + """ + def flatten_dict(y): + out = {} + + def flatten(x, name=''): + # We flatten into source fields e.g. if type=geo_point + # location: {lat=52.38, lon=4.90} + if name == '': + is_source_field = False + pd_dtype = 'object' + else: + is_source_field, pd_dtype = self.mappings.is_source_field(name[:-1]) + + if not is_source_field and type(x) is dict: + for a in x: + flatten(x[a], name + a + '.') + elif not is_source_field and type(x) is list: + for a in x: + flatten(a, name) + else: + field_name = name[:-1] + + # Coerce types - for now just datetime + if pd_dtype == 'datetime64[ns]': + x = pd.to_datetime(x) + + # Elasticsearch can have multiple values for a field. These are represented as lists, so + # create lists for this pivot (see notes above) + if field_name in out: + if type(out[field_name]) is not list: + l = [out[field_name]] + out[field_name] = l + out[field_name].append(x) + else: + out[field_name] = x + + flatten(y) + + return out + + rows = [] + for hit in results['hits']['hits']: + row = hit['_source'] + + # flatten row to map correctly to 2D DataFrame + rows.append(flatten_dict(row)) + + # Create pandas DataFrame + df = pd.DataFrame(data=rows) + + return df + + def head(self, n=5): + results = self.client.search(index=self.index_pattern, size=n) + + return self._es_results_to_pandas(results) + + def describe(self): + numeric_source_fields = self.mappings.numeric_source_fields() + + # for each field we compute: + # count, mean, std, min, 25%, 50%, 75%, max + search = Search(using=self.client, index=self.index_pattern).extra(size=0) + + for field in numeric_source_fields: + search.aggs.metric('extended_stats_'+field, 'extended_stats', field=field) + search.aggs.metric('percentiles_'+field, 'percentiles', field=field) + + response = search.execute() + + results = {} + + for field in numeric_source_fields: + values = [] + values.append(response.aggregations['extended_stats_'+field]['count']) + values.append(response.aggregations['extended_stats_'+field]['avg']) + values.append(response.aggregations['extended_stats_'+field]['std_deviation']) + values.append(response.aggregations['extended_stats_'+field]['min']) + values.append(response.aggregations['percentiles_'+field]['values']['25.0']) + values.append(response.aggregations['percentiles_'+field]['values']['50.0']) + values.append(response.aggregations['percentiles_'+field]['values']['75.0']) + values.append(response.aggregations['extended_stats_'+field]['max']) + + # if not None + if (values.count(None) < len(values)): + results[field] = values + + df = pd.DataFrame(data=results, index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']) + + return df diff --git a/eland/mappings.py b/eland/mappings.py new file mode 100644 index 0000000..b96d7c1 --- /dev/null +++ b/eland/mappings.py @@ -0,0 +1,300 @@ +import warnings +import pandas as pd + +class Mappings(): + """ + General purpose to manage Elasticsearch to/from pandas mappings + + Attributes + ---------- + + mappings_capabilities: pandas.DataFrame + A data frame summarising the capabilities of the index mapping + + _source - is top level field (i.e. not a multi-field sub-field) + es_dtype - Elasticsearch field datatype + pd_dtype - Pandas datatype + searchable - is the field searchable? + aggregatable- is the field aggregatable? + _source es_dtype pd_dtype searchable aggregatable + maps-telemetry.min True long int64 True True + maps-telemetry.avg True float float64 True True + city True text object True False + user_name True keyword object True True + origin_location.lat.keyword False keyword object True True + type True keyword object True True + origin_location.lat True text object True False + + """ + def __init__(self, client, index_pattern): + """ + Parameters + ---------- + client: eland.Client + Elasticsearch client + + index_pattern: str + Elasticsearch index pattern + """ + # persist index_pattern for debugging + self.index_pattern = index_pattern + + mappings = client.indices().get_mapping(index=index_pattern) + + # Get all fields (including all nested) and then field_caps + # for these names (fields=* doesn't appear to work effectively...) + all_fields = Mappings._extract_fields_from_mapping(mappings) + all_fields_caps = client.field_caps(index=index_pattern, fields=list(all_fields.keys())) + + # Get top level (not sub-field multifield) mappings + source_fields = Mappings._extract_fields_from_mapping(mappings, source_only=True) + + # Populate capability matrix of fields + # field_name, es_dtype, pd_dtype, is_searchable, is_aggregtable, is_source + self.mappings_capabilities = Mappings._create_capability_matrix(all_fields, source_fields, all_fields_caps) + + # Cache source field types for efficient lookup + # (this massively improves performance of DataFrame.flatten) + self.source_field_pd_dtypes = {} + + for field_name in source_fields: + pd_dtype = self.mappings_capabilities.loc[field_name]['pd_dtype'] + self.source_field_pd_dtypes[field_name] = pd_dtype + + def _extract_fields_from_mapping(mappings, source_only=False): + """ + Extract all field names and types from a mapping. + ``` + { + "my_index": { + "mappings": { + "properties": { + "city": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } + } + } + } + } + } + ``` + if source_only == False: + return {'city': 'text', 'city.keyword': 'keyword'} + else: + return {'city': 'text'} + + Note: first field name type wins. E.g. + + ``` + PUT my_index1 {"mappings":{"properties":{"city":{"type":"text"}}}} + PUT my_index2 {"mappings":{"properties":{"city":{"type":"long"}}}} + + Returns {'city': 'text'} + ``` + + Parameters + ---------- + mappings: dict + Return from get_mapping + + Returns + ------- + fields: dict + Dict of field names and types + + """ + fields = {} + + # Recurse until we get a 'type: xxx' + def flatten(x, name=''): + if type(x) is dict: + for a in x: + if a == 'type' and type(x[a]) is str: # 'type' can be a name of a field + field_name = name[:-1] + field_type = x[a] + + # If there is a conflicting type, warn - first values added wins + if field_name in fields and fields[field_name] != field_type: + warnings.warn("Field {} has conflicting types {} != {}". + format(field_name, fields[field_name], field_type), + UserWarning) + else: + fields[field_name] = field_type + elif a == 'properties' or (not source_only and a == 'fields'): + flatten(x[a], name) + elif not (source_only and a == 'fields'): # ignore multi-field fields for source_only + flatten(x[a], name + a + '.') + + for index in mappings: + if 'properties' in mappings[index]['mappings']: + properties = mappings[index]['mappings']['properties'] + + flatten(properties) + + return fields + + def _create_capability_matrix(all_fields, source_fields, all_fields_caps): + """ + { + "fields": { + "rating": { + "long": { + "searchable": true, + "aggregatable": false, + "indices": ["index1", "index2"], + "non_aggregatable_indices": ["index1"] + }, + "keyword": { + "searchable": false, + "aggregatable": true, + "indices": ["index3", "index4"], + "non_searchable_indices": ["index4"] + } + }, + "title": { + "text": { + "searchable": true, + "aggregatable": false + + } + } + } + } + """ + all_fields_caps_fields = all_fields_caps['fields'] + + columns = ['_source', 'es_dtype', 'pd_dtype', 'searchable', 'aggregatable'] + capability_matrix = {} + + for field, field_caps in all_fields_caps_fields.items(): + if field in all_fields: + # v = {'long': {'type': 'long', 'searchable': True, 'aggregatable': True}} + for kk, vv in field_caps.items(): + _source = (field in source_fields) + es_dtype = vv['type'] + pd_dtype = Mappings._es_dtype_to_pd_dtype(vv['type']) + searchable = vv['searchable'] + aggregatable = vv['aggregatable'] + + caps = [_source, es_dtype, pd_dtype, searchable, aggregatable] + + capability_matrix[field] = caps + + if 'non_aggregatable_indices' in vv: + warnings.warn("Field {} has conflicting aggregatable fields across indexes {}", + format(field_name, vv['non_aggregatable_indices']), + UserWarning) + if 'non_searchable_indices' in vv: + warnings.warn("Field {} has conflicting searchable fields across indexes {}", + format(field_name, vv['non_searchable_indices']), + UserWarning) + + capability_matrix_df = pd.DataFrame.from_dict(capability_matrix, orient='index', columns=columns) + + return capability_matrix_df.sort_index() + + def _es_dtype_to_pd_dtype(es_dtype): + """ + Mapping Elasticsearch types to pandas dtypes + -------------------------------------------- + + Elasticsearch field datatype | Pandas dtype + -- + text | object + keyword | object + long, integer, short, byte, binary | int64 + double, float, half_float, scaled_float | float64 + date, date_nanos | datetime64 + boolean | bool + TODO - add additional mapping types + """ + es_dtype_to_pd_dtype = { + 'text': 'object', + 'keyword': 'object', + + 'long': 'int64', + 'integer': 'int64', + 'short': 'int64', + 'byte': 'int64', + 'binary': 'int64', + + 'double': 'float64', + 'float': 'float64', + 'half_float': 'float64', + 'scaled_float': 'float64', + + 'date': 'datetime64[ns]', + 'date_nanos': 'datetime64[ns]', + + 'boolean': 'bool' + } + + if es_dtype in es_dtype_to_pd_dtype: + return es_dtype_to_pd_dtype[es_dtype] + + # Return 'object' for all unsupported TODO - investigate how different types could be supported + return 'object' + + def all_fields(self): + """ + Returns + ------- + all_fields: list + All typed fields in the index mapping + """ + return self.mappings_capabilities.index.tolist() + + """ + def pd_dtypes_groupby_source_fields(self): + Returns + ------- + groups: dict + Calls pandas.core.groupby.GroupBy.groups for _source fields + E.g. + { + 'bool': Index(['Cancelled', 'FlightDelay'], dtype='object'), + 'datetime64[ns]': Index(['timestamp'], dtype='object'), + 'float64': Index(['AvgTicketPrice', 'DistanceKilometers', 'DistanceMiles',... + } + return self.mappings_capabilities[self.mappings_capabilities._source == True].groupby('pd_dtype').groups + + def pd_dtype + """ + + def is_source_field(self, field_name): + """ + Parameters + ---------- + field_name: str + + Returns + ------- + is_source_field: bool + Is this field name a top-level source field? + pd_dtype: str + The pandas data type we map to + """ + pd_dtype = 'object' + is_source_field = False + + if field_name in self.source_field_pd_dtypes: + is_source_field = True + pd_dtype = self.source_field_pd_dtypes[field_name] + + return is_source_field, pd_dtype + + def numeric_source_fields(self): + """ + Returns + ------- + numeric_source_fields: list of str + List of source fields where pd_dtype == (int64 or float64) + """ + return self.mappings_capabilities[(self.mappings_capabilities._source == True) & + ((self.mappings_capabilities.pd_dtype == 'int64') | + (self.mappings_capabilities.pd_dtype == 'float64'))].index.tolist() + diff --git a/eland/tests/__init__.py b/eland/tests/__init__.py new file mode 100644 index 0000000..14c8b54 --- /dev/null +++ b/eland/tests/__init__.py @@ -0,0 +1,481 @@ +import os +import pandas as pd + +ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) + +# Define test files and indices +ELASTICSEARCH_HOST = 'localhost' # TODO externalise this + +FLIGHTS_INDEX_NAME = 'flights' +FLIGHTS_MAPPING = { "mappings" : { + "properties" : { + "AvgTicketPrice" : { + "type" : "float" + }, + "Cancelled" : { + "type" : "boolean" + }, + "Carrier" : { + "type" : "keyword" + }, + "Dest" : { + "type" : "keyword" + }, + "DestAirportID" : { + "type" : "keyword" + }, + "DestCityName" : { + "type" : "keyword" + }, + "DestCountry" : { + "type" : "keyword" + }, + "DestLocation" : { + "type" : "geo_point" + }, + "DestRegion" : { + "type" : "keyword" + }, + "DestWeather" : { + "type" : "keyword" + }, + "DistanceKilometers" : { + "type" : "float" + }, + "DistanceMiles" : { + "type" : "float" + }, + "FlightDelay" : { + "type" : "boolean" + }, + "FlightDelayMin" : { + "type" : "integer" + }, + "FlightDelayType" : { + "type" : "keyword" + }, + "FlightNum" : { + "type" : "keyword" + }, + "FlightTimeHour" : { + "type" : "float" + }, + "FlightTimeMin" : { + "type" : "float" + }, + "Origin" : { + "type" : "keyword" + }, + "OriginAirportID" : { + "type" : "keyword" + }, + "OriginCityName" : { + "type" : "keyword" + }, + "OriginCountry" : { + "type" : "keyword" + }, + "OriginLocation" : { + "type" : "geo_point" + }, + "OriginRegion" : { + "type" : "keyword" + }, + "OriginWeather" : { + "type" : "keyword" + }, + "dayOfWeek" : { + "type" : "integer" + }, + "timestamp" : { + "type" : "date" + } + } + } } +FLIGHTS_FILE_NAME = ROOT_DIR + '/flights.json.gz' +FLIGHTS_DF_FILE_NAME = ROOT_DIR + '/flights_df.json.gz' + +ECOMMERCE_INDEX_NAME = 'ecommerce' +ECOMMERCE_MAPPING = { "mappings" : { + "properties" : { + "category" : { + "type" : "text", + "fields" : { + "keyword" : { + "type" : "keyword" + } + } + }, + "currency" : { + "type" : "keyword" + }, + "customer_birth_date" : { + "type" : "date" + }, + "customer_first_name" : { + "type" : "text", + "fields" : { + "keyword" : { + "type" : "keyword", + "ignore_above" : 256 + } + } + }, + "customer_full_name" : { + "type" : "text", + "fields" : { + "keyword" : { + "type" : "keyword", + "ignore_above" : 256 + } + } + }, + "customer_gender" : { + "type" : "keyword" + }, + "customer_id" : { + "type" : "keyword" + }, + "customer_last_name" : { + "type" : "text", + "fields" : { + "keyword" : { + "type" : "keyword", + "ignore_above" : 256 + } + } + }, + "customer_phone" : { + "type" : "keyword" + }, + "day_of_week" : { + "type" : "keyword" + }, + "day_of_week_i" : { + "type" : "integer" + }, + "email" : { + "type" : "keyword" + }, + "geoip" : { + "properties" : { + "city_name" : { + "type" : "keyword" + }, + "continent_name" : { + "type" : "keyword" + }, + "country_iso_code" : { + "type" : "keyword" + }, + "location" : { + "type" : "geo_point" + }, + "region_name" : { + "type" : "keyword" + } + } + }, + "manufacturer" : { + "type" : "text", + "fields" : { + "keyword" : { + "type" : "keyword" + } + } + }, + "order_date" : { + "type" : "date" + }, + "order_id" : { + "type" : "keyword" + }, + "products" : { + "properties" : { + "_id" : { + "type" : "text", + "fields" : { + "keyword" : { + "type" : "keyword", + "ignore_above" : 256 + } + } + }, + "base_price" : { + "type" : "half_float" + }, + "base_unit_price" : { + "type" : "half_float" + }, + "category" : { + "type" : "text", + "fields" : { + "keyword" : { + "type" : "keyword" + } + } + }, + "created_on" : { + "type" : "date" + }, + "discount_amount" : { + "type" : "half_float" + }, + "discount_percentage" : { + "type" : "half_float" + }, + "manufacturer" : { + "type" : "text", + "fields" : { + "keyword" : { + "type" : "keyword" + } + } + }, + "min_price" : { + "type" : "half_float" + }, + "price" : { + "type" : "half_float" + }, + "product_id" : { + "type" : "long" + }, + "product_name" : { + "type" : "text", + "fields" : { + "keyword" : { + "type" : "keyword" + } + }, + "analyzer" : "english" + }, + "quantity" : { + "type" : "integer" + }, + "sku" : { + "type" : "keyword" + }, + "tax_amount" : { + "type" : "half_float" + }, + "taxful_price" : { + "type" : "half_float" + }, + "taxless_price" : { + "type" : "half_float" + }, + "unit_discount_amount" : { + "type" : "half_float" + } + } + }, + "sku" : { + "type" : "keyword" + }, + "taxful_total_price" : { + "type" : "half_float" + }, + "taxless_total_price" : { + "type" : "half_float" + }, + "total_quantity" : { + "type" : "integer" + }, + "total_unique_products" : { + "type" : "integer" + }, + "type" : { + "type" : "keyword" + }, + "user" : { + "type" : "keyword" + } + } + } } +ECOMMERCE_FILE_NAME = ROOT_DIR + '/ecommerce.json.gz' +ECOMMERCE_DF_FILE_NAME = ROOT_DIR + '/ecommerce_df.json.gz' + +TEST_MAPPING1 = { + 'mappings': { + 'properties': { + 'city': { + 'type': 'text', + 'fields': { + 'raw': { + 'type': 'keyword' + } + } + }, + 'text': { + 'type': 'text', + 'fields': { + 'english': { + 'type': 'text', + 'analyzer': 'english' + } + } + }, + 'origin_location': { + 'properties': { + 'lat': { + 'type': 'text', + 'index_prefixes': {}, + 'fields': { + 'keyword': { + 'type': 'keyword', + 'ignore_above': 256 + } + } + }, + 'lon': { + 'type': 'text', + 'fields': { + 'keyword': { + 'type': 'keyword', + 'ignore_above': 256 + } + } + } + } + }, + 'maps-telemetry': { + 'properties': { + 'attributesPerMap': { + 'properties': { + 'dataSourcesCount': { + 'properties': { + 'avg': { + 'type': 'long' + }, + 'max': { + 'type': 'long' + }, + 'min': { + 'type': 'long' + } + } + }, + 'emsVectorLayersCount': { + 'dynamic': 'true', + 'properties': { + 'france_departments': { + 'properties': { + 'avg': { + 'type': 'float' + }, + 'max': { + 'type': 'long' + }, + 'min': { + 'type': 'long' + } + } + } + } + } + } + } + } + }, + 'type': { + 'type': 'keyword' + }, + 'name': { + 'type': 'text' + }, + 'user_name': { + 'type': 'keyword' + }, + 'email': { + 'type': 'keyword' + }, + 'content': { + 'type': 'text' + }, + 'tweeted_at': { + 'type': 'date' + }, + 'dest_location': { + 'type': 'geo_point' + }, + 'my_join_field': { + 'type': 'join', + 'relations': { + 'question': ['answer', 'comment'], + 'answer': 'vote' + } + } + } + } + } + +TEST_MAPPING1_INDEX_NAME = 'mapping1' + +TEST_MAPPING1_EXPECTED = { + 'city': 'text', + 'city.raw': 'keyword', + 'content': 'text', + 'dest_location': 'geo_point', + 'email': 'keyword', + 'maps-telemetry.attributesPerMap.dataSourcesCount.avg': 'long', + 'maps-telemetry.attributesPerMap.dataSourcesCount.max': 'long', + 'maps-telemetry.attributesPerMap.dataSourcesCount.min': 'long', + 'maps-telemetry.attributesPerMap.emsVectorLayersCount.france_departments.avg': 'float', + 'maps-telemetry.attributesPerMap.emsVectorLayersCount.france_departments.max': 'long', + 'maps-telemetry.attributesPerMap.emsVectorLayersCount.france_departments.min': 'long', + 'my_join_field': 'join', + 'name': 'text', + 'origin_location.lat': 'text', + 'origin_location.lat.keyword': 'keyword', + 'origin_location.lon': 'text', + 'origin_location.lon.keyword': 'keyword', + 'text': 'text', + 'text.english': 'text', + 'tweeted_at': 'date', + 'type': 'keyword', + 'user_name': 'keyword' +} + +TEST_MAPPING1_EXPECTED_DF = pd.DataFrame.from_dict(data=TEST_MAPPING1_EXPECTED, orient='index', columns=['es_dtype']) + +TEST_NESTED_USER_GROUP_INDEX_NAME = 'nested_user_group' +TEST_NESTED_USER_GROUP_MAPPING = { + 'mappings': { + 'properties': { + 'group': { + 'type': 'keyword' + }, + 'user': { + 'properties': { + 'first': { + 'type': 'keyword' + }, + 'last': { + 'type': 'keyword' + }, + 'address' : { + 'type' : 'keyword' + } + } + } + } +} +} + +TEST_NESTED_USER_GROUP_DOCS = [ +{'_index':TEST_NESTED_USER_GROUP_INDEX_NAME, +'_source': + {'group':'amsterdam','user':[ + {'first':'Manke','last':'Nelis','address':['Elandsgracht', 'Amsterdam']}, + {'first':'Johnny','last':'Jordaan','address':['Elandsstraat', 'Amsterdam']}]}}, +{'_index':TEST_NESTED_USER_GROUP_INDEX_NAME, +'_source': + {'group':'london','user':[ + {'first':'Alice','last':'Monkton'}, + {'first':'Jimmy','last':'White','address':['London']}]}}, +{'_index':TEST_NESTED_USER_GROUP_INDEX_NAME, +'_source':{'group':'new york','user':[ + {'first':'Bill','last':'Jones'}]}} +] + diff --git a/eland/tests/client/__init__.py b/eland/tests/client/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/eland/tests/client/test_mappings_pytest.py b/eland/tests/client/test_mappings_pytest.py new file mode 100644 index 0000000..be3a9cf --- /dev/null +++ b/eland/tests/client/test_mappings_pytest.py @@ -0,0 +1,22 @@ +# File called _pytest for PyCharm compatability +import pytest + +from eland.tests import * + +from pandas.util.testing import ( + assert_almost_equal, assert_frame_equal, assert_series_equal) + +import eland as ed + +class TestMapping(): + + # Requires 'setup_tests.py' to be run prior to this + def test_mapping(self): + mapping = ed.Mappings(ed.Client(ELASTICSEARCH_HOST), TEST_MAPPING1_INDEX_NAME) + + assert mapping.all_fields() == TEST_MAPPING1_EXPECTED_DF.index.tolist() + + assert_frame_equal(TEST_MAPPING1_EXPECTED_DF, pd.DataFrame(mapping.mappings_capabilities['es_dtype'])) + + + diff --git a/eland/tests/ecommerce.json.gz b/eland/tests/ecommerce.json.gz new file mode 100644 index 0000000..b1a5dff Binary files /dev/null and b/eland/tests/ecommerce.json.gz differ diff --git a/eland/tests/ecommerce_df.json.gz b/eland/tests/ecommerce_df.json.gz new file mode 100644 index 0000000..11f5a98 Binary files /dev/null and b/eland/tests/ecommerce_df.json.gz differ diff --git a/flights.json.gz b/eland/tests/flights.json.gz similarity index 99% rename from flights.json.gz rename to eland/tests/flights.json.gz index 85f344d..df976e6 100644 Binary files a/flights.json.gz and b/eland/tests/flights.json.gz differ diff --git a/eland/tests/flights_df.json.gz b/eland/tests/flights_df.json.gz new file mode 100644 index 0000000..5aed61e Binary files /dev/null and b/eland/tests/flights_df.json.gz differ diff --git a/eland/tests/frame/__init__.py b/eland/tests/frame/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/eland/tests/frame/common.py b/eland/tests/frame/common.py new file mode 100644 index 0000000..6b9068c --- /dev/null +++ b/eland/tests/frame/common.py @@ -0,0 +1,40 @@ +import pytest + +import eland as ed + +import pandas as pd + +import os + +ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) + +# Create pandas and eland data frames +from eland.tests import ELASTICSEARCH_HOST +from eland.tests import FLIGHTS_DF_FILE_NAME, FLIGHTS_INDEX_NAME,\ + ECOMMERCE_DF_FILE_NAME, ECOMMERCE_INDEX_NAME + +_pd_flights = pd.read_json(FLIGHTS_DF_FILE_NAME).sort_index() +_pd_flights['timestamp'] = \ + pd.to_datetime(_pd_flights['timestamp']) +_ed_flights = ed.read_es(ELASTICSEARCH_HOST, FLIGHTS_INDEX_NAME) + +_pd_ecommerce = pd.read_json(ECOMMERCE_DF_FILE_NAME).sort_index() +_pd_ecommerce['order_date'] = \ + pd.to_datetime(_pd_ecommerce['order_date']) +_pd_ecommerce['products.created_on'] = \ + _pd_ecommerce['products.created_on'].apply(lambda x: pd.to_datetime(x)) +_ed_ecommerce = ed.read_es(ELASTICSEARCH_HOST, ECOMMERCE_INDEX_NAME) + +class TestData: + + def pd_flights(self): + return _pd_flights + + def ed_flights(self): + return _ed_flights + + def pd_ecommerce(self): + return _pd_ecommerce + + def ed_ecommerce(self): + return _ed_ecommerce diff --git a/eland/tests/frame/test_indexing_pytest.py b/eland/tests/frame/test_indexing_pytest.py new file mode 100644 index 0000000..bd230f1 --- /dev/null +++ b/eland/tests/frame/test_indexing_pytest.py @@ -0,0 +1,54 @@ +# File called _pytest for PyCharm compatability +from eland.tests.frame.common import TestData +from eland.tests import * + +import eland as ed +import pandas as pd + +from pandas.util.testing import ( + assert_almost_equal, assert_frame_equal, assert_series_equal) + +class TestDataFrameIndexing(TestData): + + def test_mapping(self): + ed_flights_mappings = pd.DataFrame(self.ed_flights().mappings.mappings_capabilities + [self.ed_flights().mappings.mappings_capabilities._source==True] + ['pd_dtype']) + pd_flights_mappings = pd.DataFrame(self.pd_flights().dtypes, columns = ['pd_dtype']) + + assert_frame_equal(pd_flights_mappings, ed_flights_mappings) + + # We don't compare ecommerce here as the default dtypes in pandas from read_json + # don't match the mapping types. This is mainly because the products field is + # nested and so can be treated as a multi-field in ES, but not in pandas + + def test_head(self): + pd_flights_head = self.pd_flights().head() + ed_flights_head = self.ed_flights().head() + + assert_frame_equal(pd_flights_head, ed_flights_head) + + pd_ecommerce_head = self.pd_ecommerce().head() + ed_ecommerce_head = self.ed_ecommerce().head() + + assert_frame_equal(pd_ecommerce_head, ed_ecommerce_head) + + def test_describe(self): + pd_flights_describe = self.pd_flights().describe() + ed_flights_describe = self.ed_flights().describe() + + # TODO - this fails now as ES aggregations are approximate + # if ES percentile agg uses + # "hdr": { + # "number_of_significant_value_digits": 3 + # } + # this works + #assert_almost_equal(pd_flights_describe, ed_flights_describe) + + pd_ecommerce_describe = self.pd_ecommerce().describe() + ed_ecommerce_describe = self.ed_ecommerce().describe() + + # We don't compare ecommerce here as the default dtypes in pandas from read_json + # don't match the mapping types. This is mainly because the products field is + # nested and so can be treated as a multi-field in ES, but not in pandas + diff --git a/eland/tests/setup_tests.py b/eland/tests/setup_tests.py new file mode 100644 index 0000000..b758a26 --- /dev/null +++ b/eland/tests/setup_tests.py @@ -0,0 +1,68 @@ +import pandas as pd +from elasticsearch import Elasticsearch +from elasticsearch import helpers + +from eland.tests import * + +DATA_LIST = [ + (FLIGHTS_FILE_NAME, FLIGHTS_INDEX_NAME, FLIGHTS_MAPPING), + (ECOMMERCE_FILE_NAME, ECOMMERCE_INDEX_NAME, ECOMMERCE_MAPPING) +] + +def _setup_data(es): + # Read json file and index records into Elasticsearch + for data in DATA_LIST: + json_file_name = data[0] + index_name = data[1] + mapping = data[2] + + # Delete index + print("Deleting index:", index_name) + es.indices.delete(index=index_name, ignore=[400, 404]) + print("Creating index:", index_name) + es.indices.create(index=index_name, body=mapping) + + df = pd.read_json(json_file_name, lines=True) + + actions = [] + n = 0 + + print("Adding", df.shape[0], "items to index:", index_name) + for index, row in df.iterrows(): + values = row.to_dict() + # make timestamp datetime 2018-01-01T12:09:35 + #values['timestamp'] = datetime.strptime(values['timestamp'], '%Y-%m-%dT%H:%M:%S') + + action = {'_index': index_name, '_source': values} + + actions.append(action) + + n = n + 1 + + if n % 10000 == 0: + helpers.bulk(es, actions) + actions = [] + + helpers.bulk(es, actions) + actions = [] + + print("Done", index_name) + +def _setup_test_mappings(es): + # Create a complex mapping containing many Elasticsearch features + es.indices.delete(index=TEST_MAPPING1_INDEX_NAME, ignore=[400, 404]) + es.indices.create(index=TEST_MAPPING1_INDEX_NAME, body=TEST_MAPPING1) + +def _setup_test_nested(es): + es.indices.delete(index=TEST_NESTED_USER_GROUP_INDEX_NAME, ignore=[400, 404]) + es.indices.create(index=TEST_NESTED_USER_GROUP_INDEX_NAME, body=TEST_NESTED_USER_GROUP_MAPPING) + + helpers.bulk(es, TEST_NESTED_USER_GROUP_DOCS) + +if __name__ == '__main__': + # Create connection to Elasticsearch - use defaults + es = Elasticsearch(ELASTICSEARCH_HOST) + + _setup_data(es) + _setup_test_mappings(es) + _setup_test_nested(es) diff --git a/test.ipynb b/eland/tests/test.ipynb similarity index 89% rename from test.ipynb rename to eland/tests/test.ipynb index 6cc6f63..85de534 100644 --- a/test.ipynb +++ b/eland/tests/test.ipynb @@ -10,7 +10,11 @@ { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "pycharm": { + "is_executing": false + } + }, "outputs": [], "source": [ "import pandas as pd" @@ -442,7 +446,7 @@ "metadata": {}, "outputs": [], "source": [ - "ed_df = ed.read_es('localhost', 'kibana_sample_data_flights')" + "ed_df = ed.read_es('localhost', 'flights')" ] }, { @@ -519,7 +523,7 @@ " DE-HE\n", " Sunny\n", " 0\n", - " 2019-05-27T00:00:00\n", + " 2018-01-01 00:00:00\n", " \n", " \n", " 1\n", @@ -543,7 +547,7 @@ " SE-BD\n", " Clear\n", " 0\n", - " 2019-05-27T18:27:00\n", + " 2018-01-01 18:27:00\n", " \n", " \n", " 2\n", @@ -567,7 +571,7 @@ " IT-34\n", " Rain\n", " 0\n", - " 2019-05-27T17:11:14\n", + " 2018-01-01 17:11:14\n", " \n", " \n", " 3\n", @@ -591,7 +595,7 @@ " IT-72\n", " Thunder & Lightning\n", " 0\n", - " 2019-05-27T10:33:28\n", + " 2018-01-01 10:33:28\n", " \n", " \n", " 4\n", @@ -615,7 +619,7 @@ " MX-DIF\n", " Damaging Wind\n", " 0\n", - " 2019-05-27T05:13:00\n", + " 2018-01-01 05:13:00\n", " \n", " \n", "\n", @@ -672,12 +676,12 @@ "3 {'lat': '40.886002', 'lon': '14.2908'} IT-72 \n", "4 {'lat': '19.4363', 'lon': '-99.072098'} MX-DIF \n", "\n", - " OriginWeather dayOfWeek timestamp \n", - "0 Sunny 0 2019-05-27T00:00:00 \n", - "1 Clear 0 2019-05-27T18:27:00 \n", - "2 Rain 0 2019-05-27T17:11:14 \n", - "3 Thunder & Lightning 0 2019-05-27T10:33:28 \n", - "4 Damaging Wind 0 2019-05-27T05:13:00 \n", + " OriginWeather dayOfWeek timestamp \n", + "0 Sunny 0 2018-01-01 00:00:00 \n", + "1 Clear 0 2018-01-01 18:27:00 \n", + "2 Rain 0 2018-01-01 17:11:14 \n", + "3 Thunder & Lightning 0 2018-01-01 10:33:28 \n", + "4 Damaging Wind 0 2018-01-01 05:13:00 \n", "\n", "[5 rows x 27 columns]" ] @@ -768,12 +772,12 @@ " 2470.545974\n", " 1535.126118\n", " 0.000000\n", - " 252.064162\n", + " 251.834931\n", " 1.000000\n", " \n", " \n", " 50%\n", - " 640.387285\n", + " 640.362667\n", " 7612.072403\n", " 4729.922470\n", " 0.000000\n", @@ -782,12 +786,12 @@ " \n", " \n", " 75%\n", - " 842.259390\n", - " 9735.660463\n", - " 6049.583389\n", - " 15.000000\n", + " 842.262193\n", + " 9735.210895\n", + " 6049.600045\n", + " 12.521186\n", " 720.505705\n", - " 4.068000\n", + " 4.109848\n", " \n", " \n", " max\n", @@ -809,8 +813,8 @@ "std 266.386661 4578.263193 2844.800855 96.743006 \n", "min 100.020531 0.000000 0.000000 0.000000 \n", "25% 410.008918 2470.545974 1535.126118 0.000000 \n", - "50% 640.387285 7612.072403 4729.922470 0.000000 \n", - "75% 842.259390 9735.660463 6049.583389 15.000000 \n", + "50% 640.362667 7612.072403 4729.922470 0.000000 \n", + "75% 842.262193 9735.210895 6049.600045 12.521186 \n", "max 1199.729004 19881.482422 12353.780273 360.000000 \n", "\n", " FlightTimeMin dayOfWeek \n", @@ -818,9 +822,9 @@ "mean 511.127842 2.835975 \n", "std 334.741135 1.939365 \n", "min 0.000000 0.000000 \n", - "25% 252.064162 1.000000 \n", + "25% 251.834931 1.000000 \n", "50% 503.148975 3.000000 \n", - "75% 720.505705 4.068000 \n", + "75% 720.505705 4.109848 \n", "max 1902.901978 6.000000 " ] }, @@ -832,6 +836,89 @@ "source": [ "ed_df.describe()" ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "d = {'col1': [1.2, 20], 'col2': [int(1), int(30)], 'col3': ['2019-02-01 03:04:05', '2018-02-01 01:03:04'], 'col4': ['2019-02-01 03:04:05', '2018-02-01 01:03:04']}\n", + "df = pd.DataFrame(data=d)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col1col2col3col4
01.212019-02-01 03:04:052019-02-01 03:04:05
120.0302018-02-01 01:03:042018-02-01 01:03:04
\n", + "
" + ], + "text/plain": [ + " col1 col2 col3 col4\n", + "0 1.2 1 2019-02-01 03:04:05 2019-02-01 03:04:05\n", + "1 20.0 30 2018-02-01 01:03:04 2018-02-01 01:03:04" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -850,7 +937,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.3" } }, "nbformat": 4, diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..515fe57 --- /dev/null +++ b/setup.py @@ -0,0 +1,20 @@ +from setuptools import setup + +def readme(): + with open('README.rst') as f: + return f.read() + +setup(name='eland', + version='0.1', + description='Python elasticsearch client to analyse, explore and manipulate data that resides in elasticsearch', + url='http://github.com/elastic/eland', + author='Stephen Dodson', + author_email='sjd171@gmail.com', + license='ELASTIC LICENSE', + packages=['eland'], + install_requires=[ + 'elasticsearch', + 'elasticsearch_dsl', + 'pandas' + ], + zip_safe=False) diff --git a/test.py b/test.py deleted file mode 100644 index e73163c..0000000 --- a/test.py +++ /dev/null @@ -1,7 +0,0 @@ -import eland as ed - -df = ed.read_es('localhost', 'kibana_sample_data_flights') - -print(df.head()) - -print(df.describe())