diff --git a/README.md b/README.md index 98d9853..7f034f3 100644 --- a/README.md +++ b/README.md @@ -52,26 +52,110 @@ index pattern, and explore using an API that mirrors a subset of the pandas.Data ``` >>> import eland as ed ->>> df = ed.read_es('http://localhost:9200', 'reviews') +>>> # Connect to 'flights' index via localhost Elasticsearch node +>>> df = ed.DataFrame('localhost:9200', 'flights') >>> df.head() - reviewerId vendorId rating date -0 0 0 5 2006-04-07 17:08 -1 1 1 5 2006-05-04 12:16 -2 2 2 4 2006-04-21 12:26 -3 3 3 5 2006-04-18 15:48 -4 3 4 5 2006-04-18 15:49 + AvgTicketPrice Cancelled Carrier ... OriginWeather dayOfWeek timestamp +0 841.265642 False Kibana Airlines ... Sunny 0 2018-01-01 00:00:00 +1 882.982662 False Logstash Airways ... Clear 0 2018-01-01 18:27:00 +2 190.636904 False Logstash Airways ... Rain 0 2018-01-01 17:11:14 +3 181.694216 True Kibana Airlines ... Thunder & Lightning 0 2018-01-01 10:33:28 +4 730.041778 False Kibana Airlines ... Damaging Wind 0 2018-01-01 05:13:00 + +[5 rows x 27 columns] >>> df.describe() - reviewerId vendorId rating -count 578805.000000 578805.000000 578805.000000 -mean 174124.098437 60.645267 4.679671 -std 116951.972209 54.488053 0.800891 -min 0.000000 0.000000 0.000000 -25% 70043.000000 20.000000 5.000000 -50% 161052.000000 44.000000 5.000000 -75% 272697.000000 83.000000 5.000000 -max 400140.000000 246.000000 5.000000 + AvgTicketPrice DistanceKilometers DistanceMiles FlightDelayMin FlightTimeHour FlightTimeMin dayOfWeek +count 13059.000000 13059.000000 13059.000000 13059.000000 13059.000000 13059.000000 13059.000000 +mean 628.253689 7092.142457 4406.853010 47.335171 8.518797 511.127842 2.835975 +std 266.386661 4578.263193 2844.800855 96.743006 5.579019 334.741135 1.939365 +min 100.020531 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 +25% 410.008918 2470.545974 1535.126118 0.000000 4.194976 251.738513 1.000000 +50% 640.362667 7612.072403 4729.922470 0.000000 8.385816 503.148975 3.000000 +75% 842.254990 
9735.082407 6049.459005 15.000000 12.009396 720.534532 4.141221 +max 1199.729004 19881.482422 12353.780273 360.000000 31.715034 1902.901978 6.000000 + +>>> df[['Carrier', 'AvgTicketPrice', 'Cancelled']] + Carrier AvgTicketPrice Cancelled +0 Kibana Airlines 841.265642 False +1 Logstash Airways 882.982662 False +2 Logstash Airways 190.636904 False +3 Kibana Airlines 181.694216 True +4 Kibana Airlines 730.041778 False +... ... ... ... +13054 Logstash Airways 1080.446279 False +13055 Logstash Airways 646.612941 False +13056 Logstash Airways 997.751876 False +13057 JetBeats 1102.814465 False +13058 JetBeats 858.144337 False + +[13059 rows x 3 columns] + +>>> df[(df.Carrier=="Kibana Airlines") & (df.AvgTicketPrice > 900.0) & (df.Cancelled == True)].head() + AvgTicketPrice Cancelled Carrier ... OriginWeather dayOfWeek timestamp +8 960.869736 True Kibana Airlines ... Heavy Fog 0 2018-01-01 12:09:35 +26 975.812632 True Kibana Airlines ... Rain 0 2018-01-01 15:38:32 +311 946.358410 True Kibana Airlines ... Heavy Fog 0 2018-01-01 11:51:12 +651 975.383864 True Kibana Airlines ... Rain 2 2018-01-03 21:13:17 +950 907.836523 True Kibana Airlines ... Thunder & Lightning 2 2018-01-03 05:14:51 + +[5 rows x 27 columns] + +>>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std']) + DistanceKilometers AvgTicketPrice +sum 9.261629e+07 8.204365e+06 +min 0.000000e+00 1.000205e+02 +std 4.578263e+03 2.663867e+02 + +>>> df[['Carrier', 'Origin', 'Dest']].nunique() +Carrier 4 +Origin 156 +Dest 156 +dtype: int64 + +>>> s = df.AvgTicketPrice * 2 + df.DistanceKilometers - df.FlightDelayMin +>>> s +0 18174.857422 +1 10589.365723 +2 381.273804 +3 739.126221 +4 14818.327637 + ... 
+13054 10219.474121 +13055 8381.823975 +13056 12661.157104 +13057 20819.488281 +13058 18315.431274 +Length: 13059, dtype: float64 + +>>> print(s.info_es()) +index_pattern: flights +Index: + index_field: _id + is_source_field: False +Mappings: + capabilities: + es_field_name is_source es_dtype es_date_format pd_dtype is_searchable is_aggregatable is_scripted aggregatable_es_field_name +NaN script_field_None False double None float64 True True True script_field_None +Operations: + tasks: [] + size: None + sort_params: None + _source: ['script_field_None'] + body: {'script_fields': {'script_field_None': {'script': {'source': "(((doc['AvgTicketPrice'].value * 2) + doc['DistanceKilometers'].value) - doc['FlightDelayMin'].value)"}}}} + post_processing: [] + +>>> pd_df = ed.eland_to_pandas(df) +>>> pd_df.head() + AvgTicketPrice Cancelled Carrier ... OriginWeather dayOfWeek timestamp +0 841.265642 False Kibana Airlines ... Sunny 0 2018-01-01 00:00:00 +1 882.982662 False Logstash Airways ... Clear 0 2018-01-01 18:27:00 +2 190.636904 False Logstash Airways ... Rain 0 2018-01-01 17:11:14 +3 181.694216 True Kibana Airlines ... Thunder & Lightning 0 2018-01-01 10:33:28 +4 730.041778 False Kibana Airlines ... Damaging Wind 0 2018-01-01 05:13:00 + +[5 rows x 27 columns] ``` See [docs](https://eland.readthedocs.io/en/latest) and [demo_notebook.ipynb](https://eland.readthedocs.io/en/latest/examples/demo_notebook.html) for more examples. @@ -87,28 +171,6 @@ package index](https://pypi.org/project/eland). pip install eland ``` -## Development Setup - -1. Create a virtual environment in Python - -For example, - -``` -python3 -m venv env -``` - -2. Activate the virtual environment - -``` -source env/bin/activate -``` - -3. 
Install dependencies from the `requirements.txt` file - -``` -pip install -r requirements.txt -``` - ## Versions and Compatibility ### Python Version Support @@ -127,7 +189,48 @@ No compatibility assurances are given between different major versions of the cl Major differences likely exist between major versions of Elasticsearch, particularly around request and response object formats, but also around API urls and behaviour. -## Connecting to Elasticsearch Cloud +## Connecting to Elasticsearch + +eland uses the [Elasticsearch low-level client](https://elasticsearch-py.readthedocs.io/) to connect to Elasticsearch. +This client supports a range of +[connection options and authentication mechanisms](https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch). + +### Basic Connection Options + +``` +>>> import eland as ed + +>>> # Connect to flights index via localhost Elasticsearch node +>>> ed.DataFrame('localhost', 'flights') + +>>> # Connect to flights index via localhost Elasticsearch node on port 9200 +>>> ed.DataFrame('localhost:9200', 'flights') + +>>> # Connect to flights index via localhost Elasticsearch node on port 9200 with <user>:<password> credentials +>>> ed.DataFrame('http://<user>:<password>@localhost:9200', 'flights') + +>>> # Connect to flights index via ssl +>>> es = Elasticsearch( + 'https://<user>:<password>@localhost:443', + use_ssl=True, + verify_certs=True, + ca_certs='/path/to/ca.crt' +) +>>> ed.DataFrame(es, 'flights') + +>>> # Connect to flights index via ssl using Urllib3HttpConnection options +>>> es = Elasticsearch( + ['localhost:443', 'other_host:443'], + use_ssl=True, + verify_certs=True, + ca_certs='/path/to/CA_certs', + client_cert='/path/to/clientcert.pem', + client_key='/path/to/clientkey.pem' +) +>>> ed.DataFrame(es, 'flights') +``` + +### Connecting to an Elasticsearch Cloud Cluster ``` >>> import eland as ed diff --git a/docs/source/examples/demo_notebook.ipynb b/docs/source/examples/demo_notebook.ipynb index a396ade..2209460 100644 ---
a/docs/source/examples/demo_notebook.ipynb +++ b/docs/source/examples/demo_notebook.ipynb @@ -753,7 +753,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 17, @@ -2707,24 +2707,24 @@ " 410.008918\n", " 2470.545974\n", " ...\n", - " 251.942965\n", + " 251.944994\n", " 1.000000\n", " \n", " \n", " 50%\n", - " 640.362667\n", + " 640.387285\n", " 7612.072403\n", " ...\n", - " 503.148975\n", + " 502.986750\n", " 3.000000\n", " \n", " \n", " 75%\n", - " 842.006180\n", - " 9735.660463\n", + " 842.272763\n", + " 9735.860651\n", " ...\n", - " 720.569838\n", - " 4.243151\n", + " 720.505705\n", + " 4.246711\n", " \n", " \n", " max\n", @@ -2745,9 +2745,9 @@ "mean 628.253689 7092.142457 ... 511.127842 2.835975\n", "std 266.386661 4578.263193 ... 334.741135 1.939365\n", "min 100.020531 0.000000 ... 0.000000 0.000000\n", - "25% 410.008918 2470.545974 ... 251.942965 1.000000\n", - "50% 640.362667 7612.072403 ... 503.148975 3.000000\n", - "75% 842.006180 9735.660463 ... 720.569838 4.243151\n", + "25% 410.008918 2470.545974 ... 251.944994 1.000000\n", + "50% 640.387285 7612.072403 ... 502.986750 3.000000\n", + "75% 842.272763 9735.860651 ... 720.505705 4.246711\n", "max 1199.729004 19881.482422 ... 
1902.901978 6.000000\n", "\n", "[8 rows x 7 columns]" diff --git a/docs/source/examples/online_retail_analysis.ipynb b/docs/source/examples/online_retail_analysis.ipynb index cb65fa3..f2e8e26 100644 --- a/docs/source/examples/online_retail_analysis.ipynb +++ b/docs/source/examples/online_retail_analysis.ipynb @@ -44,7 +44,8 @@ " es_if_exists='replace', \n", " es_dropna=True,\n", " es_refresh=True,\n", - " compression='gzip')" + " compression='gzip',\n", + " index_col=0)" ] }, { @@ -90,7 +91,7 @@ "data": { "text/plain": [ "Index(['Country', 'CustomerID', 'Description', 'InvoiceDate', 'InvoiceNo', 'Quantity', 'StockCode',\n", - " 'UnitPrice', 'Unnamed: 0'],\n", + " 'UnitPrice'],\n", " dtype='object')" ] }, @@ -126,7 +127,6 @@ "Quantity int64\n", "StockCode object\n", "UnitPrice float64\n", - "Unnamed: 0 int64\n", "dtype: object" ] }, @@ -170,12 +170,11 @@ "Quantity Quantity True long None int64 True True False Quantity\n", "StockCode StockCode True keyword None object True True False StockCode\n", "UnitPrice UnitPrice True double None float64 True True False UnitPrice\n", - "Unnamed: 0 Unnamed: 0 True long None int64 True True False Unnamed: 0\n", "Operations:\n", " tasks: []\n", " size: None\n", " sort_params: None\n", - " _source: ['Country', 'CustomerID', 'Description', 'InvoiceDate', 'InvoiceNo', 'Quantity', 'StockCode', 'UnitPrice', 'Unnamed: 0']\n", + " _source: ['Country', 'CustomerID', 'Description', 'InvoiceDate', 'InvoiceNo', 'Quantity', 'StockCode', 'UnitPrice']\n", " body: {}\n", " post_processing: []\n", "\n" @@ -233,8 +232,8 @@ " Country\n", " CustomerID\n", " ...\n", + " StockCode\n", " UnitPrice\n", - " Unnamed: 0\n", " \n", " \n", " \n", @@ -243,28 +242,28 @@ " United Kingdom\n", " 14729.0\n", " ...\n", + " 21123\n", " 1.25\n", - " 1000\n", " \n", " \n", " 1001\n", " United Kingdom\n", " 14729.0\n", " ...\n", + " 21124\n", " 1.25\n", - " 1001\n", " \n", " \n", "\n", "\n", - "

2 rows × 9 columns

" + "

2 rows × 8 columns

" ], "text/plain": [ - " Country CustomerID ... UnitPrice Unnamed: 0\n", - "1000 United Kingdom 14729.0 ... 1.25 1000\n", - "1001 United Kingdom 14729.0 ... 1.25 1001\n", + " Country CustomerID ... StockCode UnitPrice\n", + "1000 United Kingdom 14729.0 ... 21123 1.25\n", + "1001 United Kingdom 14729.0 ... 21124 1.25\n", "\n", - "[2 rows x 9 columns]" + "[2 rows x 8 columns]" ] }, "execution_count": 7, @@ -300,12 +299,11 @@ "Quantity Quantity True long None int64 True True False Quantity\n", "StockCode StockCode True keyword None object True True False StockCode\n", "UnitPrice UnitPrice True double None float64 True True False UnitPrice\n", - "Unnamed: 0 Unnamed: 0 True long None int64 True True False Unnamed: 0\n", "Operations:\n", " tasks: [('tail': ('sort_field': '_doc', 'count': 2)), ('head': ('sort_field': '_doc', 'count': 2)), ('tail': ('sort_field': '_doc', 'count': 2))]\n", " size: 2\n", " sort_params: _doc:desc\n", - " _source: ['Country', 'CustomerID', 'Description', 'InvoiceDate', 'InvoiceNo', 'Quantity', 'StockCode', 'UnitPrice', 'Unnamed: 0']\n", + " _source: ['Country', 'CustomerID', 'Description', 'InvoiceDate', 'InvoiceNo', 'Quantity', 'StockCode', 'UnitPrice']\n", " body: {}\n", " post_processing: [('sort_index'), ('head': ('count': 2)), ('tail': ('count': 2))]\n", "\n" @@ -345,8 +343,8 @@ " Country\n", " CustomerID\n", " ...\n", + " StockCode\n", " UnitPrice\n", - " Unnamed: 0\n", " \n", " \n", " \n", @@ -355,28 +353,28 @@ " United Kingdom\n", " 17419.0\n", " ...\n", + " 21773\n", " 1.25\n", - " 14998\n", " \n", " \n", " 14999\n", " United Kingdom\n", " 17419.0\n", " ...\n", + " 22149\n", " 2.10\n", - " 14999\n", " \n", " \n", "\n", "\n", - "

2 rows × 9 columns

" + "

2 rows × 8 columns

" ], "text/plain": [ - " Country CustomerID ... UnitPrice Unnamed: 0\n", - "14998 United Kingdom 17419.0 ... 1.25 14998\n", - "14999 United Kingdom 17419.0 ... 2.10 14999\n", + " Country CustomerID ... StockCode UnitPrice\n", + "14998 United Kingdom 17419.0 ... 21773 1.25\n", + "14999 United Kingdom 17419.0 ... 22149 2.10\n", "\n", - "[2 rows x 9 columns]" + "[2 rows x 8 columns]" ] }, "execution_count": 9, @@ -523,8 +521,8 @@ " Country\n", " CustomerID\n", " ...\n", + " StockCode\n", " UnitPrice\n", - " Unnamed: 0\n", " \n", " \n", " \n", @@ -533,55 +531,55 @@ " Germany\n", " 12662.0\n", " ...\n", + " 22809\n", " 2.95\n", - " 1109\n", " \n", " \n", " 1110\n", " Germany\n", " 12662.0\n", " ...\n", + " 84347\n", " 2.55\n", - " 1110\n", " \n", " \n", " 1111\n", " Germany\n", " 12662.0\n", " ...\n", + " 84945\n", " 0.85\n", - " 1111\n", " \n", " \n", " 1112\n", " Germany\n", " 12662.0\n", " ...\n", + " 22242\n", " 1.65\n", - " 1112\n", " \n", " \n", " 1113\n", " Germany\n", " 12662.0\n", " ...\n", + " 22244\n", " 1.95\n", - " 1113\n", " \n", " \n", "\n", "\n", - "

5 rows × 9 columns

" + "

5 rows × 8 columns

" ], "text/plain": [ - " Country CustomerID ... UnitPrice Unnamed: 0\n", - "1109 Germany 12662.0 ... 2.95 1109\n", - "1110 Germany 12662.0 ... 2.55 1110\n", - "1111 Germany 12662.0 ... 0.85 1111\n", - "1112 Germany 12662.0 ... 1.65 1112\n", - "1113 Germany 12662.0 ... 1.95 1113\n", + " Country CustomerID ... StockCode UnitPrice\n", + "1109 Germany 12662.0 ... 22809 2.95\n", + "1110 Germany 12662.0 ... 84347 2.55\n", + "1111 Germany 12662.0 ... 84945 0.85\n", + "1112 Germany 12662.0 ... 22242 1.65\n", + "1113 Germany 12662.0 ... 22244 1.95\n", "\n", - "[5 rows x 9 columns]" + "[5 rows x 8 columns]" ] }, "execution_count": 11, @@ -638,8 +636,8 @@ " Country\n", " CustomerID\n", " ...\n", + " StockCode\n", " UnitPrice\n", - " Unnamed: 0\n", " \n", " \n", " \n", @@ -648,55 +646,55 @@ " United Kingdom\n", " 14729.0\n", " ...\n", + " 21123\n", " 1.25\n", - " 1000\n", " \n", " \n", " 1001\n", " United Kingdom\n", " 14729.0\n", " ...\n", + " 21124\n", " 1.25\n", - " 1001\n", " \n", " \n", " 1002\n", " United Kingdom\n", " 14729.0\n", " ...\n", + " 21122\n", " 1.25\n", - " 1002\n", " \n", " \n", " 1003\n", " United Kingdom\n", " 14729.0\n", " ...\n", + " 84378\n", " 1.25\n", - " 1003\n", " \n", " \n", " 1004\n", " United Kingdom\n", " 14729.0\n", " ...\n", + " 21985\n", " 0.29\n", - " 1004\n", " \n", " \n", "\n", "\n", - "

5 rows × 9 columns

" + "

5 rows × 8 columns

" ], "text/plain": [ - " Country CustomerID ... UnitPrice Unnamed: 0\n", - "1000 United Kingdom 14729.0 ... 1.25 1000\n", - "1001 United Kingdom 14729.0 ... 1.25 1001\n", - "1002 United Kingdom 14729.0 ... 1.25 1002\n", - "1003 United Kingdom 14729.0 ... 1.25 1003\n", - "1004 United Kingdom 14729.0 ... 0.29 1004\n", + " Country CustomerID ... StockCode UnitPrice\n", + "1000 United Kingdom 14729.0 ... 21123 1.25\n", + "1001 United Kingdom 14729.0 ... 21124 1.25\n", + "1002 United Kingdom 14729.0 ... 21122 1.25\n", + "1003 United Kingdom 14729.0 ... 84378 1.25\n", + "1004 United Kingdom 14729.0 ... 21985 0.29\n", "\n", - "[5 rows x 9 columns]" + "[5 rows x 8 columns]" ] }, "execution_count": 12, @@ -745,22 +743,22 @@ " Country\n", " CustomerID\n", " ...\n", + " StockCode\n", " UnitPrice\n", - " Unnamed: 0\n", " \n", " \n", " \n", " \n", "\n", "\n", - "

0 rows × 9 columns

" + "

0 rows × 8 columns

" ], "text/plain": [ "Empty DataFrame\n", - "Columns: [Country, CustomerID, Description, InvoiceDate, InvoiceNo, Quantity, StockCode, UnitPrice, Unnamed: 0]\n", + "Columns: [Country, CustomerID, Description, InvoiceDate, InvoiceNo, Quantity, StockCode, UnitPrice]\n", "Index: []\n", "\n", - "[0 rows x 9 columns]" + "[0 rows x 8 columns]" ] }, "execution_count": 13, @@ -803,12 +801,11 @@ "Quantity Quantity True long None int64 True True False Quantity\n", "StockCode StockCode True keyword None object True True False StockCode\n", "UnitPrice UnitPrice True double None float64 True True False UnitPrice\n", - "Unnamed: 0 Unnamed: 0 True long None int64 True True False Unnamed: 0\n", "Operations:\n", " tasks: [('boolean_filter': ('boolean_filter': {'bool': {'must': [{'term': {'Country': 'Germany'}}, {'range': {'Quantity': {'gt': 90}}}]}}))]\n", " size: None\n", " sort_params: None\n", - " _source: ['Country', 'CustomerID', 'Description', 'InvoiceDate', 'InvoiceNo', 'Quantity', 'StockCode', 'UnitPrice', 'Unnamed: 0']\n", + " _source: ['Country', 'CustomerID', 'Description', 'InvoiceDate', 'InvoiceNo', 'Quantity', 'StockCode', 'UnitPrice']\n", " body: {'query': {'bool': {'must': [{'term': {'Country': 'Germany'}}, {'range': {'Quantity': {'gt': 90}}}]}}}\n", " post_processing: []\n", "\n" @@ -997,7 +994,6 @@ " CustomerID\n", " Quantity\n", " UnitPrice\n", - " Unnamed: 0\n", " \n", " \n", " \n", @@ -1006,71 +1002,63 @@ " 10729.000000\n", " 15000.000000\n", " 15000.000000\n", - " 15000.000000\n", " \n", " \n", " mean\n", " 15590.776680\n", " 7.464000\n", " 4.103233\n", - " 7499.500000\n", " \n", " \n", " std\n", " 1764.025160\n", " 85.924387\n", " 20.104873\n", - " 4330.127009\n", " \n", " \n", " min\n", " 12347.000000\n", " -9360.000000\n", " 0.000000\n", - " 0.000000\n", " \n", " \n", " 25%\n", - " 14224.078193\n", + " 14225.075800\n", " 1.000000\n", " 1.250000\n", - " 3760.745049\n", " \n", " \n", " 50%\n", - " 15659.417515\n", + " 15667.359184\n", " 2.000000\n", " 
2.510000\n", - " 7499.488310\n", " \n", " \n", " 75%\n", - " 17213.978376\n", - " 6.564935\n", + " 17212.690092\n", + " 6.552523\n", " 4.210000\n", - " 11249.500000\n", " \n", " \n", " max\n", " 18239.000000\n", " 2880.000000\n", " 950.990000\n", - " 14999.000000\n", " \n", " \n", "\n", "" ], "text/plain": [ - " CustomerID Quantity UnitPrice Unnamed: 0\n", - "count 10729.000000 15000.000000 15000.000000 15000.000000\n", - "mean 15590.776680 7.464000 4.103233 7499.500000\n", - "std 1764.025160 85.924387 20.104873 4330.127009\n", - "min 12347.000000 -9360.000000 0.000000 0.000000\n", - "25% 14224.078193 1.000000 1.250000 3760.745049\n", - "50% 15659.417515 2.000000 2.510000 7499.488310\n", - "75% 17213.978376 6.564935 4.210000 11249.500000\n", - "max 18239.000000 2880.000000 950.990000 14999.000000" + " CustomerID Quantity UnitPrice\n", + "count 10729.000000 15000.000000 15000.000000\n", + "mean 15590.776680 7.464000 4.103233\n", + "std 1764.025160 85.924387 20.104873\n", + "min 12347.000000 -9360.000000 0.000000\n", + "25% 14225.075800 1.000000 1.250000\n", + "50% 15667.359184 2.000000 2.510000\n", + "75% 17212.690092 6.552523 4.210000\n", + "max 18239.000000 2880.000000 950.990000" ] }, "execution_count": 18, @@ -1171,8 +1159,8 @@ " Country\n", " CustomerID\n", " ...\n", + " StockCode\n", " UnitPrice\n", - " Unnamed: 0\n", " \n", " \n", " \n", @@ -1181,40 +1169,40 @@ " United Kingdom\n", " 15485.0\n", " ...\n", + " 22086\n", " 2.55\n", - " 1228\n", " \n", " \n", " 1237\n", " Norway\n", " 12433.0\n", " ...\n", + " 22444\n", " 1.06\n", - " 1237\n", " \n", " \n", " 1286\n", " Norway\n", " 12433.0\n", " ...\n", + " 84050\n", " 1.25\n", - " 1286\n", " \n", " \n", " 1293\n", " Norway\n", " 12433.0\n", " ...\n", + " 22197\n", " 0.85\n", - " 1293\n", " \n", " \n", " 1333\n", " United Kingdom\n", " 18144.0\n", " ...\n", + " 84879\n", " 1.69\n", - " 1333\n", " \n", " \n", " ...\n", @@ -1229,61 +1217,61 @@ " United Kingdom\n", " 15061.0\n", " ...\n", + " 22423\n", " 10.95\n", 
- " 14784\n", " \n", " \n", " 14785\n", " United Kingdom\n", " 15061.0\n", " ...\n", + " 22075\n", " 1.45\n", - " 14785\n", " \n", " \n", " 14788\n", " United Kingdom\n", " 15061.0\n", " ...\n", + " 17038\n", " 0.07\n", - " 14788\n", " \n", " \n", " 14974\n", " United Kingdom\n", " 14739.0\n", " ...\n", + " 21704\n", " 0.72\n", - " 14974\n", " \n", " \n", " 14980\n", " United Kingdom\n", " 14739.0\n", " ...\n", + " 22178\n", " 1.06\n", - " 14980\n", " \n", " \n", "\n", "\n", - "

258 rows × 9 columns

" + "

258 rows × 8 columns

" ], "text/plain": [ - " Country CustomerID ... UnitPrice Unnamed: 0\n", - "1228 United Kingdom 15485.0 ... 2.55 1228\n", - "1237 Norway 12433.0 ... 1.06 1237\n", - "1286 Norway 12433.0 ... 1.25 1286\n", - "1293 Norway 12433.0 ... 0.85 1293\n", - "1333 United Kingdom 18144.0 ... 1.69 1333\n", - "... ... ... ... ... ...\n", - "14784 United Kingdom 15061.0 ... 10.95 14784\n", - "14785 United Kingdom 15061.0 ... 1.45 14785\n", - "14788 United Kingdom 15061.0 ... 0.07 14788\n", - "14974 United Kingdom 14739.0 ... 0.72 14974\n", - "14980 United Kingdom 14739.0 ... 1.06 14980\n", + " Country CustomerID ... StockCode UnitPrice\n", + "1228 United Kingdom 15485.0 ... 22086 2.55\n", + "1237 Norway 12433.0 ... 22444 1.06\n", + "1286 Norway 12433.0 ... 84050 1.25\n", + "1293 Norway 12433.0 ... 22197 0.85\n", + "1333 United Kingdom 18144.0 ... 84879 1.69\n", + "... ... ... ... ... ...\n", + "14784 United Kingdom 15061.0 ... 22423 10.95\n", + "14785 United Kingdom 15061.0 ... 22075 1.45\n", + "14788 United Kingdom 15061.0 ... 17038 0.07\n", + "14974 United Kingdom 14739.0 ... 21704 0.72\n", + "14980 United Kingdom 14739.0 ... 22178 1.06\n", "\n", - "[258 rows x 9 columns]" + "[258 rows x 8 columns]" ] }, "execution_count": 21, @@ -1449,6 +1437,15 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.5" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "metadata": { + "collapsed": false + }, + "source": [] + } } }, "nbformat": 4, diff --git a/eland/_version.py b/eland/_version.py index 4bf8edd..4eb64b0 100644 --- a/eland/_version.py +++ b/eland/_version.py @@ -15,6 +15,6 @@ __title__ = 'eland' __description__ = 'Python elasticsearch client to analyse, explore and manipulate data that resides in elasticsearch.' __url__ = 'https://github.com/elastic/eland' -__version__ = '7.5.1a2' +__version__ = '7.5.1a3' __maintainer__ = 'Elasticsearch B.V.' 
__maintainer_email__ = 'steve.dodson@elastic.co' diff --git a/setup.py b/setup.py index 78aece2..48b7150 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,6 @@ CLASSIFIERS = [ "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", - "Programming Language :: Cython", "Topic :: Scientific/Engineering", ] @@ -52,26 +51,110 @@ index pattern, and explore using an API that mirrors a subset of the pandas.Data ``` >>> import eland as ed ->>> df = ed.read_es('http://localhost:9200', 'reviews') +>>> # Connect to 'flights' index via localhost Elasticsearch node +>>> df = ed.DataFrame('localhost:9200', 'flights') >>> df.head() - reviewerId vendorId rating date -0 0 0 5 2006-04-07 17:08 -1 1 1 5 2006-05-04 12:16 -2 2 2 4 2006-04-21 12:26 -3 3 3 5 2006-04-18 15:48 -4 3 4 5 2006-04-18 15:49 + AvgTicketPrice Cancelled Carrier ... OriginWeather dayOfWeek timestamp +0 841.265642 False Kibana Airlines ... Sunny 0 2018-01-01 00:00:00 +1 882.982662 False Logstash Airways ... Clear 0 2018-01-01 18:27:00 +2 190.636904 False Logstash Airways ... Rain 0 2018-01-01 17:11:14 +3 181.694216 True Kibana Airlines ... Thunder & Lightning 0 2018-01-01 10:33:28 +4 730.041778 False Kibana Airlines ... 
Damaging Wind 0 2018-01-01 05:13:00 + +[5 rows x 27 columns] >>> df.describe() - reviewerId vendorId rating -count 578805.000000 578805.000000 578805.000000 -mean 174124.098437 60.645267 4.679671 -std 116951.972209 54.488053 0.800891 -min 0.000000 0.000000 0.000000 -25% 70043.000000 20.000000 5.000000 -50% 161052.000000 44.000000 5.000000 -75% 272697.000000 83.000000 5.000000 -max 400140.000000 246.000000 5.000000 + AvgTicketPrice DistanceKilometers DistanceMiles FlightDelayMin FlightTimeHour FlightTimeMin dayOfWeek +count 13059.000000 13059.000000 13059.000000 13059.000000 13059.000000 13059.000000 13059.000000 +mean 628.253689 7092.142457 4406.853010 47.335171 8.518797 511.127842 2.835975 +std 266.386661 4578.263193 2844.800855 96.743006 5.579019 334.741135 1.939365 +min 100.020531 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 +25% 410.008918 2470.545974 1535.126118 0.000000 4.194976 251.738513 1.000000 +50% 640.362667 7612.072403 4729.922470 0.000000 8.385816 503.148975 3.000000 +75% 842.254990 9735.082407 6049.459005 15.000000 12.009396 720.534532 4.141221 +max 1199.729004 19881.482422 12353.780273 360.000000 31.715034 1902.901978 6.000000 + +>>> df[['Carrier', 'AvgTicketPrice', 'Cancelled']] + Carrier AvgTicketPrice Cancelled +0 Kibana Airlines 841.265642 False +1 Logstash Airways 882.982662 False +2 Logstash Airways 190.636904 False +3 Kibana Airlines 181.694216 True +4 Kibana Airlines 730.041778 False +... ... ... ... +13054 Logstash Airways 1080.446279 False +13055 Logstash Airways 646.612941 False +13056 Logstash Airways 997.751876 False +13057 JetBeats 1102.814465 False +13058 JetBeats 858.144337 False + +[13059 rows x 3 columns] + +>>> df[(df.Carrier=="Kibana Airlines") & (df.AvgTicketPrice > 900.0) & (df.Cancelled == True)].head() + AvgTicketPrice Cancelled Carrier ... OriginWeather dayOfWeek timestamp +8 960.869736 True Kibana Airlines ... Heavy Fog 0 2018-01-01 12:09:35 +26 975.812632 True Kibana Airlines ... 
Rain 0 2018-01-01 15:38:32 +311 946.358410 True Kibana Airlines ... Heavy Fog 0 2018-01-01 11:51:12 +651 975.383864 True Kibana Airlines ... Rain 2 2018-01-03 21:13:17 +950 907.836523 True Kibana Airlines ... Thunder & Lightning 2 2018-01-03 05:14:51 + +[5 rows x 27 columns] + +>>> df[['DistanceKilometers', 'AvgTicketPrice']].aggregate(['sum', 'min', 'std']) + DistanceKilometers AvgTicketPrice +sum 9.261629e+07 8.204365e+06 +min 0.000000e+00 1.000205e+02 +std 4.578263e+03 2.663867e+02 + +>>> df[['Carrier', 'Origin', 'Dest']].nunique() +Carrier 4 +Origin 156 +Dest 156 +dtype: int64 + +>>> s = df.AvgTicketPrice * 2 + df.DistanceKilometers - df.FlightDelayMin +>>> s +0 18174.857422 +1 10589.365723 +2 381.273804 +3 739.126221 +4 14818.327637 + ... +13054 10219.474121 +13055 8381.823975 +13056 12661.157104 +13057 20819.488281 +13058 18315.431274 +Length: 13059, dtype: float64 + +>>> print(s.info_es()) +index_pattern: flights +Index: + index_field: _id + is_source_field: False +Mappings: + capabilities: + es_field_name is_source es_dtype es_date_format pd_dtype is_searchable is_aggregatable is_scripted aggregatable_es_field_name +NaN script_field_None False double None float64 True True True script_field_None +Operations: + tasks: [] + size: None + sort_params: None + _source: ['script_field_None'] + body: {'script_fields': {'script_field_None': {'script': {'source': "(((doc['AvgTicketPrice'].value * 2) + doc['DistanceKilometers'].value) - doc['FlightDelayMin'].value)"}}}} + post_processing: [] + +>>> pd_df = ed.eland_to_pandas(df) +>>> pd_df.head() + AvgTicketPrice Cancelled Carrier ... OriginWeather dayOfWeek timestamp +0 841.265642 False Kibana Airlines ... Sunny 0 2018-01-01 00:00:00 +1 882.982662 False Logstash Airways ... Clear 0 2018-01-01 18:27:00 +2 190.636904 False Logstash Airways ... Rain 0 2018-01-01 17:11:14 +3 181.694216 True Kibana Airlines ... Thunder & Lightning 0 2018-01-01 10:33:28 +4 730.041778 False Kibana Airlines ... 
Damaging Wind 0 2018-01-01 05:13:00 + +[5 rows x 27 columns] ``` See [docs](https://eland.readthedocs.io/en/latest) and [demo_notebook.ipynb](https://eland.readthedocs.io/en/latest/examples/demo_notebook.html) for more examples.