Feature/python 3.5 (#93)

* Adding Python 3.5 compatibility.

The main issue is the ordering of dictionaries.

* Updating notebooks with 3.7 results.

* Removing temporary code.

* Defaulting to OrderedDict for Python 3.5 + lint all code

All code reformatted by PyCharm and inspection results analysed.
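For context: dict insertion order is only guaranteed by the language from Python 3.7 (CPython 3.6 preserves it as an implementation detail), so on 3.5 any mapping built with a plain dict can iterate in a different order per process. A minimal illustration of the pattern this commit standardises on (field names here are hypothetical):

from collections import OrderedDict

# A plain dict on Python 3.5 may iterate in arbitrary order;
# OrderedDict guarantees insertion order on every supported version.
fields = OrderedDict()
fields['Carrier'] = 'keyword'
fields['DestRegion'] = 'keyword'
fields['timestamp'] = 'date'

assert list(fields) == ['Carrier', 'DestRegion', 'timestamp']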
This commit is contained in:
stevedodson 2019-12-11 14:27:35 +01:00 committed by GitHub
parent 9a2d55f3c8
commit c5730e6d38
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
35 changed files with 664 additions and 442 deletions

View File

@ -140,7 +140,11 @@
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -166,7 +170,11 @@
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -199,7 +207,11 @@
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -230,7 +242,11 @@
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -268,7 +284,11 @@
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -421,7 +441,11 @@
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -581,7 +605,11 @@
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -601,7 +629,11 @@
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -628,7 +660,11 @@
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -648,7 +684,11 @@
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -677,7 +717,11 @@
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -700,12 +744,16 @@
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"<eland.index.Index at 0x11214bfd0>"
"<eland.index.Index at 0x12036ef90>"
]
},
"execution_count": 17,
@ -721,7 +769,11 @@
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -750,7 +802,11 @@
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -782,7 +838,11 @@
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"name": "stdout",
@ -1023,7 +1083,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## DataFrame.tail"
"### DataFrame.tail"
]
},
{
@ -1242,7 +1302,11 @@
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -1268,7 +1332,11 @@
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -1301,7 +1369,11 @@
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -1332,7 +1404,11 @@
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -1363,7 +1439,11 @@
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -1487,7 +1567,11 @@
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"name": "stdout",
@ -1514,7 +1598,11 @@
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -1676,7 +1764,11 @@
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -1836,7 +1928,11 @@
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -1991,7 +2087,11 @@
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -2160,7 +2260,11 @@
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -2233,7 +2337,11 @@
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -2313,7 +2421,11 @@
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -2344,7 +2456,11 @@
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -2382,7 +2498,11 @@
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -2515,7 +2635,11 @@
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -2580,15 +2704,15 @@
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>409.983219</td>\n",
" <td>410.008918</td>\n",
" <td>2470.545974</td>\n",
" <td>...</td>\n",
" <td>251.738513</td>\n",
" <td>251.944994</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>640.387285</td>\n",
" <td>640.362667</td>\n",
" <td>7612.072403</td>\n",
" <td>...</td>\n",
" <td>503.148975</td>\n",
@ -2596,11 +2720,11 @@
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>842.255395</td>\n",
" <td>9735.860651</td>\n",
" <td>842.254990</td>\n",
" <td>9735.660463</td>\n",
" <td>...</td>\n",
" <td>720.561564</td>\n",
" <td>4.230496</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
@ -2621,9 +2745,9 @@
"mean 628.253689 7092.142457 ... 511.127842 2.835975\n",
"std 266.386661 4578.263193 ... 334.741135 1.939365\n",
"min 100.020531 0.000000 ... 0.000000 0.000000\n",
"25% 409.983219 2470.545974 ... 251.738513 1.000000\n",
"50% 640.387285 7612.072403 ... 503.148975 3.000000\n",
"75% 842.255395 9735.860651 ... 720.561564 4.230496\n",
"25% 410.008918 2470.545974 ... 251.944994 1.000000\n",
"50% 640.362667 7612.072403 ... 503.148975 3.000000\n",
"75% 842.254990 9735.660463 ... 720.561564 4.000000\n",
"max 1199.729004 19881.482422 ... 1902.901978 6.000000\n",
"\n",
"[8 rows x 7 columns]"
@ -2649,7 +2773,11 @@
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"name": "stdout",
@ -2697,7 +2825,11 @@
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"name": "stdout",
@ -2759,7 +2891,11 @@
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -2795,7 +2931,11 @@
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -2831,7 +2971,11 @@
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -2860,7 +3004,11 @@
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -2896,7 +3044,11 @@
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -2925,7 +3077,11 @@
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -2961,7 +3117,11 @@
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -2990,7 +3150,11 @@
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -3026,7 +3190,11 @@
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -3049,7 +3217,11 @@
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -3079,7 +3251,11 @@
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -3103,7 +3279,7 @@
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Carrier</th>\n",
" <th>DestLocation</th>\n",
" <th>DestRegion</th>\n",
" <th>...</th>\n",
" <th>dayOfWeek</th>\n",
" <th>timestamp</th>\n",
@ -3113,7 +3289,7 @@
" <tr>\n",
" <th>0</th>\n",
" <td>Kibana Airlines</td>\n",
" <td>{'lat': '-33.94609833', 'lon': '151.177002'}</td>\n",
" <td>SE-BD</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2018-01-01 00:00:00</td>\n",
@ -3121,7 +3297,7 @@
" <tr>\n",
" <th>1</th>\n",
" <td>Logstash Airways</td>\n",
" <td>{'lat': '45.505299', 'lon': '12.3519'}</td>\n",
" <td>IT-34</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2018-01-01 18:27:00</td>\n",
@ -3129,7 +3305,7 @@
" <tr>\n",
" <th>2</th>\n",
" <td>Logstash Airways</td>\n",
" <td>{'lat': '45.505299', 'lon': '12.3519'}</td>\n",
" <td>IT-34</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2018-01-01 17:11:14</td>\n",
@ -3137,7 +3313,7 @@
" <tr>\n",
" <th>3</th>\n",
" <td>Kibana Airlines</td>\n",
" <td>{'lat': '45.648399', 'lon': '12.1944'}</td>\n",
" <td>IT-34</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2018-01-01 10:33:28</td>\n",
@ -3145,7 +3321,7 @@
" <tr>\n",
" <th>4</th>\n",
" <td>Kibana Airlines</td>\n",
" <td>{'lat': '34.447102', 'lon': '108.751999'}</td>\n",
" <td>SE-BD</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2018-01-01 05:13:00</td>\n",
@ -3161,7 +3337,7 @@
" <tr>\n",
" <th>13054</th>\n",
" <td>Logstash Airways</td>\n",
" <td>{'lat': '34.447102', 'lon': '108.751999'}</td>\n",
" <td>SE-BD</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>2018-02-11 20:42:25</td>\n",
@ -3169,7 +3345,7 @@
" <tr>\n",
" <th>13055</th>\n",
" <td>Logstash Airways</td>\n",
" <td>{'lat': '47.464699', 'lon': '8.54917'}</td>\n",
" <td>CH-ZH</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>2018-02-11 01:41:57</td>\n",
@ -3177,7 +3353,7 @@
" <tr>\n",
" <th>13056</th>\n",
" <td>Logstash Airways</td>\n",
" <td>{'lat': '51.169997', 'lon': '128.445007'}</td>\n",
" <td>RU-AMU</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>2018-02-11 04:09:27</td>\n",
@ -3185,7 +3361,7 @@
" <tr>\n",
" <th>13057</th>\n",
" <td>JetBeats</td>\n",
" <td>{'lat': '-34.8222', 'lon': '-58.5358'}</td>\n",
" <td>SE-BD</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>2018-02-11 08:28:21</td>\n",
@ -3193,44 +3369,31 @@
" <tr>\n",
" <th>13058</th>\n",
" <td>JetBeats</td>\n",
" <td>{'lat': '38.94449997', 'lon': '-77.45580292'}</td>\n",
" <td>US-DC</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>2018-02-11 14:54:34</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>13059 rows × 21 columns</p>\n",
"<p>13059 rows × 20 columns</p>\n",
"</div>"
],
"text/plain": [
" Carrier DestLocation ... dayOfWeek \\\n",
"0 Kibana Airlines {'lat': '-33.94609833', 'lon': '151.177002'} ... 0 \n",
"1 Logstash Airways {'lat': '45.505299', 'lon': '12.3519'} ... 0 \n",
"2 Logstash Airways {'lat': '45.505299', 'lon': '12.3519'} ... 0 \n",
"3 Kibana Airlines {'lat': '45.648399', 'lon': '12.1944'} ... 0 \n",
"4 Kibana Airlines {'lat': '34.447102', 'lon': '108.751999'} ... 0 \n",
"... ... ... ... ... \n",
"13054 Logstash Airways {'lat': '34.447102', 'lon': '108.751999'} ... 6 \n",
"13055 Logstash Airways {'lat': '47.464699', 'lon': '8.54917'} ... 6 \n",
"13056 Logstash Airways {'lat': '51.169997', 'lon': '128.445007'} ... 6 \n",
"13057 JetBeats {'lat': '-34.8222', 'lon': '-58.5358'} ... 6 \n",
"13058 JetBeats {'lat': '38.94449997', 'lon': '-77.45580292'} ... 6 \n",
" Carrier DestRegion ... dayOfWeek timestamp\n",
"0 Kibana Airlines SE-BD ... 0 2018-01-01 00:00:00\n",
"1 Logstash Airways IT-34 ... 0 2018-01-01 18:27:00\n",
"2 Logstash Airways IT-34 ... 0 2018-01-01 17:11:14\n",
"3 Kibana Airlines IT-34 ... 0 2018-01-01 10:33:28\n",
"4 Kibana Airlines SE-BD ... 0 2018-01-01 05:13:00\n",
"... ... ... ... ... ...\n",
"13054 Logstash Airways SE-BD ... 6 2018-02-11 20:42:25\n",
"13055 Logstash Airways CH-ZH ... 6 2018-02-11 01:41:57\n",
"13056 Logstash Airways RU-AMU ... 6 2018-02-11 04:09:27\n",
"13057 JetBeats SE-BD ... 6 2018-02-11 08:28:21\n",
"13058 JetBeats US-DC ... 6 2018-02-11 14:54:34\n",
"\n",
" timestamp \n",
"0 2018-01-01 00:00:00 \n",
"1 2018-01-01 18:27:00 \n",
"2 2018-01-01 17:11:14 \n",
"3 2018-01-01 10:33:28 \n",
"4 2018-01-01 05:13:00 \n",
"... ... \n",
"13054 2018-02-11 20:42:25 \n",
"13055 2018-02-11 01:41:57 \n",
"13056 2018-02-11 04:09:27 \n",
"13057 2018-02-11 08:28:21 \n",
"13058 2018-02-11 14:54:34 \n",
"\n",
"[13059 rows x 21 columns]"
"[13059 rows x 20 columns]"
]
},
"execution_count": 53,
@ -3241,6 +3404,7 @@
"source": [
"pd_flights.drop(columns=['AvgTicketPrice', \n",
" 'Cancelled', \n",
" 'DestLocation',\n",
" 'Dest', \n",
" 'DestAirportID', \n",
" 'DestCityName', \n",
@ -3250,7 +3414,11 @@
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -3274,7 +3442,7 @@
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Carrier</th>\n",
" <th>DestLocation</th>\n",
" <th>DestRegion</th>\n",
" <th>...</th>\n",
" <th>dayOfWeek</th>\n",
" <th>timestamp</th>\n",
@ -3284,7 +3452,7 @@
" <tr>\n",
" <th>0</th>\n",
" <td>Kibana Airlines</td>\n",
" <td>{'lon': '151.177002', 'lat': '-33.94609833'}</td>\n",
" <td>SE-BD</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2018-01-01 00:00:00</td>\n",
@ -3292,7 +3460,7 @@
" <tr>\n",
" <th>1</th>\n",
" <td>Logstash Airways</td>\n",
" <td>{'lon': '12.3519', 'lat': '45.505299'}</td>\n",
" <td>IT-34</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2018-01-01 18:27:00</td>\n",
@ -3300,7 +3468,7 @@
" <tr>\n",
" <th>2</th>\n",
" <td>Logstash Airways</td>\n",
" <td>{'lon': '12.3519', 'lat': '45.505299'}</td>\n",
" <td>IT-34</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2018-01-01 17:11:14</td>\n",
@ -3308,7 +3476,7 @@
" <tr>\n",
" <th>3</th>\n",
" <td>Kibana Airlines</td>\n",
" <td>{'lon': '12.1944', 'lat': '45.648399'}</td>\n",
" <td>IT-34</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2018-01-01 10:33:28</td>\n",
@ -3316,7 +3484,7 @@
" <tr>\n",
" <th>4</th>\n",
" <td>Kibana Airlines</td>\n",
" <td>{'lon': '108.751999', 'lat': '34.447102'}</td>\n",
" <td>SE-BD</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>2018-01-01 05:13:00</td>\n",
@ -3332,7 +3500,7 @@
" <tr>\n",
" <th>13054</th>\n",
" <td>Logstash Airways</td>\n",
" <td>{'lon': '108.751999', 'lat': '34.447102'}</td>\n",
" <td>SE-BD</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>2018-02-11 20:42:25</td>\n",
@ -3340,7 +3508,7 @@
" <tr>\n",
" <th>13055</th>\n",
" <td>Logstash Airways</td>\n",
" <td>{'lon': '8.54917', 'lat': '47.464699'}</td>\n",
" <td>CH-ZH</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>2018-02-11 01:41:57</td>\n",
@ -3348,7 +3516,7 @@
" <tr>\n",
" <th>13056</th>\n",
" <td>Logstash Airways</td>\n",
" <td>{'lon': '128.445007', 'lat': '51.169997'}</td>\n",
" <td>RU-AMU</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>2018-02-11 04:09:27</td>\n",
@ -3356,7 +3524,7 @@
" <tr>\n",
" <th>13057</th>\n",
" <td>JetBeats</td>\n",
" <td>{'lon': '-58.5358', 'lat': '-34.8222'}</td>\n",
" <td>SE-BD</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>2018-02-11 08:28:21</td>\n",
@ -3364,7 +3532,7 @@
" <tr>\n",
" <th>13058</th>\n",
" <td>JetBeats</td>\n",
" <td>{'lon': '-77.45580292', 'lat': '38.94449997'}</td>\n",
" <td>US-DC</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>2018-02-11 14:54:34</td>\n",
@ -3372,36 +3540,23 @@
" </tbody>\n",
"</table>\n",
"</div>\n",
"<p>13059 rows × 21 columns</p>"
"<p>13059 rows × 20 columns</p>"
],
"text/plain": [
" Carrier DestLocation ... dayOfWeek \\\n",
"0 Kibana Airlines {'lon': '151.177002', 'lat': '-33.94609833'} ... 0 \n",
"1 Logstash Airways {'lon': '12.3519', 'lat': '45.505299'} ... 0 \n",
"2 Logstash Airways {'lon': '12.3519', 'lat': '45.505299'} ... 0 \n",
"3 Kibana Airlines {'lon': '12.1944', 'lat': '45.648399'} ... 0 \n",
"4 Kibana Airlines {'lon': '108.751999', 'lat': '34.447102'} ... 0 \n",
"... ... ... ... ... \n",
"13054 Logstash Airways {'lon': '108.751999', 'lat': '34.447102'} ... 6 \n",
"13055 Logstash Airways {'lon': '8.54917', 'lat': '47.464699'} ... 6 \n",
"13056 Logstash Airways {'lon': '128.445007', 'lat': '51.169997'} ... 6 \n",
"13057 JetBeats {'lon': '-58.5358', 'lat': '-34.8222'} ... 6 \n",
"13058 JetBeats {'lon': '-77.45580292', 'lat': '38.94449997'} ... 6 \n",
" Carrier DestRegion ... dayOfWeek timestamp\n",
"0 Kibana Airlines SE-BD ... 0 2018-01-01 00:00:00\n",
"1 Logstash Airways IT-34 ... 0 2018-01-01 18:27:00\n",
"2 Logstash Airways IT-34 ... 0 2018-01-01 17:11:14\n",
"3 Kibana Airlines IT-34 ... 0 2018-01-01 10:33:28\n",
"4 Kibana Airlines SE-BD ... 0 2018-01-01 05:13:00\n",
"... ... ... ... ... ...\n",
"13054 Logstash Airways SE-BD ... 6 2018-02-11 20:42:25\n",
"13055 Logstash Airways CH-ZH ... 6 2018-02-11 01:41:57\n",
"13056 Logstash Airways RU-AMU ... 6 2018-02-11 04:09:27\n",
"13057 JetBeats SE-BD ... 6 2018-02-11 08:28:21\n",
"13058 JetBeats US-DC ... 6 2018-02-11 14:54:34\n",
"\n",
" timestamp \n",
"0 2018-01-01 00:00:00 \n",
"1 2018-01-01 18:27:00 \n",
"2 2018-01-01 17:11:14 \n",
"3 2018-01-01 10:33:28 \n",
"4 2018-01-01 05:13:00 \n",
"... ... \n",
"13054 2018-02-11 20:42:25 \n",
"13055 2018-02-11 01:41:57 \n",
"13056 2018-02-11 04:09:27 \n",
"13057 2018-02-11 08:28:21 \n",
"13058 2018-02-11 14:54:34 \n",
"\n",
"[13059 rows x 21 columns]"
"[13059 rows x 20 columns]"
]
},
"execution_count": 54,
@ -3412,6 +3567,7 @@
"source": [
"ed_flights.drop(columns=['AvgTicketPrice', \n",
" 'Cancelled', \n",
" 'DestLocation',\n",
" 'Dest', \n",
" 'DestAirportID', \n",
" 'DestCityName', \n",
@ -3428,7 +3584,11 @@
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -3451,7 +3611,11 @@
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"data": {
@ -3481,7 +3645,11 @@
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"ed_flights2 = ed_flights[(ed_flights.OriginAirportID == 'AMS') & (ed_flights.FlightDelayMin > 60)]\n",
@ -3492,7 +3660,11 @@
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"name": "stdout",
@ -3537,7 +3709,7 @@
" size: 5\n",
" sort_params: _doc:desc\n",
" _source: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']\n",
" body: {'query': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}, 'aggs': {}}\n",
" body: {'query': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}}\n",
" post_processing: [('sort_index')]\n",
"'field_to_display_names': {}\n",
"'display_to_field_names': {}\n",

View File

@ -5,7 +5,7 @@ Examples
========
.. toctree::
:maxdepth: 2
:maxdepth: 3
demo_notebook
online_retail_analysis

View File

@ -176,7 +176,7 @@
" size: None\n",
" sort_params: None\n",
" _source: None\n",
" body: {'aggs': {}}\n",
" body: {}\n",
" post_processing: []\n",
"'field_to_display_names': {}\n",
"'display_to_field_names': {}\n",
@ -308,7 +308,7 @@
" size: 2\n",
" sort_params: _doc:desc\n",
" _source: None\n",
" body: {'aggs': {}}\n",
" body: {}\n",
" post_processing: [('sort_index'), ('head': ('count': 2)), ('tail': ('count': 2))]\n",
"'field_to_display_names': {}\n",
"'display_to_field_names': {}\n",
@ -813,7 +813,7 @@
" size: None\n",
" sort_params: None\n",
" _source: None\n",
" body: {'query': {'bool': {'must': [{'term': {'Country': 'Germany'}}, {'range': {'Quantity': {'gt': 90}}}]}}, 'aggs': {}}\n",
" body: {'query': {'bool': {'must': [{'term': {'Country': 'Germany'}}, {'range': {'Quantity': {'gt': 90}}}]}}}\n",
" post_processing: []\n",
"'field_to_display_names': {}\n",
"'display_to_field_names': {}\n",
@ -1037,23 +1037,23 @@
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>14220.581670</td>\n",
" <td>14220.529879</td>\n",
" <td>1.000000</td>\n",
" <td>1.250000</td>\n",
" <td>3756.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>15666.545935</td>\n",
" <td>15661.227460</td>\n",
" <td>2.000000</td>\n",
" <td>2.510000</td>\n",
" <td>7498.861278</td>\n",
" <td>7499.363732</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>17213.978376</td>\n",
" <td>6.614054</td>\n",
" <td>4.215516</td>\n",
" <td>17214.478439</td>\n",
" <td>6.613198</td>\n",
" <td>4.210000</td>\n",
" <td>11249.500000</td>\n",
" </tr>\n",
" <tr>\n",
@ -1073,9 +1073,9 @@
"mean 15590.776680 7.464000 4.103233 7499.500000\n",
"std 1764.025160 85.924387 20.104873 4330.127009\n",
"min 12347.000000 -9360.000000 0.000000 0.000000\n",
"25% 14220.581670 1.000000 1.250000 3756.500000\n",
"50% 15666.545935 2.000000 2.510000 7498.861278\n",
"75% 17213.978376 6.614054 4.215516 11249.500000\n",
"25% 14220.529879 1.000000 1.250000 3756.500000\n",
"50% 15661.227460 2.000000 2.510000 7499.363732\n",
"75% 17214.478439 6.613198 4.210000 11249.500000\n",
"max 18239.000000 2880.000000 950.990000 14999.000000"
]
},

View File

@ -48,3 +48,5 @@ In general, the data resides in elasticsearch and not in memory, which allows el
* :doc:`examples/index`
* :doc:`examples/demo_notebook`
* :doc:`examples/online_retail_analysis`

View File

@ -3,6 +3,9 @@ from abc import ABC, abstractmethod
# -------------------------------------------------------------------------------------------------------------------- #
# PostProcessingActions #
# -------------------------------------------------------------------------------------------------------------------- #
from eland import SortOrder
class PostProcessingAction(ABC):
def __init__(self, action_type):
"""
@ -27,6 +30,7 @@ class PostProcessingAction(ABC):
def __repr__(self):
pass
class SortIndexAction(PostProcessingAction):
def __init__(self):
super().__init__("sort_index")
@ -37,6 +41,7 @@ class SortIndexAction(PostProcessingAction):
def __repr__(self):
return "('{}')".format(self.type)
class HeadAction(PostProcessingAction):
def __init__(self, count):
super().__init__("head")
@ -76,10 +81,10 @@ class SortFieldAction(PostProcessingAction):
raise ValueError("Expected ES sort params string (e.g. _doc:desc). Got '{}'".format(sort_params_string))
self._sort_field = sort_params[0]
self._sort_order = Operations.SortOrder.from_string(sort_params[1])
self._sort_order = SortOrder.from_string(sort_params[1])
def resolve_action(self, df):
if self._sort_order == Operations.SortOrder.ASC:
if self._sort_order == SortOrder.ASC:
return df.sort_values(self._sort_field, True)
return df.sort_values(self._sort_field, False)

View File

@ -13,6 +13,8 @@
# limitations under the License.
# Default number of rows displayed (different to pandas where ALL could be displayed)
from enum import Enum
DEFAULT_NUM_ROWS_DISPLAYED = 60
@ -22,3 +24,29 @@ def docstring_parameter(*sub):
return obj
return dec
class SortOrder(Enum):
ASC = 0
DESC = 1
@staticmethod
def reverse(order):
if order == SortOrder.ASC:
return SortOrder.DESC
return SortOrder.ASC
@staticmethod
def to_string(order):
if order == SortOrder.ASC:
return "asc"
return "desc"
@staticmethod
def from_string(order):
if order == "asc":
return SortOrder.ASC
return SortOrder.DESC
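Moving SortOrder into this shared module lets actions.py import it directly, which appears to be why the Operations.SortOrder references in actions.py are fixed above (SortOrder lived at module level in tasks.py, not on the Operations class). A quick sketch of the helpers, assuming the top-level eland export used elsewhere in this commit:

from eland import SortOrder

order = SortOrder.from_string('desc')
assert order == SortOrder.DESC
assert SortOrder.to_string(SortOrder.reverse(order)) == 'asc'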

eland/compat.py Normal file
View File

@ -0,0 +1,17 @@
# Copyright 2019 Elasticsearch BV
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
PY36 = sys.version_info >= (3, 6)
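PY36 gates version-specific behaviour; within this commit it is only used to skip a pandas bold_rows repr comparison on 3.5 (see the test_dataframe_repr change below). A trivial self-check, assuming eland is installed from this commit:

import sys

from eland.compat import PY36

assert PY36 == (sys.version_info >= (3, 6))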

View File

@ -27,6 +27,7 @@ from pandas.io.common import _expand_user, _stringify_path
from pandas.io.formats import console
from pandas.io.formats import format as fmt
from pandas.io.formats.printing import pprint_thing
from pandas.util._validators import validate_bool_kwarg
import eland.plotting as gfx
from eland import NDFrame
@ -255,6 +256,151 @@ class DataFrame(NDFrame):
"""
return DataFrame(query_compiler=self._query_compiler.tail(n))
def drop(
self,
labels=None,
axis=0,
index=None,
columns=None,
level=None,
inplace=False,
errors="raise",
):
"""Return new object with labels in requested axis removed.
Parameters
----------
labels:
Index or column labels to drop.
axis:
Whether to drop labels from the index (0 / 'index') or columns (1 / 'columns').
index, columns:
Alternative to specifying axis (labels, axis=1 is equivalent to columns=labels).
level:
For MultiIndex - not supported
inplace:
If True, do operation inplace and return None.
errors:
If 'ignore', suppress error and existing labels are dropped.
Returns
-------
dropped:
type of caller
See Also
--------
:pandas_api_docs:`pandas.DataFrame.drop`
Examples
--------
Drop a column
>>> df = ed.DataFrame('localhost', 'ecommerce', columns=['customer_first_name', 'email', 'user'])
>>> df.drop(columns=['user'])
customer_first_name email
0 Eddie eddie@underwood-family.zzz
1 Mary mary@bailey-family.zzz
2 Gwen gwen@butler-family.zzz
3 Diane diane@chandler-family.zzz
4 Eddie eddie@weber-family.zzz
... ... ...
4670 Mary mary@lambert-family.zzz
4671 Jim jim@gilbert-family.zzz
4672 Yahya yahya@rivera-family.zzz
4673 Mary mary@hampton-family.zzz
4674 Jackson jackson@hopkins-family.zzz
<BLANKLINE>
[4675 rows x 2 columns]
Drop rows by index value (axis=0)
>>> df.drop(['1', '2'])
customer_first_name email user
0 Eddie eddie@underwood-family.zzz eddie
3 Diane diane@chandler-family.zzz diane
4 Eddie eddie@weber-family.zzz eddie
5 Diane diane@goodwin-family.zzz diane
6 Oliver oliver@rios-family.zzz oliver
... ... ... ...
4670 Mary mary@lambert-family.zzz mary
4671 Jim jim@gilbert-family.zzz jim
4672 Yahya yahya@rivera-family.zzz yahya
4673 Mary mary@hampton-family.zzz mary
4674 Jackson jackson@hopkins-family.zzz jackson
<BLANKLINE>
[4673 rows x 3 columns]
"""
# Level not supported
if level is not None:
raise NotImplementedError("level not supported {}".format(level))
inplace = validate_bool_kwarg(inplace, "inplace")
if labels is not None:
if index is not None or columns is not None:
raise ValueError("Cannot specify both 'labels' and 'index'/'columns'")
axis = pd.DataFrame()._get_axis_name(axis)
axes = {axis: labels}
elif index is not None or columns is not None:
axes, _ = pd.DataFrame()._construct_axes_from_arguments(
(index, columns), {}
)
else:
raise ValueError(
"Need to specify at least one of 'labels', 'index' or 'columns'"
)
# TODO Clean up this error checking
if "index" not in axes:
axes["index"] = None
elif axes["index"] is not None:
if not is_list_like(axes["index"]):
axes["index"] = [axes["index"]]
if errors == "raise":
# Check if axes['index'] values exists in index
count = self._query_compiler._index_matches_count(axes["index"])
if count != len(axes["index"]):
raise ValueError(
"number of labels {}!={} not contained in axis".format(count, len(axes["index"]))
)
else:
"""
axes["index"] = self._query_compiler.index_matches(axes["index"])
# If the length is zero, we will just do nothing
if not len(axes["index"]):
axes["index"] = None
"""
raise NotImplementedError()
if "columns" not in axes:
axes["columns"] = None
elif axes["columns"] is not None:
if not is_list_like(axes["columns"]):
axes["columns"] = [axes["columns"]]
if errors == "raise":
non_existant = [
obj for obj in axes["columns"] if obj not in self.columns
]
if len(non_existant):
raise ValueError(
"labels {} not contained in axis".format(non_existant)
)
else:
axes["columns"] = [
obj for obj in axes["columns"] if obj in self.columns
]
# If the length is zero, we will just do nothing
if not len(axes["columns"]):
axes["columns"] = None
new_query_compiler = self._query_compiler.drop(
index=axes["index"], columns=axes["columns"]
)
return self._create_or_update_from_compiler(new_query_compiler, inplace)
def __getitem__(self, key):
return self._getitem(key)
def __repr__(self):
"""
From pandas
@ -312,7 +458,8 @@ class DataFrame(NDFrame):
max_rows = min_rows
return self.to_html(max_rows=max_rows, max_cols=max_cols,
show_dimensions=show_dimensions, notebook=True) # set for consistency with pandas output
show_dimensions=show_dimensions,
notebook=True) # set for consistency with pandas output
else:
return None
@ -417,7 +564,7 @@ class DataFrame(NDFrame):
size: 5
sort_params: _doc:desc
_source: ['timestamp', 'OriginAirportID', 'DestAirportID', 'FlightDelayMin']
body: {'query': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}, 'aggs': {}}
body: {'query': {'bool': {'must': [{'term': {'OriginAirportID': 'AMS'}}, {'range': {'FlightDelayMin': {'gt': 60}}}]}}}
post_processing: [('sort_index')]
'field_to_display_names': {}
'display_to_field_names': {}

View File

@ -24,10 +24,10 @@ class BooleanFilter:
if isinstance(self, AndFilter):
if 'must_not' in x.subtree:
# nest a must_not under a must
self.subtree['must'].append(x.build()) # 'build includes bool'
self.subtree['must'].append(x.build()) # 'build includes bool'
else:
# append a must to a must
self.subtree['must'].append(x.subtree) # 'subtree strips bool'
self.subtree['must'].append(x.subtree) # 'subtree strips bool'
return self
elif isinstance(x, AndFilter):
if 'must_not' in self.subtree:

View File

@ -11,8 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from collections import OrderedDict
import numpy as np
import pandas as pd
@ -66,7 +66,7 @@ class Mappings:
"""
# here we keep track of the format of any date fields
self._date_fields_format = {}
self._date_fields_format = dict()
if (client is not None) and (index_pattern is not None):
get_mapping = client.get_mapping(index=index_pattern)
@ -86,7 +86,8 @@ class Mappings:
# Cache source field types for efficient lookup
# (this massively improves performance of DataFrame.flatten)
self._source_field_pd_dtypes = {}
self._source_field_pd_dtypes = OrderedDict()
for field_name in self._mappings_capabilities[self._mappings_capabilities._source].index:
pd_dtype = self._mappings_capabilities.loc[field_name]['pd_dtype']
@ -135,14 +136,14 @@ class Mappings:
Returns
-------
fields, dates_format: tuple(dict, dict)
fields, dates_format: tuple(OrderedDict, dict)
where:
fields: Dict of field names and types
fields: OrderedDict of field names and types
dates_format: Dict of date field names and format
"""
fields = {}
dates_format = {}
fields = OrderedDict()
dates_format = dict()
# Recurse until we get a 'type: xxx'
def flatten(x, name=''):
@ -206,7 +207,7 @@ class Mappings:
all_fields_caps_fields = all_fields_caps['fields']
field_names = ['_source', 'es_dtype', 'pd_dtype', 'searchable', 'aggregatable']
capability_matrix = {}
capability_matrix = OrderedDict()
for field, field_caps in all_fields_caps_fields.items():
if field in all_fields:
@ -353,7 +354,7 @@ class Mappings:
else:
es_dtype = Mappings._pd_dtype_to_es_dtype(dtype)
mappings['properties'][field_name_name] = {}
mappings['properties'][field_name_name] = OrderedDict()
mappings['properties'][field_name_name]['type'] = es_dtype
return {"mappings": mappings}
@ -401,8 +402,8 @@ class Mappings:
Returns
-------
dict
A dictionary (for date fields) containing the mapping {field_name:format}
str
A string (for date fields) containing the date format for the field
"""
return self._date_fields_format.get(field_name)
@ -460,12 +461,12 @@ class Mappings:
Returns
-------
dict
OrderedDict
e.g. {'customer_full_name': 'customer_full_name.keyword', ...}
"""
if field_names is None:
field_names = self.source_fields()
aggregatables = {}
aggregatables = OrderedDict()
for field_name in field_names:
capabilities = self.field_capabilities(field_name)
if capabilities['aggregatable']:
@ -478,7 +479,7 @@ class Mappings:
aggregatables[field_name_keyword] = field_name
if not aggregatables:
raise ValueError("Aggregations not supported for ", field_name)
raise ValueError("Aggregations not supported for ", field_names)
return aggregatables
@ -533,11 +534,15 @@ class Mappings:
Source field name + pd_dtype as np.dtype
"""
if field_names is not None:
return pd.Series(
{key: np.dtype(self._source_field_pd_dtypes[key]) for key in field_names})
data = OrderedDict()
for key in field_names:
data[key] = np.dtype(self._source_field_pd_dtypes[key])
return pd.Series(data)
return pd.Series(
{key: np.dtype(value) for key, value in self._source_field_pd_dtypes.items()})
data = OrderedDict()
for key, value in self._source_field_pd_dtypes.items():
data[key] = np.dtype(value)
return pd.Series(data)
def info_es(self, buf):
buf.write("Mappings:\n")

View File

@ -1,3 +1,22 @@
# Copyright 2019 Elasticsearch BV
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from abc import ABC, abstractmethod
from eland import QueryCompiler
"""
NDFrame
---------
@ -23,29 +42,6 @@ only Elasticsearch aggregatable fields can be aggregated or grouped.
"""
# Copyright 2019 Elasticsearch BV
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from abc import ABC
import pandas as pd
from pandas.core.dtypes.common import is_list_like
from pandas.util._validators import validate_bool_kwarg
from eland import ElandQueryCompiler
class NDFrame(ABC):
@ -64,8 +60,8 @@ class NDFrame(ABC):
A reference to a Elasticsearch python client
"""
if query_compiler is None:
query_compiler = ElandQueryCompiler(client=client, index_pattern=index_pattern, field_names=columns,
index_field=index_field)
query_compiler = QueryCompiler(client=client, index_pattern=index_pattern, field_names=columns,
index_field=index_field)
self._query_compiler = query_compiler
def _get_index(self):
@ -139,9 +135,6 @@ class NDFrame(ABC):
return head.append(tail)
def __getitem__(self, key):
return self._getitem(key)
def __sizeof__(self):
# Don't default to pandas, just return approximation TODO - make this more accurate
return sys.getsizeof(self._query_compiler)
@ -157,148 +150,6 @@ class NDFrame(ABC):
def _info_es(self, buf):
self._query_compiler.info_es(buf)
def drop(
self,
labels=None,
axis=0,
index=None,
columns=None,
level=None,
inplace=False,
errors="raise",
):
"""Return new object with labels in requested axis removed.
Parameters
----------
labels:
Index or column labels to drop.
axis:
Whether to drop labels from the index (0 / 'index') or columns (1 / 'columns').
index, columns:
Alternative to specifying axis (labels, axis=1 is equivalent to columns=labels).
level:
For MultiIndex - not supported
inplace:
If True, do operation inplace and return None.
errors:
If 'ignore', suppress error and existing labels are dropped.
Returns
-------
dropped:
type of caller
See Also
--------
:pandas_api_docs:`pandas.DataFrame.drop`
Examples
--------
Drop a column
>>> df = ed.DataFrame('localhost', 'ecommerce', columns=['customer_first_name', 'email', 'user'])
>>> df.drop(columns=['user'])
customer_first_name email
0 Eddie eddie@underwood-family.zzz
1 Mary mary@bailey-family.zzz
2 Gwen gwen@butler-family.zzz
3 Diane diane@chandler-family.zzz
4 Eddie eddie@weber-family.zzz
... ... ...
4670 Mary mary@lambert-family.zzz
4671 Jim jim@gilbert-family.zzz
4672 Yahya yahya@rivera-family.zzz
4673 Mary mary@hampton-family.zzz
4674 Jackson jackson@hopkins-family.zzz
<BLANKLINE>
[4675 rows x 2 columns]
Drop rows by index value (axis=0)
>>> df.drop(['1', '2'])
customer_first_name email user
0 Eddie eddie@underwood-family.zzz eddie
3 Diane diane@chandler-family.zzz diane
4 Eddie eddie@weber-family.zzz eddie
5 Diane diane@goodwin-family.zzz diane
6 Oliver oliver@rios-family.zzz oliver
... ... ... ...
4670 Mary mary@lambert-family.zzz mary
4671 Jim jim@gilbert-family.zzz jim
4672 Yahya yahya@rivera-family.zzz yahya
4673 Mary mary@hampton-family.zzz mary
4674 Jackson jackson@hopkins-family.zzz jackson
<BLANKLINE>
[4673 rows x 3 columns]
"""
# Level not supported
if level is not None:
raise NotImplementedError("level not supported {}".format(level))
inplace = validate_bool_kwarg(inplace, "inplace")
if labels is not None:
if index is not None or columns is not None:
raise ValueError("Cannot specify both 'labels' and 'index'/'columns'")
axis = pd.DataFrame()._get_axis_name(axis)
axes = {axis: labels}
elif index is not None or columns is not None:
axes, _ = pd.DataFrame()._construct_axes_from_arguments(
(index, columns), {}
)
else:
raise ValueError(
"Need to specify at least one of 'labels', 'index' or 'columns'"
)
# TODO Clean up this error checking
if "index" not in axes:
axes["index"] = None
elif axes["index"] is not None:
if not is_list_like(axes["index"]):
axes["index"] = [axes["index"]]
if errors == "raise":
# Check if axes['index'] values exists in index
count = self._query_compiler._index_matches_count(axes["index"])
if count != len(axes["index"]):
raise ValueError(
"number of labels {}!={} not contained in axis".format(count, len(axes["index"]))
)
else:
"""
axes["index"] = self._query_compiler.index_matches(axes["index"])
# If the length is zero, we will just do nothing
if not len(axes["index"]):
axes["index"] = None
"""
raise NotImplementedError()
if "columns" not in axes:
axes["columns"] = None
elif axes["columns"] is not None:
if not is_list_like(axes["columns"]):
axes["columns"] = [axes["columns"]]
if errors == "raise":
non_existant = [
obj for obj in axes["columns"] if obj not in self.columns
]
if len(non_existant):
raise ValueError(
"labels {} not contained in axis".format(non_existant)
)
else:
axes["columns"] = [
obj for obj in axes["columns"] if obj in self.columns
]
# If the length is zero, we will just do nothing
if not len(axes["columns"]):
axes["columns"] = None
new_query_compiler = self._query_compiler.drop(
index=axes["index"], columns=axes["columns"]
)
return self._create_or_update_from_compiler(new_query_compiler, inplace)
def mean(self, numeric_only=True):
"""
Return mean value for each numeric column
@ -518,3 +369,15 @@ class NDFrame(ABC):
max 1199.729004 360.000000
"""
return self._query_compiler.describe()
@abstractmethod
def _to_pandas(self):
pass
@abstractmethod
def head(self, n=5):
pass
@abstractmethod
def tail(self, n=5):
pass

View File

@ -13,14 +13,15 @@
# limitations under the License.
import copy
from collections import OrderedDict
import pandas as pd
from eland import Index
from eland import Index, SortOrder
from eland import Query
from eland.actions import SortFieldAction
from eland.tasks import HeadTask, TailTask, BooleanFilterTask, ArithmeticOpFieldsTask, QueryTermsTask, \
QueryIdsTask, SortOrder, SizeTask
QueryIdsTask, SizeTask
class Operations:
@ -35,6 +36,7 @@ class Operations:
This is maintained as a 'task graph' (inspired by dask)
(see https://docs.dask.org/en/latest/spec.html)
"""
def __init__(self, tasks=None, field_names=None):
if tasks is None:
self._tasks = []
@ -94,7 +96,7 @@ class Operations:
# Only return requested field_names
fields = query_compiler.field_names
counts = {}
counts = OrderedDict()
for field in fields:
body = Query(query_params['query'])
body.exists(field, must=True)
@ -171,7 +173,7 @@ class Operations:
# "value" : 628.2536888148849
# }
# }
results = {}
results = OrderedDict()
if field_types == 'aggregatable':
for key, value in source_fields.items():
@ -220,7 +222,7 @@ class Operations:
size=0,
body=body.to_search_body())
results = {}
results = OrderedDict()
for key in aggregatable_field_names.keys():
# key is aggregatable field, value is label
@ -276,8 +278,8 @@ class Operations:
# },
# ...
bins = {}
weights = {}
bins = OrderedDict()
weights = OrderedDict()
# There is one more bin that weights
# len(bins) = len(weights) + 1
@ -415,7 +417,7 @@ class Operations:
sum 8.204365e+06 9.261629e+07 5.754909e+07 618150
min 1.000205e+02 0.000000e+00 0.000000e+00 0
"""
results = {}
results = OrderedDict()
for field in field_names:
values = list()
@ -455,7 +457,7 @@ class Operations:
size=0,
body=body.to_search_body())
results = {}
results = OrderedDict()
for field in numeric_source_fields:
values = list()

View File

@ -152,9 +152,15 @@ class Query:
def to_search_body(self):
if self._query.empty():
body = {"aggs": self._aggs}
if self._aggs:
body = {"aggs": self._aggs}
else:
body = {}
else:
body = {"query": self._query.build(), "aggs": self._aggs}
if self._aggs:
body = {"query": self._query.build(), "aggs": self._aggs}
else:
body = {"query": self._query.build()}
return body
def to_count_body(self):
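The reworked to_search_body only emits 'query' and 'aggs' keys when they are non-empty, which is why the 'aggs': {} entries disappear from the info_es() output in the notebook diffs above. A standalone re-statement of the new logic (function and argument names are hypothetical):

def to_search_body(query, aggs):
    # Omit empty clauses instead of always sending 'aggs': {}
    body = {}
    if query:
        body['query'] = query
    if aggs:
        body['aggs'] = aggs
    return body

assert to_search_body({}, {}) == {}
assert to_search_body({'match_all': {}}, {}) == {'query': {'match_all': {}}}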

View File

@ -13,6 +13,7 @@
# limitations under the License.
import warnings
from collections import OrderedDict
from typing import Union
import numpy as np
@ -24,7 +25,7 @@ from eland import Mappings
from eland import Operations
class ElandQueryCompiler:
class QueryCompiler:
"""
Some notes on what can and can not be mapped:
@ -73,7 +74,7 @@ class ElandQueryCompiler:
self.field_names = field_names
if name_mapper is None:
self._name_mapper = ElandQueryCompiler.DisplayNameToFieldNameMapper()
self._name_mapper = QueryCompiler.DisplayNameToFieldNameMapper()
else:
self._name_mapper = name_mapper
@ -276,7 +277,7 @@ class ElandQueryCompiler:
return partial_result, df
def _flatten_dict(self, y):
out = {}
out = OrderedDict()
def flatten(x, name=''):
# We flatten into source fields e.g. if type=geo_point
@ -360,14 +361,14 @@ class ElandQueryCompiler:
def _empty_pd_ef(self):
# Return an empty dataframe with correct columns and dtypes
df = pd.DataFrame()
for c, d in zip(self.columns, self.dtypes):
for c, d in zip(self.dtypes.index, self.dtypes.values):
df[c] = pd.Series(dtype=d)
return df
def copy(self):
return ElandQueryCompiler(client=self._client, index_pattern=self._index_pattern, field_names=None,
index_field=self._index.index_field, operations=self._operations.copy(),
name_mapper=self._name_mapper.copy())
return QueryCompiler(client=self._client, index_pattern=self._index_pattern, field_names=None,
index_field=self._index.index_field, operations=self._operations.copy(),
name_mapper=self._name_mapper.copy())
def rename(self, renames, inplace=False):
if inplace:
@ -500,7 +501,7 @@ class ElandQueryCompiler:
Parameters
----------
right: ElandQueryCompiler
right: QueryCompiler
The query compiler to compare self to
Raises
@ -508,7 +509,7 @@ class ElandQueryCompiler:
TypeError, ValueError
If arithmetic operations aren't possible
"""
if not isinstance(right, ElandQueryCompiler):
if not isinstance(right, QueryCompiler):
raise TypeError(
"Incompatible types "
"{0} != {1}".format(type(self), type(right))
@ -539,7 +540,7 @@ class ElandQueryCompiler:
Parameters
----------
right: ElandQueryCompiler
right: QueryCompiler
The query compiler to compare self to
Raises
@ -585,12 +586,12 @@ class ElandQueryCompiler:
if field_to_display_names is not None:
self._field_to_display_names = field_to_display_names
else:
self._field_to_display_names = dict()
self._field_to_display_names = {}
if display_to_field_names is not None:
self._display_to_field_names = display_to_field_names
else:
self._display_to_field_names = dict()
self._display_to_field_names = {}
def rename_display_name(self, renames):
for current_display_name, new_display_name in renames.items():

View File

@ -1055,7 +1055,8 @@ class Series(NDFrame):
# our operation is between series
op_type = op_type + tuple('s')
# check if fields are aggregatable
self.name, right.name = self._query_compiler.check_str_arithmetics(right._query_compiler, self.name, right.name)
self.name, right.name = self._query_compiler.check_str_arithmetics(right._query_compiler, self.name,
right.name)
series = Series(query_compiler=self._query_compiler.arithmetic_op_fields(
new_field_name, method_name, self.name, right.name, op_type))
@ -1067,7 +1068,7 @@ class Series(NDFrame):
# TODO - support limited ops on strings https://github.com/elastic/eland/issues/65
raise TypeError(
"unsupported operation type(s) ['{}'] for operands ['{}' with dtype '{}', '{}']"
.format(method_name, type(self), self._dtype, type(right).__name__)
.format(method_name, type(self), self._dtype, type(right).__name__)
)
# check left number and right numeric series
@ -1103,7 +1104,7 @@ class Series(NDFrame):
# TODO - support limited ops on strings https://github.com/elastic/eland/issues/65
raise TypeError(
"unsupported operation type(s) ['{}'] for operands ['{}' with dtype '{}', '{}']"
.format(method_name, type(self), self._dtype, type(right).__name__)
.format(method_name, type(self), self._dtype, type(right).__name__)
)
def _numeric_rop(self, left, method_name, op_type=None):
@ -1146,7 +1147,7 @@ class Series(NDFrame):
# TODO - support limited ops on strings https://github.com/elastic/eland/issues/65
raise TypeError(
"unsupported operation type(s) ['{}'] for operands ['{}' with dtype '{}', '{}']"
.format(op_method_name, type(self), self._dtype, type(left).__name__)
.format(op_method_name, type(self), self._dtype, type(left).__name__)
)
def max(self):

View File

@ -1,37 +1,11 @@
from abc import ABC, abstractmethod
from enum import Enum
import numpy as np
from eland import SortOrder
from eland.actions import HeadAction, TailAction, SortIndexAction
class SortOrder(Enum):
ASC = 0
DESC = 1
@staticmethod
def reverse(order):
if order == SortOrder.ASC:
return SortOrder.DESC
return SortOrder.ASC
@staticmethod
def to_string(order):
if order == SortOrder.ASC:
return "asc"
return "desc"
@staticmethod
def from_string(order):
if order == "asc":
return SortOrder.ASC
return SortOrder.DESC
# -------------------------------------------------------------------------------------------------------------------- #
# Tasks #
# -------------------------------------------------------------------------------------------------------------------- #
@ -305,7 +279,7 @@ class ArithmeticOpFieldsTask(Task):
raise NotImplementedError("Not implemented operation '{0}'".format(self._op_name))
if query_params['query_script_fields'] is None:
query_params['query_script_fields'] = {}
query_params['query_script_fields'] = dict()
query_params['query_script_fields'][self._field_name] = {
'script': {
'source': source
@ -428,7 +402,7 @@ class ArithmeticOpFieldsTask(Task):
raise NotImplementedError("Not implemented operation '{0}'".format(self._op_name))
if query_params['query_script_fields'] is None:
query_params['query_script_fields'] = {}
query_params['query_script_fields'] = dict()
query_params['query_script_fields'][self._field_name] = {
'script': {
'source': source

View File

@ -14,8 +14,8 @@
import os
from elasticsearch import Elasticsearch
import pandas as pd
from elasticsearch import Elasticsearch
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

View File

@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -28,4 +28,7 @@ class TestDataFrameCount(TestData):
pd_count = pd_ecommerce.count()
ed_count = ed_ecommerce.count()
print(pd_count)
print(ed_count)
assert_series_equal(pd_count, ed_count)

View File

@ -15,7 +15,6 @@
# File called _pytest for PyCharm compatability
from datetime import datetime
from elasticsearch import Elasticsearch
import numpy as np
import pandas as pd
@ -27,7 +26,6 @@ from eland.tests.common import assert_pandas_eland_series_equal
class TestDataFrameDateTime(TestData):
times = ["2019-11-26T19:58:15.246+0000",
"1970-01-01T00:00:03.000+0000"]
time_index_name = 'test_time_formats'

View File

@ -40,5 +40,5 @@ class TestDataFrameInit:
df0 = ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_INDEX_NAME)
df1 = ed.DataFrame(client=ES_TEST_CLIENT, index_pattern=FLIGHTS_INDEX_NAME)
qc = ed.ElandQueryCompiler(client=ES_TEST_CLIENT, index_pattern=FLIGHTS_INDEX_NAME)
qc = ed.QueryCompiler(client=ES_TEST_CLIENT, index_pattern=FLIGHTS_INDEX_NAME)
df2 = ed.DataFrame(query_compiler=qc)

View File

@ -15,7 +15,6 @@
# File called _pytest for PyCharm compatability
import pandas as pd
from elasticsearch import Elasticsearch
import eland as ed
from eland.tests.common import ES_TEST_CLIENT
@ -128,4 +127,4 @@ class TestDataFrameQuery(TestData):
assert_pandas_eland_frame_equal(pd_q4, ed_q4)
ES_TEST_CLIENT.indices.delete(index_name)
ES_TEST_CLIENT.indices.delete(index_name)

View File

@ -17,6 +17,7 @@
import pandas as pd
import pytest
from eland.compat import PY36
from eland.dataframe import DEFAULT_NUM_ROWS_DISPLAYED
from eland.tests.common import TestData
@ -198,7 +199,10 @@ class TestDataFrameRepr(TestData):
# print(ed_head_str)
# print(pd_head_str)
assert pd_head_str == ed_head_str
# Currently pandas display bold_rows=True with >=PY36 and bold_rows=False with 3.5
# TODO - fix this test for 3.5
if PY36:
assert pd_head_str == ed_head_str
def test_empty_dataframe_repr_html(self):
# TODO - there is a bug in 'show_dimensions' as it gets added after the last </div>

View File

@ -18,7 +18,6 @@ import ast
import time
import pandas as pd
from elasticsearch import Elasticsearch
from pandas.util.testing import assert_frame_equal
import eland as ed

View File

@ -54,3 +54,6 @@ class TestDataFrameUtils(TestData):
ed_df_head = ed_df.head()
assert_pandas_eland_frame_equal(df, ed_df_head)
def test_eland_to_pandas_performance(self):
pd_df = ed.eland_to_pandas(self.ed_flights())

View File

@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -188,20 +188,20 @@ class TestOperators:
exp = (GreaterEqual('a', 2) & GreaterEqual('b', 2)) & ~(IsIn('ids', [1, 2, 3]))
a = exp.build()
b = {
'bool': {
'must': [
{'range': {'a': {'gte': 2}}},
{'range': {'b': {'gte': 2}}},
{
'bool': {
'must_not': {
'ids': {'values': [1, 2, 3]}
}
}
}
]
}
}
'bool': {
'must': [
{'range': {'a': {'gte': 2}}},
{'range': {'b': {'gte': 2}}},
{
'bool': {
'must_not': {
'ids': {'values': [1, 2, 3]}
}
}
}
]
}
}
assert a == b
def test_must_not_and_must_filter(self):

View File

@ -14,7 +14,7 @@
# File called _pytest for PyCharm compatability
from eland import ElandQueryCompiler
from eland import QueryCompiler
from eland.tests.common import TestData
@ -24,7 +24,7 @@ class TestQueryCompilerRename(TestData):
field_names = []
display_names = []
mapper = ElandQueryCompiler.DisplayNameToFieldNameMapper()
mapper = QueryCompiler.DisplayNameToFieldNameMapper()
assert field_names == mapper.field_names_to_list()
assert display_names == mapper.display_names_to_list()
@ -58,7 +58,7 @@ class TestQueryCompilerRename(TestData):
def test_query_compiler_basic_rename_columns(self):
columns = ['a', 'b', 'c', 'd']
mapper = ElandQueryCompiler.DisplayNameToFieldNameMapper()
mapper = QueryCompiler.DisplayNameToFieldNameMapper()
display_names = ['A', 'b', 'c', 'd']
update_A = {'a': 'A'}

View File

@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -14,7 +14,6 @@
# File called _pytest for PyCharm compatability
import pytest
import numpy as np
from eland.tests.common import TestData, assert_pandas_eland_series_equal
@ -60,7 +59,6 @@ class TestSeriesArithmetics(TestData):
assert_pandas_eland_series_equal(pdadd, edadd)
def test_ser_add_str_add_ser(self):
pdadd = self.pd_ecommerce()['customer_first_name'] + self.pd_ecommerce()['customer_last_name']
print(pdadd.name)
@ -84,5 +82,5 @@ class TestSeriesArithmetics(TestData):
assert self.ed_ecommerce()['customer_gender'] + self.ed_ecommerce()['customer_first_name']
def test_aggregatable_add_non_aggregatable(self):
with pytest.raises(ValueError):
assert self.ed_ecommerce()['customer_first_name'] + self.ed_ecommerce()['customer_gender']
with pytest.raises(ValueError):
assert self.ed_ecommerce()['customer_first_name'] + self.ed_ecommerce()['customer_gender']

View File

@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from elasticsearch.client import ClusterClient
@ -70,9 +69,9 @@ def _update_max_compilations_limit(es, limit="10000/1m"):
print('Updating script.max_compilations_rate to ', limit)
cluster_client = ClusterClient(es)
body = {
"transient" : {
"script.max_compilations_rate" : limit
}
"transient": {
"script.max_compilations_rate": limit
}
}
cluster_client.put_settings(body=body)

View File

@ -243,7 +243,7 @@ def read_csv(filepath_or_buffer,
Parameters
----------
es_params: Elasticsearch client argument(s)
es_client: Elasticsearch client argument(s)
- elasticsearch-py parameters or
- elasticsearch-py instance or
- eland.Client instance
@ -260,8 +260,6 @@ def read_csv(filepath_or_buffer,
* False: Include missing values - may cause bulk to fail
es_geo_points: list, default None
List of columns to map to geo_point data type
iterator
not supported
chunksize
number of csv rows to read before bulk index into Elasticsearch
@ -275,6 +273,8 @@ def read_csv(filepath_or_buffer,
Notes
-----
iterator not supported
TODO - currently the eland.DataFrame may not retain the order of the data in the csv.
"""
kwds = dict()

View File

@ -12,10 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from setuptools import setup, find_packages
from codecs import open
from os import path
from setuptools import setup
here = path.abspath(path.dirname(__file__))
with open(path.join(here, 'README.md'), encoding='utf-8') as f: