Merge pull request #51 from stevedodson/master

Updating docs + added supported methods doc
This commit is contained in:
stevedodson 2019-11-19 14:09:13 +00:00 committed by GitHub
commit 885a0a4aba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 480 additions and 315 deletions

View File

@ -1,3 +1,6 @@
.. _implementation/dataframe_supported:
===============================
pandas.DataFrame supported APIs
===============================
@ -8,20 +11,18 @@ also welcome!
The following table is structured as follows: The first column contains the method name.
The second column is a flag for whether or not there is an implementation in Modin for
the method in the left column. ``Y`` stands for yes, ``N`` stands for no, ``P`` stands
for partial (meaning some parameters may not be supported yet), and ``D`` stands for
default to pandas.
the method in the left column. ``Y`` stands for yes, ``N`` stands for no.
The CSV file at https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv represents a prioritised list.
+-------------------------+-------+------------------------------------------------+
| Method | Count | Notes |
+-------------------------+-------+------------------------------------------------+
| pd.read_csv | 1422 | Not implemented; ed.read_es implemented instead |
| pd.read_csv | 1422 | y |
+-------------------------+-------+------------------------------------------------+
| pd.DataFrame | 886 | y |
+-------------------------+-------+------------------------------------------------+
| df.append | 792 | Not implemented |
| df.append | 792 | n |
+-------------------------+-------+------------------------------------------------+
| df.mean | 783 | y |
+-------------------------+-------+------------------------------------------------+
@ -31,407 +32,407 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+-------------------------+-------+------------------------------------------------+
| df.sum | 755 | y |
+-------------------------+-------+------------------------------------------------+
| df.to_csv | 693 | |
| df.to_csv | 693 | y |
+-------------------------+-------+------------------------------------------------+
| df.get | 669 | |
| df.get | 669 | y |
+-------------------------+-------+------------------------------------------------+
| df.mode | 653 | |
| df.mode | 653 | n |
+-------------------------+-------+------------------------------------------------+
| df.astype | 649 | |
| df.astype | 649 | n |
+-------------------------+-------+------------------------------------------------+
| df.sub | 637 | |
| df.sub | 637 | n |
+-------------------------+-------+------------------------------------------------+
| pd.concat | 582 | |
| pd.concat | 582 | n |
+-------------------------+-------+------------------------------------------------+
| df.apply | 577 | |
| df.apply | 577 | n |
+-------------------------+-------+------------------------------------------------+
| df.groupby | 557 | |
| df.groupby | 557 | n |
+-------------------------+-------+------------------------------------------------+
| df.join | 544 | |
| df.join | 544 | n |
+-------------------------+-------+------------------------------------------------+
| df.fillna | 543 | |
| df.fillna | 543 | n |
+-------------------------+-------+------------------------------------------------+
| df.max | 508 | |
| df.max | 508 | y |
+-------------------------+-------+------------------------------------------------+
| df.reset_index | 434 | |
| df.reset_index | 434 | n |
+-------------------------+-------+------------------------------------------------+
| pd.unique | 433 | |
| pd.unique | 433 | n |
+-------------------------+-------+------------------------------------------------+
| df.le | 405 | |
| df.le | 405 | n |
+-------------------------+-------+------------------------------------------------+
| df.count | 399 | |
| df.count | 399 | y |
+-------------------------+-------+------------------------------------------------+
| pd.value_counts | 397 | |
| pd.value_counts | 397 | n |
+-------------------------+-------+------------------------------------------------+
| df.sort_values | 390 | |
| df.sort_values | 390 | n |
+-------------------------+-------+------------------------------------------------+
| df.transform | 387 | |
| df.transform | 387 | n |
+-------------------------+-------+------------------------------------------------+
| df.merge | 376 | |
| df.merge | 376 | n |
+-------------------------+-------+------------------------------------------------+
| df.add | 346 | |
| df.add | 346 | n |
+-------------------------+-------+------------------------------------------------+
| df.isnull | 338 | |
| df.isnull | 338 | n |
+-------------------------+-------+------------------------------------------------+
| df.min | 321 | |
| df.min | 321 | y |
+-------------------------+-------+------------------------------------------------+
| df.copy | 314 | |
| df.copy | 314 | n |
+-------------------------+-------+------------------------------------------------+
| df.replace | 300 | |
| df.replace | 300 | n |
+-------------------------+-------+------------------------------------------------+
| df.std | 261 | |
| df.std | 261 | n |
+-------------------------+-------+------------------------------------------------+
| df.hist | 246 | |
| df.hist | 246 | y |
+-------------------------+-------+------------------------------------------------+
| df.filter | 234 | |
| df.filter | 234 | n |
+-------------------------+-------+------------------------------------------------+
| df.describe | 220 | |
| df.describe | 220 | y |
+-------------------------+-------+------------------------------------------------+
| df.ne | 218 | |
| df.ne | 218 | n |
+-------------------------+-------+------------------------------------------------+
| df.corr | 217 | |
| df.corr | 217 | n |
+-------------------------+-------+------------------------------------------------+
| df.median | 217 | |
| df.median | 217 | n |
+-------------------------+-------+------------------------------------------------+
| df.items | 212 | |
| df.items | 212 | n |
+-------------------------+-------+------------------------------------------------+
| pd.to_datetime | 204 | |
| pd.to_datetime | 204 | n |
+-------------------------+-------+------------------------------------------------+
| df.isin | 203 | |
| df.isin | 203 | n |
+-------------------------+-------+------------------------------------------------+
| df.dropna | 195 | |
| df.dropna | 195 | n |
+-------------------------+-------+------------------------------------------------+
| pd.get_dummies | 190 | |
| pd.get_dummies | 190 | n |
+-------------------------+-------+------------------------------------------------+
| df.rename | 185 | |
| df.rename | 185 | n |
+-------------------------+-------+------------------------------------------------+
| df.info | 180 | |
| df.info | 180 | y |
+-------------------------+-------+------------------------------------------------+
| df.set_index | 166 | |
| df.set_index | 166 | n |
+-------------------------+-------+------------------------------------------------+
| df.keys | 159 | |
| df.keys | 159 | y |
+-------------------------+-------+------------------------------------------------+
| df.sample | 155 | |
| df.sample | 155 | n |
+-------------------------+-------+------------------------------------------------+
| df.agg | 140 | |
| df.agg | 140 | y |
+-------------------------+-------+------------------------------------------------+
| df.where | 138 | |
| df.where | 138 | n |
+-------------------------+-------+------------------------------------------------+
| df.boxplot | 134 | |
| df.boxplot | 134 | n |
+-------------------------+-------+------------------------------------------------+
| df.clip | 116 | |
| df.clip | 116 | n |
+-------------------------+-------+------------------------------------------------+
| df.round | 116 | |
| df.round | 116 | n |
+-------------------------+-------+------------------------------------------------+
| df.abs | 101 | |
| df.abs | 101 | n |
+-------------------------+-------+------------------------------------------------+
| df.stack | 97 | |
| df.stack | 97 | n |
+-------------------------+-------+------------------------------------------------+
| df.tail | 94 | |
| df.tail | 94 | y |
+-------------------------+-------+------------------------------------------------+
| df.update | 92 | |
| df.update | 92 | n |
+-------------------------+-------+------------------------------------------------+
| df.iterrows | 90 | |
| df.iterrows | 90 | n |
+-------------------------+-------+------------------------------------------------+
| df.transpose | 87 | |
| df.transpose | 87 | n |
+-------------------------+-------+------------------------------------------------+
| df.any | 85 | |
| df.any | 85 | n |
+-------------------------+-------+------------------------------------------------+
| df.pipe | 80 | |
| df.pipe | 80 | n |
+-------------------------+-------+------------------------------------------------+
| pd.eval | 73 | |
| pd.eval | 73 | n |
+-------------------------+-------+------------------------------------------------+
| df.eval | 73 | |
| df.eval | 73 | n |
+-------------------------+-------+------------------------------------------------+
| pd.read_json | 72 | |
| pd.read_json | 72 | n |
+-------------------------+-------+------------------------------------------------+
| df.nunique | 70 | |
| df.nunique | 70 | y |
+-------------------------+-------+------------------------------------------------+
| df.pivot | 70 | |
| df.pivot | 70 | n |
+-------------------------+-------+------------------------------------------------+
| df.select | 68 | |
| df.select | 68 | n |
+-------------------------+-------+------------------------------------------------+
| df.as_matrix | 67 | |
| df.as_matrix | 67 | n |
+-------------------------+-------+------------------------------------------------+
| df.notnull | 66 | |
| df.notnull | 66 | n |
+-------------------------+-------+------------------------------------------------+
| df.cumsum | 66 | |
| df.cumsum | 66 | n |
+-------------------------+-------+------------------------------------------------+
| df.prod | 64 | |
| df.prod | 64 | n |
+-------------------------+-------+------------------------------------------------+
| df.unstack | 64 | |
| df.unstack | 64 | n |
+-------------------------+-------+------------------------------------------------+
| df.drop_duplicates | 63 | |
| df.drop_duplicates | 63 | n |
+-------------------------+-------+------------------------------------------------+
| df.div | 63 | |
| df.div | 63 | n |
+-------------------------+-------+------------------------------------------------+
| pd.crosstab | 59 | |
| pd.crosstab | 59 | n |
+-------------------------+-------+------------------------------------------------+
| df.select_dtypes | 57 | |
| df.select_dtypes | 57 | y |
+-------------------------+-------+------------------------------------------------+
| df.pow | 56 | |
| df.pow | 56 | n |
+-------------------------+-------+------------------------------------------------+
| df.sort_index | 56 | |
| df.sort_index | 56 | n |
+-------------------------+-------+------------------------------------------------+
| df.product | 52 | |
| df.product | 52 | n |
+-------------------------+-------+------------------------------------------------+
| df.isna | 51 | |
| df.isna | 51 | n |
+-------------------------+-------+------------------------------------------------+
| df.dot | 46 | |
| df.dot | 46 | n |
+-------------------------+-------+------------------------------------------------+
| pd.cut | 45 | |
| pd.cut | 45 | n |
+-------------------------+-------+------------------------------------------------+
| df.bool | 44 | |
| df.bool | 44 | n |
+-------------------------+-------+------------------------------------------------+
| df.to_dict | 44 | |
| df.to_dict | 44 | n |
+-------------------------+-------+------------------------------------------------+
| df.diff | 44 | |
| df.diff | 44 | n |
+-------------------------+-------+------------------------------------------------+
| df.insert | 44 | |
| df.insert | 44 | n |
+-------------------------+-------+------------------------------------------------+
| df.pop | 44 | |
| df.pop | 44 | n |
+-------------------------+-------+------------------------------------------------+
| df.query | 43 | |
| df.query | 43 | y |
+-------------------------+-------+------------------------------------------------+
| df.var | 43 | |
| df.var | 43 | n |
+-------------------------+-------+------------------------------------------------+
| df.__init__ | 41 | |
| df.__init__ | 41 | y |
+-------------------------+-------+------------------------------------------------+
| pd.to_numeric | 39 | |
| pd.to_numeric | 39 | n |
+-------------------------+-------+------------------------------------------------+
| df.squeeze | 39 | |
| df.squeeze | 39 | n |
+-------------------------+-------+------------------------------------------------+
| df.ge | 37 | |
| df.ge | 37 | n |
+-------------------------+-------+------------------------------------------------+
| df.quantile | 37 | |
| df.quantile | 37 | n |
+-------------------------+-------+------------------------------------------------+
| df.reindex | 37 | |
| df.reindex | 37 | n |
+-------------------------+-------+------------------------------------------------+
| df.rolling | 35 | |
| df.rolling | 35 | n |
+-------------------------+-------+------------------------------------------------+
| pd.factorize | 32 | |
| pd.factorize | 32 | n |
+-------------------------+-------+------------------------------------------------+
| pd.melt | 31 | |
| pd.melt | 31 | n |
+-------------------------+-------+------------------------------------------------+
| df.melt | 31 | |
| df.melt | 31 | n |
+-------------------------+-------+------------------------------------------------+
| df.rank | 31 | |
| df.rank | 31 | n |
+-------------------------+-------+------------------------------------------------+
| pd.read_table | 30 | |
| pd.read_table | 30 | n |
+-------------------------+-------+------------------------------------------------+
| pd.pivot_table | 30 | |
| pd.pivot_table | 30 | n |
+-------------------------+-------+------------------------------------------------+
| df.idxmax | 30 | |
| df.idxmax | 30 | n |
+-------------------------+-------+------------------------------------------------+
| pd.test | 29 | |
| pd.test | 29 | n |
+-------------------------+-------+------------------------------------------------+
| df.iteritems | 29 | |
| df.iteritems | 29 | n |
+-------------------------+-------+------------------------------------------------+
| df.shift | 28 | |
| df.shift | 28 | n |
+-------------------------+-------+------------------------------------------------+
| df.mul | 28 | |
| df.mul | 28 | n |
+-------------------------+-------+------------------------------------------------+
| pd.qcut | 25 | |
| pd.qcut | 25 | n |
+-------------------------+-------+------------------------------------------------+
| df.set_value | 25 | |
| df.set_value | 25 | n |
+-------------------------+-------+------------------------------------------------+
| df.all | 24 | |
| df.all | 24 | n |
+-------------------------+-------+------------------------------------------------+
| df.skew | 24 | |
| df.skew | 24 | n |
+-------------------------+-------+------------------------------------------------+
| df.aggregate | 23 | |
| df.aggregate | 23 | y |
+-------------------------+-------+------------------------------------------------+
| pd.match | 22 | |
| pd.match | 22 | n |
+-------------------------+-------+------------------------------------------------+
| df.nlargest | 22 | |
| df.nlargest | 22 | n |
+-------------------------+-------+------------------------------------------------+
| df.multiply | 21 | |
| df.multiply | 21 | n |
+-------------------------+-------+------------------------------------------------+
| df.set_axis | 19 | |
| df.set_axis | 19 | n |
+-------------------------+-------+------------------------------------------------+
| df.eq | 18 | |
| df.eq | 18 | n |
+-------------------------+-------+------------------------------------------------+
| df.resample | 18 | |
| df.resample | 18 | n |
+-------------------------+-------+------------------------------------------------+
| pd.read_sql | 17 | |
| pd.read_sql | 17 | n |
+-------------------------+-------+------------------------------------------------+
| df.duplicated | 16 | |
| df.duplicated | 16 | n |
+-------------------------+-------+------------------------------------------------+
| pd.date_range | 16 | |
| pd.date_range | 16 | n |
+-------------------------+-------+------------------------------------------------+
| df.interpolate | 15 | |
| df.interpolate | 15 | n |
+-------------------------+-------+------------------------------------------------+
| df.memory_usage | 15 | |
| df.memory_usage | 15 | n |
+-------------------------+-------+------------------------------------------------+
| df.divide | 14 | |
| df.divide | 14 | n |
+-------------------------+-------+------------------------------------------------+
| df.cov | 13 | |
| df.cov | 13 | n |
+-------------------------+-------+------------------------------------------------+
| df.assign | 12 | |
| df.assign | 12 | n |
+-------------------------+-------+------------------------------------------------+
| df.subtract | 12 | |
| df.subtract | 12 | n |
+-------------------------+-------+------------------------------------------------+
| pd.read_pickle | 11 | |
| pd.read_pickle | 11 | n |
+-------------------------+-------+------------------------------------------------+
| df.applymap | 11 | |
| df.applymap | 11 | n |
+-------------------------+-------+------------------------------------------------+
| df.first | 11 | |
| df.first | 11 | n |
+-------------------------+-------+------------------------------------------------+
| df.kurt | 10 | |
| df.kurt | 10 | n |
+-------------------------+-------+------------------------------------------------+
| df.truncate | 10 | |
| df.truncate | 10 | n |
+-------------------------+-------+------------------------------------------------+
| df.get_value | 9 | |
| df.get_value | 9 | n |
+-------------------------+-------+------------------------------------------------+
| pd.read_hdf | 9 | |
| pd.read_hdf | 9 | n |
+-------------------------+-------+------------------------------------------------+
| df.to_html | 9 | |
| df.to_html | 9 | y |
+-------------------------+-------+------------------------------------------------+
| pd.read_sql_query | 9 | |
| pd.read_sql_query | 9 | n |
+-------------------------+-------+------------------------------------------------+
| df.take | 8 | |
| df.take | 8 | n |
+-------------------------+-------+------------------------------------------------+
| df.to_pickle | 7 | |
| df.to_pickle | 7 | n |
+-------------------------+-------+------------------------------------------------+
| df.itertuples | 7 | |
| df.itertuples | 7 | n |
+-------------------------+-------+------------------------------------------------+
| df.to_string | 7 | |
| df.to_string | 7 | y |
+-------------------------+-------+------------------------------------------------+
| df.last | 7 | |
| df.last | 7 | n |
+-------------------------+-------+------------------------------------------------+
| df.sem | 7 | |
| df.sem | 7 | n |
+-------------------------+-------+------------------------------------------------+
| pd.to_pickle | 7 | |
| pd.to_pickle | 7 | n |
+-------------------------+-------+------------------------------------------------+
| df.to_json | 7 | |
| df.to_json | 7 | n |
+-------------------------+-------+------------------------------------------------+
| df.idxmin | 7 | |
| df.idxmin | 7 | n |
+-------------------------+-------+------------------------------------------------+
| df.xs | 6 | |
| df.xs | 6 | n |
+-------------------------+-------+------------------------------------------------+
| df.combine | 6 | |
| df.combine | 6 | n |
+-------------------------+-------+------------------------------------------------+
| pd.rolling_mean | 6 | |
| pd.rolling_mean | 6 | n |
+-------------------------+-------+------------------------------------------------+
| df.to_period | 6 | |
| df.to_period | 6 | n |
+-------------------------+-------+------------------------------------------------+
| df.convert_objects | 5 | |
| df.convert_objects | 5 | n |
+-------------------------+-------+------------------------------------------------+
| df.mask | 4 | |
| df.mask | 4 | n |
+-------------------------+-------+------------------------------------------------+
| df.pct_change | 4 | |
| df.pct_change | 4 | n |
+-------------------------+-------+------------------------------------------------+
| df.add_prefix | 4 | |
| df.add_prefix | 4 | n |
+-------------------------+-------+------------------------------------------------+
| pd.read_excel | 4 | |
| pd.read_excel | 4 | n |
+-------------------------+-------+------------------------------------------------+
| pd.rolling_std | 3 | |
| pd.rolling_std | 3 | n |
+-------------------------+-------+------------------------------------------------+
| df.to_records | 3 | |
| df.to_records | 3 | n |
+-------------------------+-------+------------------------------------------------+
| df.corrwith | 3 | |
| df.corrwith | 3 | n |
+-------------------------+-------+------------------------------------------------+
| df.swapaxes | 3 | |
| df.swapaxes | 3 | n |
+-------------------------+-------+------------------------------------------------+
| df.__iter__ | 3 | |
| df.__iter__ | 3 | n |
+-------------------------+-------+------------------------------------------------+
| df.to_sql | 3 | |
| df.to_sql | 3 | n |
+-------------------------+-------+------------------------------------------------+
| pd.read_feather | 3 | |
| pd.read_feather | 3 | n |
+-------------------------+-------+------------------------------------------------+
| df.to_feather | 3 | |
| df.to_feather | 3 | n |
+-------------------------+-------+------------------------------------------------+
| df.__len__ | 3 | |
| df.__len__ | 3 | n |
+-------------------------+-------+------------------------------------------------+
| df.kurtosis | 3 | |
| df.kurtosis | 3 | n |
+-------------------------+-------+------------------------------------------------+
| df.mod | 2 | |
| df.mod | 2 | n |
+-------------------------+-------+------------------------------------------------+
| df.to_sparse | 2 | |
| df.to_sparse | 2 | n |
+-------------------------+-------+------------------------------------------------+
| df.get_values | 2 | |
| df.get_values | 2 | n |
+-------------------------+-------+------------------------------------------------+
| df.__eq__ | 2 | |
| df.__eq__ | 2 | n |
+-------------------------+-------+------------------------------------------------+
| pd.bdate_range | 2 | |
| pd.bdate_range | 2 | n |
+-------------------------+-------+------------------------------------------------+
| df.get_dtype_counts | 2 | |
| df.get_dtype_counts | 2 | n |
+-------------------------+-------+------------------------------------------------+
| df.combine_first | 2 | |
| df.combine_first | 2 | n |
+-------------------------+-------+------------------------------------------------+
| df._get_numeric_data | 2 | |
| df._get_numeric_data | 2 | n |
+-------------------------+-------+------------------------------------------------+
| df.nsmallest | 2 | |
| df.nsmallest | 2 | n |
+-------------------------+-------+------------------------------------------------+
| pd.scatter_matrix | 2 | |
| pd.scatter_matrix | 2 | n |
+-------------------------+-------+------------------------------------------------+
| df.rename_axis | 2 | |
| df.rename_axis | 2 | n |
+-------------------------+-------+------------------------------------------------+
| df.__setstate__ | 2 | |
| df.__setstate__ | 2 | n |
+-------------------------+-------+------------------------------------------------+
| df.cumprod | 2 | |
| df.cumprod | 2 | n |
+-------------------------+-------+------------------------------------------------+
| df.__getstate__ | 2 | |
| df.__getstate__ | 2 | n |
+-------------------------+-------+------------------------------------------------+
| df.equals | 2 | |
| df.equals | 2 | n |
+-------------------------+-------+------------------------------------------------+
| df.__getitem__ | 2 | |
| df.__getitem__ | 2 | y |
+-------------------------+-------+------------------------------------------------+
| df.clip_upper | 2 | |
| df.clip_upper | 2 | n |
+-------------------------+-------+------------------------------------------------+
| df.floordiv | 2 | |
| df.floordiv | 2 | n |
+-------------------------+-------+------------------------------------------------+
| df.to_excel | 2 | |
| df.to_excel | 2 | n |
+-------------------------+-------+------------------------------------------------+
| df.reindex_axis | 1 | |
| df.reindex_axis | 1 | n |
+-------------------------+-------+------------------------------------------------+
| pd.to_timedelta | 1 | |
| pd.to_timedelta | 1 | n |
+-------------------------+-------+------------------------------------------------+
| df.ewm | 1 | |
| df.ewm | 1 | n |
+-------------------------+-------+------------------------------------------------+
| df.tz_localize | 1 | |
| df.tz_localize | 1 | n |
+-------------------------+-------+------------------------------------------------+
| df.tz_convert | 1 | |
| df.tz_convert | 1 | n |
+-------------------------+-------+------------------------------------------------+
| df.to_hdf | 1 | |
| df.to_hdf | 1 | n |
+-------------------------+-------+------------------------------------------------+
| df.lookup | 1 | |
| df.lookup | 1 | n |
+-------------------------+-------+------------------------------------------------+
| pd.merge_ordered | 1 | |
| pd.merge_ordered | 1 | n |
+-------------------------+-------+------------------------------------------------+
| df.swaplevel | 1 | |
| df.swaplevel | 1 | n |
+-------------------------+-------+------------------------------------------------+
| df.first_valid_index | 1 | |
| df.first_valid_index | 1 | n |
+-------------------------+-------+------------------------------------------------+
| df.lt | 1 | |
| df.lt | 1 | n |
+-------------------------+-------+------------------------------------------------+
| df.add_suffix | 1 | |
| df.add_suffix | 1 | n |
+-------------------------+-------+------------------------------------------------+
| pd.rolling_median | 1 | |
| pd.rolling_median | 1 | n |
+-------------------------+-------+------------------------------------------------+
| df.to_dense | 1 | |
| df.to_dense | 1 | n |
+-------------------------+-------+------------------------------------------------+
| df.mad | 1 | |
| df.mad | 1 | n |
+-------------------------+-------+------------------------------------------------+
| df.align | 1 | |
| df.align | 1 | n |
+-------------------------+-------+------------------------------------------------+
| df.__copy__ | 1 | |
| df.__copy__ | 1 | n |
+-------------------------+-------+------------------------------------------------+
| pd.set_eng_float_format | 1 | |
| pd.set_eng_float_format | 1 | n |
+-------------------------+-------+------------------------------------------------+
| df.add_suffix | 1 | |
| df.add_suffix | 1 | n |
+-------------------------+-------+------------------------------------------------+
| pd.rolling_median | 1 | |
| pd.rolling_median | 1 | n |
+-------------------------+-------+------------------------------------------------+
| df.to_dense | 1 | |
| df.to_dense | 1 | n |
+-------------------------+-------+------------------------------------------------+
| df.mad | 1 | |
| df.mad | 1 | n |
+-------------------------+-------+------------------------------------------------+
| df.align | 1 | |
| df.align | 1 | n |
+-------------------------+-------+------------------------------------------------+
| df.__copy__ | 1 | |
| df.__copy__ | 1 | n |
+-------------------------+-------+------------------------------------------------+
| pd.set_eng_float_format | 1 | |
| pd.set_eng_float_format | 1 | n |
+-------------------------+-------+------------------------------------------------+
+---------------------------+---------------------------------+----------------------------------------------------+
@ -447,7 +448,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+---------------------------+---------------------------------+----------------------------------------------------+
| ``add_suffix`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``agg`` | N | |
| ``agg`` | Y | |
| ``aggregate`` | | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``align`` | N | |
@ -512,7 +513,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+---------------------------+---------------------------------+----------------------------------------------------+
| ``corrwith`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``count`` | N | |
| ``count`` | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``cov`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
@ -524,29 +525,29 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+---------------------------+---------------------------------+----------------------------------------------------+
| ``cumsum`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``describe`` | N | |
| ``describe`` | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``diff`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``div`` | N | See ``add`` |
| ``div`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``divide`` | N | See ``add`` |
| ``divide`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``dot`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``drop`` | N | |
| ``drop`` | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``drop_duplicates`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``dropna`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``dtypes`` | N | |
| ``dtypes`` | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``duplicated`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``empty`` | N | |
| ``empty`` | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``eq`` | N | See ``add`` |
| ``eq`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``equals`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
@ -566,7 +567,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+---------------------------+---------------------------------+----------------------------------------------------+
| ``first_valid_index`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``floordiv`` | N | See ``add`` |
| ``floordiv`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``from_csv`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
@ -578,9 +579,9 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+---------------------------+---------------------------------+----------------------------------------------------+
| ``ftypes`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``ge`` | N | See ``add`` |
| ``ge`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``get`` | N | |
| ``get`` | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``get_dtype_counts`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
@ -592,11 +593,11 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+---------------------------+---------------------------------+----------------------------------------------------+
| ``groupby`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``gt`` | N | See ``add`` |
| ``gt`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``head`` | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``hist`` | N | |
| ``hist`` | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``iat`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
@ -608,7 +609,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+---------------------------+---------------------------------+----------------------------------------------------+
| ``infer_objects`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``info`` | N | |
| ``info`` | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``insert`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
@ -634,7 +635,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+---------------------------+---------------------------------+----------------------------------------------------+
| ``join`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``keys`` | N | |
| ``keys`` | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``kurt`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
@ -644,21 +645,21 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+---------------------------+---------------------------------+----------------------------------------------------+
| ``last_valid_index`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``le`` | N | See ``add`` |
| ``le`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``loc`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``lookup`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``lt`` | N | See ``add`` |
| ``lt`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``mad`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``mask`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``max`` | N | |
| ``max`` | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``mean`` | N | |
| ``mean`` | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``median`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
@ -668,19 +669,19 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+---------------------------+---------------------------------+----------------------------------------------------+
| ``merge`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``min`` | N | |
| ``min`` | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``mod`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``mode`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``mul`` | N | See ``add`` |
| ``mul`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``multiply`` | N | See ``add`` |
| ``multiply`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``ndim`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``ne`` | N | See ``add`` |
| ``ne`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``nlargest`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
@ -690,7 +691,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+---------------------------+---------------------------------+----------------------------------------------------+
| ``nsmallest`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``nunique`` | N | |
| ``nunique`` | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``pct_change`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
@ -704,7 +705,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+---------------------------+---------------------------------+----------------------------------------------------+
| ``pop`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``pow`` | N | See ``add`` |
| ``pow`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``prod`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
@ -712,13 +713,13 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+---------------------------+---------------------------------+----------------------------------------------------+
| ``quantile`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``query`` | N | |
| ``query`` | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``radd`` | N | See ``add`` |
| ``radd`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``rank`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``rdiv`` | N | See ``add`` |
| ``rdiv`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``reindex`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
@ -738,27 +739,27 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+---------------------------+---------------------------------+----------------------------------------------------+
| ``reset_index`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``rfloordiv`` | N | See ``add`` |
| ``rfloordiv`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``rmod`` | N | See ``add`` |
| ``rmod`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``rmul`` | N | See ``add`` |
| ``rmul`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``rolling`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``round`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``rpow`` | N | See ``add`` |
| ``rpow`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``rsub`` | N | See ``add`` |
| ``rsub`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``rtruediv`` | N | See ``add`` |
| ``rtruediv`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``sample`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``select`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``select_dtypes`` | N | |
| ``select_dtypes`` | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``sem`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
@ -768,7 +769,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+---------------------------+---------------------------------+----------------------------------------------------+
| ``set_value`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``shape`` | N | |
| ``shape`` | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``shift`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
@ -792,11 +793,11 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+---------------------------+---------------------------------+----------------------------------------------------+
| ``style`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``sub`` | N | See ``add`` |
| ``sub`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``subtract`` | N | See ``add`` |
| ``subtract`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``sum`` | N | |
| ``sum`` | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``swapaxes`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
@ -808,7 +809,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+---------------------------+---------------------------------+----------------------------------------------------+
| ``to_clipboard`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``to_csv`` | N | |
| ``to_csv`` | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``to_dense`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
@ -822,7 +823,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+---------------------------+---------------------------------+----------------------------------------------------+
| ``to_hdf`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``to_html`` | N | |
| ``to_html`` | Y | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``to_json`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
@ -856,7 +857,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep
+---------------------------+---------------------------------+----------------------------------------------------+
| ``transpose`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``truediv`` | N | See ``add`` |
| ``truediv`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+
| ``truncate`` | N | |
+---------------------------+---------------------------------+----------------------------------------------------+

View File

@ -0,0 +1,61 @@
.. _implementation/details:
======================
Implementation Details
======================
The goal of an ``eland.DataFrame`` is to enable users who are familiar with ``pandas.DataFrame``
to access, explore and manipulate data that resides in Elasticsearch.
Ideally, all data should reside in Elasticsearch and not in memory.
This restricts the API, but allows access to huge data sets that do not fit into memory, and allows
use of powerful Elasticsearch features such as aggregations.
Pandas and 3rd Party Storage Systems
------------------------------------
Generally, integrations with `3rd party storage systems <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_
(SQL, Google Big Query etc.) involve accessing these systems and reading all external data into an
in-core pandas data structure. This also applies to `Apache Arrow <https://arrow.apache.org/docs/python/pandas.html>`_
structures.
Whilst this provides access to data in these systems, for large datasets this can require significant
in-core memory, and for systems such as Elasticsearch, bulk export of data can be an inefficient way
of exploring the data.
An alternative option is to create an API that proxies ``pandas.DataFrame``-like calls to Elasticsearch
queries and operations. This could allow the Elasticsearch cluster to perform operations such as
aggregations rather than exporting all the data and performing this operation in-core.
Implementation Options
----------------------
An option would be to replace the ``pandas.DataFrame`` backend in-core memory structures with Elasticsearch
accessors. This would allow full access to the ``pandas.DataFrame`` APIs. However, this has issues:
* If a ``pandas.DataFrame`` instance maps to an index, typical manipulation of a ``pandas.DataFrame``
may involve creating many derived ``pandas.DataFrame`` instances. Constructing an index per
``pandas.DataFrame`` may result in many Elasticsearch indexes and a significant load on Elasticsearch.
For example, ``df_a = df['a']`` should not require Elasticsearch indices ``df`` and ``df_a``
* Not all ``pandas.DataFrame`` APIs map to things we may want to do in Elasticsearch. In particular,
API calls that involve exporting all data from Elasticsearch into memory e.g. ``df.to_dict()``.
* The backend ``pandas.DataFrame`` structures are not easily abstractable and are deeply embedded in
the implementation.
Another option is to create an ``eland.DataFrame`` API that mimics appropriate aspects of
the ``pandas.DataFrame`` API. This resolves some of the issues above as:
* ``df_a = df['a']`` could be implemented as a change to the Elasticsearch query used, rather
than a new index
* Instead of supporting the entire ``pandas.DataFrame`` API we can support a subset appropriate for
Elasticsearch. If additional calls are required, we could create an ``eland.DataFrame._to_pandas()``
method which would explicitly export all data to a ``pandas.DataFrame``
* Creating a new ``eland.DataFrame`` API gives us full flexibility in terms of implementation. However,
it does create a large amount of work which may duplicate a lot of the ``pandas`` code - for example,
printing objects etc. - this creates maintenance issues etc.

View File

@ -0,0 +1,11 @@
.. _implementation:
====================
Implementation Notes
====================
.. toctree::
:maxdepth: 2
details.rst
dataframe_supported.rst

View File

@ -23,10 +23,17 @@ In general, the data resides in elasticsearch and not in memory, which allows el
:hidden:
reference/index
implementation/index
* :doc:`reference/index`
* :doc:`reference/io`
* :doc:`reference/general_utility_functions`
* :doc:`reference/dataframe`
* :doc:`reference/index`
* :doc:`reference/indexing`
* :doc:`implementation/index`
* :doc:`implementation/details`
* :doc:`implementation/dataframe_supported`

View File

@ -561,7 +561,7 @@ class DataFrame(NDFrame):
See Also
--------
:pandas_api_docs:`to_html` for argument details.
:pandas_api_docs:`pandas.DataFrame.to_html` for argument details.
"""
# In pandas calling 'to_string' without max_rows set, will dump ALL rows - we avoid this
# by limiting rows by default.
@ -621,7 +621,7 @@ class DataFrame(NDFrame):
See Also
--------
:pandas_api_docs:`to_string` for argument details.
:pandas_api_docs:`pandas.DataFrame.to_string` for argument details.
"""
# In pandas calling 'to_string' without max_rows set, will dump ALL rows - we avoid this
# by limiting rows by default.
@ -787,7 +787,7 @@ class DataFrame(NDFrame):
See Also
--------
:pandas_api_docs:`to_csv` for argument details.
:pandas_api_docs:`pandas.DataFrame.to_csv` for argument details.
"""
kwargs = {
"path_or_buf": path_or_buf,

View File

@ -452,28 +452,23 @@ class Mappings:
numeric_source_fields: list of str
List of source fields where pd_dtype == (int64 or float64 or bool)
"""
if columns is not None:
if include_bool == True:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64') |
(self._mappings_capabilities.pd_dtype == 'bool'))].reindex(
columns).index.tolist()
else:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64'))].reindex(
columns).index.tolist()
if include_bool == True:
df = self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64') |
(self._mappings_capabilities.pd_dtype == 'bool'))]
else:
if include_bool == True:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64') |
(self._mappings_capabilities.pd_dtype == 'bool'))].index.tolist()
else:
return self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64'))].index.tolist()
df = self._mappings_capabilities[(self._mappings_capabilities._source == True) &
((self._mappings_capabilities.pd_dtype == 'int64') |
(self._mappings_capabilities.pd_dtype == 'float64'))]
# if columns exists, filter index with columns
if columns is not None:
# reindex adds NA for non-existing columns (non-numeric), so drop these after reindex
df = df.reindex(columns)
df.dropna(inplace=True)
# return as list
return df.index.to_list()
def source_fields(self):
"""

View File

@ -287,7 +287,7 @@ class NDFrame:
"""
Return mean value for each numeric column
TODO - implement remainder of pandas arguments
TODO - implement remainder of pandas arguments, currently non-numerics are not supported
Returns
-------
@ -321,7 +321,7 @@ class NDFrame:
"""
Return sum for each numeric column
TODO - implement remainder of pandas arguments
TODO - implement remainder of pandas arguments, currently non-numerics are not supported
Returns
-------
@ -355,7 +355,7 @@ class NDFrame:
"""
Return the minimum value for each numeric column
TODO - implement remainder of pandas arguments
TODO - implement remainder of pandas arguments, currently non-numerics are not supported
Returns
-------
@ -389,7 +389,7 @@ class NDFrame:
"""
Return the maximum value for each numeric column
TODO - implement remainder of pandas arguments
TODO - implement remainder of pandas arguments, currently non-numerics are not supported
Returns
-------
@ -488,16 +488,16 @@ class NDFrame:
Examples
--------
>>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelay'])
>>> df = ed.DataFrame('localhost', 'flights', columns=['AvgTicketPrice', 'FlightDelayMin'])
>>> df.describe() # ignoring percentiles as they don't generate consistent results
AvgTicketPrice FlightDelay
count 13059.000000 13059.000000
mean 628.253689 0.251168
std 266.386661 0.433685
min 100.020531 0.000000
AvgTicketPrice FlightDelayMin
count 13059.000000 13059.000000
mean 628.253689 47.335171
std 266.386661 96.743006
min 100.020531 0.000000
...
...
...
max 1199.729004 1.000000
max 1199.729004 360.000000
"""
return self._query_compiler.describe()

View File

@ -4,41 +4,54 @@ from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
import eland as ed
class TestDataFrameMetrics(TestData):
def test_mean(self):
funcs = ['max', 'min', 'mean', 'sum']
def test_flights_metrics(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
pd_mean = pd_flights.mean(numeric_only=True)
ed_mean = ed_flights.mean(numeric_only=True)
for func in self.funcs:
pd_metric = getattr(pd_flights, func)(numeric_only=True)
ed_metric = getattr(ed_flights, func)(numeric_only=True)
assert_series_equal(pd_mean, ed_mean)
assert_series_equal(pd_metric, ed_metric)
def test_sum(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
def test_ecommerce_selected_non_numeric_source_fields(self):
# None of these are numeric
columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'user']
pd_sum = pd_flights.sum(numeric_only=True)
ed_sum = ed_flights.sum(numeric_only=True)
pd_ecommerce = self.pd_ecommerce()[columns]
ed_ecommerce = self.ed_ecommerce()[columns]
assert_series_equal(pd_sum, ed_sum)
for func in self.funcs:
assert_series_equal(getattr(pd_ecommerce, func)(numeric_only=True), getattr(ed_ecommerce, func)(numeric_only=True),
check_less_precise=True)
def test_min(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
def test_ecommerce_selected_mixed_numeric_source_fields(self):
# Some of these are numeric
columns = ['category', 'currency', 'taxless_total_price', 'customer_birth_date',
'total_quantity', 'customer_first_name', 'user']
pd_min = pd_flights.min(numeric_only=True)
ed_min = ed_flights.min(numeric_only=True)
pd_ecommerce = self.pd_ecommerce()[columns]
ed_ecommerce = self.ed_ecommerce()[columns]
assert_series_equal(pd_min, ed_min)
for func in self.funcs:
assert_series_equal(getattr(pd_ecommerce, func)(numeric_only=True), getattr(ed_ecommerce, func)(numeric_only=True),
check_less_precise=True)
def test_max(self):
pd_flights = self.pd_flights()
ed_flights = self.ed_flights()
pd_max = pd_flights.max(numeric_only=True)
ed_max = ed_flights.max(numeric_only=True)
def test_ecommerce_selected_all_numeric_source_fields(self):
# All of these are numeric
columns = ['total_quantity', 'taxful_total_price', 'taxless_total_price']
assert_series_equal(pd_max, ed_max)
pd_ecommerce = self.pd_ecommerce()[columns]
ed_ecommerce = self.ed_ecommerce()[columns]
for func in self.funcs:
assert_series_equal(getattr(pd_ecommerce, func)(numeric_only=True), getattr(ed_ecommerce, func)(numeric_only=True),
check_less_precise=True)

View File

@ -0,0 +1,77 @@
# File called _pytest for PyCharm compatibility
import numpy as np
from pandas.util.testing import assert_series_equal
from eland.tests.common import TestData
class TestMappingsNumericSourceFields(TestData):
def test_flights_numeric_source_fields(self):
ed_flights = self.ed_flights()
pd_flights = self.pd_flights()
ed_numeric = ed_flights._query_compiler._mappings.numeric_source_fields(columns=None, include_bool=False)
pd_numeric = pd_flights.select_dtypes(include=np.number)
assert pd_numeric.columns.to_list() == ed_numeric
def test_ecommerce_selected_non_numeric_source_fields(self):
columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'user']
"""
Note: none of these are numeric
category object
currency object
customer_birth_date datetime64[ns]
customer_first_name object
user object
"""
ed_ecommerce = self.ed_ecommerce()[columns]
pd_ecommerce = self.pd_ecommerce()[columns]
ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(columns=columns, include_bool=False)
pd_numeric = pd_ecommerce.select_dtypes(include=np.number)
assert pd_numeric.columns.to_list() == ed_numeric
def test_ecommerce_selected_mixed_numeric_source_fields(self):
columns = ['category', 'currency', 'customer_birth_date', 'customer_first_name', 'total_quantity', 'user']
"""
Note: one is numeric
category object
currency object
customer_birth_date datetime64[ns]
customer_first_name object
total_quantity int64
user object
"""
ed_ecommerce = self.ed_ecommerce()[columns]
pd_ecommerce = self.pd_ecommerce()[columns]
ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(columns=columns, include_bool=False)
pd_numeric = pd_ecommerce.select_dtypes(include=np.number)
assert pd_numeric.columns.to_list() == ed_numeric
def test_ecommerce_selected_all_numeric_source_fields(self):
columns = ['total_quantity', 'taxful_total_price', 'taxless_total_price']
"""
Note: all are numeric
total_quantity int64
taxful_total_price float64
taxless_total_price float64
"""
ed_ecommerce = self.ed_ecommerce()[columns]
pd_ecommerce = self.pd_ecommerce()[columns]
ed_numeric = ed_ecommerce._query_compiler._mappings.numeric_source_fields(columns=columns, include_bool=False)
pd_numeric = pd_ecommerce.select_dtypes(include=np.number)
assert pd_numeric.columns.to_list() == ed_numeric

View File

@ -8,7 +8,7 @@ from eland import Client
from eland import DataFrame
from eland import Mappings
_default_chunk_size = 10000
DEFAULT_CHUNK_SIZE = 10000
def read_es(es_params, index_pattern):
@ -80,7 +80,7 @@ def pandas_to_eland(pd_df, es_params, destination_index, if_exists='fail', chunk
eland.eland_to_pandas: Create a pandas.Dataframe from eland.DataFrame
"""
if chunksize is None:
chunksize = _default_chunk_size
chunksize = DEFAULT_CHUNK_SIZE
client = Client(es_params)
@ -99,7 +99,7 @@ def pandas_to_eland(pd_df, es_params, destination_index, if_exists='fail', chunk
client.index_delete(index=destination_index)
client.index_create(index=destination_index, body=mapping)
# elif if_exists == "append":
# TODO validate mapping is compatible
# TODO validate mappings are compatible
else:
client.index_create(index=destination_index, body=mapping)
@ -226,7 +226,7 @@ def read_csv(filepath_or_buffer,
**Modifies an Elasticsearch index**
**Note iteration not supported**
**Note pandas iteration options not supported**
Parameters
----------
@ -248,17 +248,17 @@ def read_csv(filepath_or_buffer,
es_geo_points: list, default None
List of columns to map to geo_point data type
iterator
ignored
not supported
chunksize
number of csv rows to read before bulk index into Elasticsearch
Other Parameters
----------------
Parameters derived from :pandas_api_docs:`read_csv`.
Parameters derived from :pandas_api_docs:`pandas.read_csv`.
See Also
--------
:pandas_api_docs:`read_csv` - for all parameters
:pandas_api_docs:`pandas.read_csv` - for all parameters
Notes
-----
@ -318,7 +318,7 @@ def read_csv(filepath_or_buffer,
)
if chunksize is None:
kwds.update(chunksize=_default_chunk_size)
kwds.update(chunksize=DEFAULT_CHUNK_SIZE)
client = Client(es_client)