diff --git a/eland/docs/dataframe_supported.rst b/docs/source/implementation/dataframe_supported.rst similarity index 82% rename from eland/docs/dataframe_supported.rst rename to docs/source/implementation/dataframe_supported.rst index a9353ef..0589632 100644 --- a/eland/docs/dataframe_supported.rst +++ b/docs/source/implementation/dataframe_supported.rst @@ -1,3 +1,6 @@ +.. _implementation/dataframe_supported: + +=============================== pandas.DataFrame supported APIs =============================== @@ -8,20 +11,18 @@ also welcome! The following table is structured as follows: The first column contains the method name. The second column is a flag for whether or not there is an implementation in Modin for -the method in the left column. ``Y`` stands for yes, ``N`` stands for no, ``P`` stands -for partial (meaning some parameters may not be supported yet), and ``D`` stands for -default to pandas. +the method in the left column. ``Y`` (or ``y``) stands for yes, ``N`` (or ``n``) stands for no. https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv represents a prioritised list. 
+-------------------------+-------+------------------------------------------------+ | Method | Count | Notes | +-------------------------+-------+------------------------------------------------+ -| pd.read_csv | 1422 | Not implemented ed.read_es implemented instead | +| pd.read_csv | 1422 | y | +-------------------------+-------+------------------------------------------------+ | pd.DataFrame | 886 | y | +-------------------------+-------+------------------------------------------------+ -| df.append | 792 | Not implemented | +| df.append | 792 | n | +-------------------------+-------+------------------------------------------------+ | df.mean | 783 | y | +-------------------------+-------+------------------------------------------------+ @@ -31,407 +32,407 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep +-------------------------+-------+------------------------------------------------+ | df.sum | 755 | y | +-------------------------+-------+------------------------------------------------+ -| df.to_csv | 693 | | +| df.to_csv | 693 | y | +-------------------------+-------+------------------------------------------------+ -| df.get | 669 | | +| df.get | 669 | y | +-------------------------+-------+------------------------------------------------+ -| df.mode | 653 | | +| df.mode | 653 | n | +-------------------------+-------+------------------------------------------------+ -| df.astype | 649 | | +| df.astype | 649 | n | +-------------------------+-------+------------------------------------------------+ -| df.sub | 637 | | +| df.sub | 637 | n | +-------------------------+-------+------------------------------------------------+ -| pd.concat | 582 | | +| pd.concat | 582 | n | +-------------------------+-------+------------------------------------------------+ -| df.apply | 577 | | +| df.apply | 577 | n | +-------------------------+-------+------------------------------------------------+ -| df.groupby | 557 | | +| df.groupby | 
557 | n | +-------------------------+-------+------------------------------------------------+ -| df.join | 544 | | +| df.join | 544 | n | +-------------------------+-------+------------------------------------------------+ -| df.fillna | 543 | | +| df.fillna | 543 | n | +-------------------------+-------+------------------------------------------------+ -| df.max | 508 | | +| df.max | 508 | y | +-------------------------+-------+------------------------------------------------+ -| df.reset_index | 434 | | +| df.reset_index | 434 | n | +-------------------------+-------+------------------------------------------------+ -| pd.unique | 433 | | +| pd.unique | 433 | n | +-------------------------+-------+------------------------------------------------+ -| df.le | 405 | | +| df.le | 405 | n | +-------------------------+-------+------------------------------------------------+ -| df.count | 399 | | +| df.count | 399 | y | +-------------------------+-------+------------------------------------------------+ -| pd.value_counts | 397 | | +| pd.value_counts | 397 | n | +-------------------------+-------+------------------------------------------------+ -| df.sort_values | 390 | | +| df.sort_values | 390 | n | +-------------------------+-------+------------------------------------------------+ -| df.transform | 387 | | +| df.transform | 387 | n | +-------------------------+-------+------------------------------------------------+ -| df.merge | 376 | | +| df.merge | 376 | n | +-------------------------+-------+------------------------------------------------+ -| df.add | 346 | | +| df.add | 346 | n | +-------------------------+-------+------------------------------------------------+ -| df.isnull | 338 | | +| df.isnull | 338 | n | +-------------------------+-------+------------------------------------------------+ -| df.min | 321 | | +| df.min | 321 | y | +-------------------------+-------+------------------------------------------------+ -| df.copy | 314 | | +| df.copy | 314 
| n | +-------------------------+-------+------------------------------------------------+ -| df.replace | 300 | | +| df.replace | 300 | n | +-------------------------+-------+------------------------------------------------+ -| df.std | 261 | | +| df.std | 261 | n | +-------------------------+-------+------------------------------------------------+ -| df.hist | 246 | | +| df.hist | 246 | y | +-------------------------+-------+------------------------------------------------+ -| df.filter | 234 | | +| df.filter | 234 | n | +-------------------------+-------+------------------------------------------------+ -| df.describe | 220 | | +| df.describe | 220 | y | +-------------------------+-------+------------------------------------------------+ -| df.ne | 218 | | +| df.ne | 218 | n | +-------------------------+-------+------------------------------------------------+ -| df.corr | 217 | | +| df.corr | 217 | n | +-------------------------+-------+------------------------------------------------+ -| df.median | 217 | | +| df.median | 217 | n | +-------------------------+-------+------------------------------------------------+ -| df.items | 212 | | +| df.items | 212 | n | +-------------------------+-------+------------------------------------------------+ -| pd.to_datetime | 204 | | +| pd.to_datetime | 204 | n | +-------------------------+-------+------------------------------------------------+ -| df.isin | 203 | | +| df.isin | 203 | n | +-------------------------+-------+------------------------------------------------+ -| df.dropna | 195 | | +| df.dropna | 195 | n | +-------------------------+-------+------------------------------------------------+ -| pd.get_dummies | 190 | | +| pd.get_dummies | 190 | n | +-------------------------+-------+------------------------------------------------+ -| df.rename | 185 | | +| df.rename | 185 | n | +-------------------------+-------+------------------------------------------------+ -| df.info | 180 | | +| df.info | 180 | y | 
+-------------------------+-------+------------------------------------------------+ -| df.set_index | 166 | | +| df.set_index | 166 | n | +-------------------------+-------+------------------------------------------------+ -| df.keys | 159 | | +| df.keys | 159 | y | +-------------------------+-------+------------------------------------------------+ -| df.sample | 155 | | +| df.sample | 155 | n | +-------------------------+-------+------------------------------------------------+ -| df.agg | 140 | | +| df.agg | 140 | y | +-------------------------+-------+------------------------------------------------+ -| df.where | 138 | | +| df.where | 138 | n | +-------------------------+-------+------------------------------------------------+ -| df.boxplot | 134 | | +| df.boxplot | 134 | n | +-------------------------+-------+------------------------------------------------+ -| df.clip | 116 | | +| df.clip | 116 | n | +-------------------------+-------+------------------------------------------------+ -| df.round | 116 | | +| df.round | 116 | n | +-------------------------+-------+------------------------------------------------+ -| df.abs | 101 | | +| df.abs | 101 | n | +-------------------------+-------+------------------------------------------------+ -| df.stack | 97 | | +| df.stack | 97 | n | +-------------------------+-------+------------------------------------------------+ -| df.tail | 94 | | +| df.tail | 94 | y | +-------------------------+-------+------------------------------------------------+ -| df.update | 92 | | +| df.update | 92 | n | +-------------------------+-------+------------------------------------------------+ -| df.iterrows | 90 | | +| df.iterrows | 90 | n | +-------------------------+-------+------------------------------------------------+ -| df.transpose | 87 | | +| df.transpose | 87 | n | +-------------------------+-------+------------------------------------------------+ -| df.any | 85 | | +| df.any | 85 | n | 
+-------------------------+-------+------------------------------------------------+ -| df.pipe | 80 | | +| df.pipe | 80 | n | +-------------------------+-------+------------------------------------------------+ -| pd.eval | 73 | | +| pd.eval | 73 | n | +-------------------------+-------+------------------------------------------------+ -| df.eval | 73 | | +| df.eval | 73 | n | +-------------------------+-------+------------------------------------------------+ -| pd.read_json | 72 | | +| pd.read_json | 72 | n | +-------------------------+-------+------------------------------------------------+ -| df.nunique | 70 | | +| df.nunique | 70 | y | +-------------------------+-------+------------------------------------------------+ -| df.pivot | 70 | | +| df.pivot | 70 | n | +-------------------------+-------+------------------------------------------------+ -| df.select | 68 | | +| df.select | 68 | n | +-------------------------+-------+------------------------------------------------+ -| df.as_matrix | 67 | | +| df.as_matrix | 67 | n | +-------------------------+-------+------------------------------------------------+ -| df.notnull | 66 | | +| df.notnull | 66 | n | +-------------------------+-------+------------------------------------------------+ -| df.cumsum | 66 | | +| df.cumsum | 66 | n | +-------------------------+-------+------------------------------------------------+ -| df.prod | 64 | | +| df.prod | 64 | n | +-------------------------+-------+------------------------------------------------+ -| df.unstack | 64 | | +| df.unstack | 64 | n | +-------------------------+-------+------------------------------------------------+ -| df.drop_duplicates | 63 | | +| df.drop_duplicates | 63 | n | +-------------------------+-------+------------------------------------------------+ -| df.div | 63 | | +| df.div | 63 | n | +-------------------------+-------+------------------------------------------------+ -| pd.crosstab | 59 | | +| pd.crosstab | 59 | n | 
+-------------------------+-------+------------------------------------------------+ -| df.select_dtypes | 57 | | +| df.select_dtypes | 57 | y | +-------------------------+-------+------------------------------------------------+ -| df.pow | 56 | | +| df.pow | 56 | n | +-------------------------+-------+------------------------------------------------+ -| df.sort_index | 56 | | +| df.sort_index | 56 | n | +-------------------------+-------+------------------------------------------------+ -| df.product | 52 | | +| df.product | 52 | n | +-------------------------+-------+------------------------------------------------+ -| df.isna | 51 | | +| df.isna | 51 | n | +-------------------------+-------+------------------------------------------------+ -| df.dot | 46 | | +| df.dot | 46 | n | +-------------------------+-------+------------------------------------------------+ -| pd.cut | 45 | | +| pd.cut | 45 | n | +-------------------------+-------+------------------------------------------------+ -| df.bool | 44 | | +| df.bool | 44 | n | +-------------------------+-------+------------------------------------------------+ -| df.to_dict | 44 | | +| df.to_dict | 44 | n | +-------------------------+-------+------------------------------------------------+ -| df.diff | 44 | | +| df.diff | 44 | n | +-------------------------+-------+------------------------------------------------+ -| df.insert | 44 | | +| df.insert | 44 | n | +-------------------------+-------+------------------------------------------------+ -| df.pop | 44 | | +| df.pop | 44 | n | +-------------------------+-------+------------------------------------------------+ -| df.query | 43 | | +| df.query | 43 | y | +-------------------------+-------+------------------------------------------------+ -| df.var | 43 | | +| df.var | 43 | n | +-------------------------+-------+------------------------------------------------+ -| df.__init__ | 41 | | +| df.__init__ | 41 | y | 
+-------------------------+-------+------------------------------------------------+ -| pd.to_numeric | 39 | | +| pd.to_numeric | 39 | n | +-------------------------+-------+------------------------------------------------+ -| df.squeeze | 39 | | +| df.squeeze | 39 | n | +-------------------------+-------+------------------------------------------------+ -| df.ge | 37 | | +| df.ge | 37 | n | +-------------------------+-------+------------------------------------------------+ -| df.quantile | 37 | | +| df.quantile | 37 | n | +-------------------------+-------+------------------------------------------------+ -| df.reindex | 37 | | +| df.reindex | 37 | n | +-------------------------+-------+------------------------------------------------+ -| df.rolling | 35 | | +| df.rolling | 35 | n | +-------------------------+-------+------------------------------------------------+ -| pd.factorize | 32 | | +| pd.factorize | 32 | n | +-------------------------+-------+------------------------------------------------+ -| pd.melt | 31 | | +| pd.melt | 31 | n | +-------------------------+-------+------------------------------------------------+ -| df.melt | 31 | | +| df.melt | 31 | n | +-------------------------+-------+------------------------------------------------+ -| df.rank | 31 | | +| df.rank | 31 | n | +-------------------------+-------+------------------------------------------------+ -| pd.read_table | 30 | | +| pd.read_table | 30 | n | +-------------------------+-------+------------------------------------------------+ -| pd.pivot_table | 30 | | +| pd.pivot_table | 30 | n | +-------------------------+-------+------------------------------------------------+ -| df.idxmax | 30 | | +| df.idxmax | 30 | n | +-------------------------+-------+------------------------------------------------+ -| pd.test | 29 | | +| pd.test | 29 | n | +-------------------------+-------+------------------------------------------------+ -| df.iteritems | 29 | | +| df.iteritems | 29 | n | 
+-------------------------+-------+------------------------------------------------+ -| df.shift | 28 | | +| df.shift | 28 | n | +-------------------------+-------+------------------------------------------------+ -| df.mul | 28 | | +| df.mul | 28 | n | +-------------------------+-------+------------------------------------------------+ -| pd.qcut | 25 | | +| pd.qcut | 25 | n | +-------------------------+-------+------------------------------------------------+ -| df.set_value | 25 | | +| df.set_value | 25 | n | +-------------------------+-------+------------------------------------------------+ -| df.all | 24 | | +| df.all | 24 | n | +-------------------------+-------+------------------------------------------------+ -| df.skew | 24 | | +| df.skew | 24 | n | +-------------------------+-------+------------------------------------------------+ -| df.aggregate | 23 | | +| df.aggregate | 23 | y | +-------------------------+-------+------------------------------------------------+ -| pd.match | 22 | | +| pd.match | 22 | n | +-------------------------+-------+------------------------------------------------+ -| df.nlargest | 22 | | +| df.nlargest | 22 | n | +-------------------------+-------+------------------------------------------------+ -| df.multiply | 21 | | +| df.multiply | 21 | n | +-------------------------+-------+------------------------------------------------+ -| df.set_axis | 19 | | +| df.set_axis | 19 | n | +-------------------------+-------+------------------------------------------------+ -| df.eq | 18 | | +| df.eq | 18 | n | +-------------------------+-------+------------------------------------------------+ -| df.resample | 18 | | +| df.resample | 18 | n | +-------------------------+-------+------------------------------------------------+ -| pd.read_sql | 17 | | +| pd.read_sql | 17 | n | +-------------------------+-------+------------------------------------------------+ -| df.duplicated | 16 | | +| df.duplicated | 16 | n | 
+-------------------------+-------+------------------------------------------------+ -| pd.date_range | 16 | | +| pd.date_range | 16 | n | +-------------------------+-------+------------------------------------------------+ -| df.interpolate | 15 | | +| df.interpolate | 15 | n | +-------------------------+-------+------------------------------------------------+ -| df.memory_usage | 15 | | +| df.memory_usage | 15 | n | +-------------------------+-------+------------------------------------------------+ -| df.divide | 14 | | +| df.divide | 14 | n | +-------------------------+-------+------------------------------------------------+ -| df.cov | 13 | | +| df.cov | 13 | n | +-------------------------+-------+------------------------------------------------+ -| df.assign | 12 | | +| df.assign | 12 | n | +-------------------------+-------+------------------------------------------------+ -| df.subtract | 12 | | +| df.subtract | 12 | n | +-------------------------+-------+------------------------------------------------+ -| pd.read_pickle | 11 | | +| pd.read_pickle | 11 | n | +-------------------------+-------+------------------------------------------------+ -| df.applymap | 11 | | +| df.applymap | 11 | n | +-------------------------+-------+------------------------------------------------+ -| df.first | 11 | | +| df.first | 11 | n | +-------------------------+-------+------------------------------------------------+ -| df.kurt | 10 | | +| df.kurt | 10 | n | +-------------------------+-------+------------------------------------------------+ -| df.truncate | 10 | | +| df.truncate | 10 | n | +-------------------------+-------+------------------------------------------------+ -| df.get_value | 9 | | +| df.get_value | 9 | n | +-------------------------+-------+------------------------------------------------+ -| pd.read_hdf | 9 | | +| pd.read_hdf | 9 | n | +-------------------------+-------+------------------------------------------------+ -| df.to_html | 9 | | +| 
df.to_html | 9 | y | +-------------------------+-------+------------------------------------------------+ -| pd.read_sql_query | 9 | | +| pd.read_sql_query | 9 | n | +-------------------------+-------+------------------------------------------------+ -| df.take | 8 | | +| df.take | 8 | n | +-------------------------+-------+------------------------------------------------+ -| df.to_pickle | 7 | | +| df.to_pickle | 7 | n | +-------------------------+-------+------------------------------------------------+ -| df.itertuples | 7 | | +| df.itertuples | 7 | n | +-------------------------+-------+------------------------------------------------+ -| df.to_string | 7 | | +| df.to_string | 7 | y | +-------------------------+-------+------------------------------------------------+ -| df.last | 7 | | +| df.last | 7 | n | +-------------------------+-------+------------------------------------------------+ -| df.sem | 7 | | +| df.sem | 7 | n | +-------------------------+-------+------------------------------------------------+ -| pd.to_pickle | 7 | | +| pd.to_pickle | 7 | n | +-------------------------+-------+------------------------------------------------+ -| df.to_json | 7 | | +| df.to_json | 7 | n | +-------------------------+-------+------------------------------------------------+ -| df.idxmin | 7 | | +| df.idxmin | 7 | n | +-------------------------+-------+------------------------------------------------+ -| df.xs | 6 | | +| df.xs | 6 | n | +-------------------------+-------+------------------------------------------------+ -| df.combine | 6 | | +| df.combine | 6 | n | +-------------------------+-------+------------------------------------------------+ -| pd.rolling_mean | 6 | | +| pd.rolling_mean | 6 | n | +-------------------------+-------+------------------------------------------------+ -| df.to_period | 6 | | +| df.to_period | 6 | n | +-------------------------+-------+------------------------------------------------+ -| df.convert_objects | 5 | | +| 
df.convert_objects | 5 | n | +-------------------------+-------+------------------------------------------------+ -| df.mask | 4 | | +| df.mask | 4 | n | +-------------------------+-------+------------------------------------------------+ -| df.pct_change | 4 | | +| df.pct_change | 4 | n | +-------------------------+-------+------------------------------------------------+ -| df.add_prefix | 4 | | +| df.add_prefix | 4 | n | +-------------------------+-------+------------------------------------------------+ -| pd.read_excel | 4 | | +| pd.read_excel | 4 | n | +-------------------------+-------+------------------------------------------------+ -| pd.rolling_std | 3 | | +| pd.rolling_std | 3 | n | +-------------------------+-------+------------------------------------------------+ -| df.to_records | 3 | | +| df.to_records | 3 | n | +-------------------------+-------+------------------------------------------------+ -| df.corrwith | 3 | | +| df.corrwith | 3 | n | +-------------------------+-------+------------------------------------------------+ -| df.swapaxes | 3 | | +| df.swapaxes | 3 | n | +-------------------------+-------+------------------------------------------------+ -| df.__iter__ | 3 | | +| df.__iter__ | 3 | n | +-------------------------+-------+------------------------------------------------+ -| df.to_sql | 3 | | +| df.to_sql | 3 | n | +-------------------------+-------+------------------------------------------------+ -| pd.read_feather | 3 | | +| pd.read_feather | 3 | n | +-------------------------+-------+------------------------------------------------+ -| df.to_feather | 3 | | +| df.to_feather | 3 | n | +-------------------------+-------+------------------------------------------------+ -| df.__len__ | 3 | | +| df.__len__ | 3 | n | +-------------------------+-------+------------------------------------------------+ -| df.kurtosis | 3 | | +| df.kurtosis | 3 | n | +-------------------------+-------+------------------------------------------------+ -| 
df.mod | 2 | | +| df.mod | 2 | n | +-------------------------+-------+------------------------------------------------+ -| df.to_sparse | 2 | | +| df.to_sparse | 2 | n | +-------------------------+-------+------------------------------------------------+ -| df.get_values | 2 | | +| df.get_values | 2 | n | +-------------------------+-------+------------------------------------------------+ -| df.__eq__ | 2 | | +| df.__eq__ | 2 | n | +-------------------------+-------+------------------------------------------------+ -| pd.bdate_range | 2 | | +| pd.bdate_range | 2 | n | +-------------------------+-------+------------------------------------------------+ -| df.get_dtype_counts | 2 | | +| df.get_dtype_counts | 2 | n | +-------------------------+-------+------------------------------------------------+ -| df.combine_first | 2 | | +| df.combine_first | 2 | n | +-------------------------+-------+------------------------------------------------+ -| df._get_numeric_data | 2 | | +| df._get_numeric_data | 2 | n | +-------------------------+-------+------------------------------------------------+ -| df.nsmallest | 2 | | +| df.nsmallest | 2 | n | +-------------------------+-------+------------------------------------------------+ -| pd.scatter_matrix | 2 | | +| pd.scatter_matrix | 2 | n | +-------------------------+-------+------------------------------------------------+ -| df.rename_axis | 2 | | +| df.rename_axis | 2 | n | +-------------------------+-------+------------------------------------------------+ -| df.__setstate__ | 2 | | +| df.__setstate__ | 2 | n | +-------------------------+-------+------------------------------------------------+ -| df.cumprod | 2 | | +| df.cumprod | 2 | n | +-------------------------+-------+------------------------------------------------+ -| df.__getstate__ | 2 | | +| df.__getstate__ | 2 | n | +-------------------------+-------+------------------------------------------------+ -| df.equals | 2 | | +| df.equals | 2 | n | 
+-------------------------+-------+------------------------------------------------+ -| df.__getitem__ | 2 | | +| df.__getitem__ | 2 | y | +-------------------------+-------+------------------------------------------------+ -| df.clip_upper | 2 | | +| df.clip_upper | 2 | n | +-------------------------+-------+------------------------------------------------+ -| df.floordiv | 2 | | +| df.floordiv | 2 | n | +-------------------------+-------+------------------------------------------------+ -| df.to_excel | 2 | | +| df.to_excel | 2 | n | +-------------------------+-------+------------------------------------------------+ -| df.reindex_axis | 1 | | +| df.reindex_axis | 1 | n | +-------------------------+-------+------------------------------------------------+ -| pd.to_timedelta | 1 | | +| pd.to_timedelta | 1 | n | +-------------------------+-------+------------------------------------------------+ -| df.ewm | 1 | | +| df.ewm | 1 | n | +-------------------------+-------+------------------------------------------------+ -| df.tz_localize | 1 | | +| df.tz_localize | 1 | n | +-------------------------+-------+------------------------------------------------+ -| df.tz_convert | 1 | | +| df.tz_convert | 1 | n | +-------------------------+-------+------------------------------------------------+ -| df.to_hdf | 1 | | +| df.to_hdf | 1 | n | +-------------------------+-------+------------------------------------------------+ -| df.lookup | 1 | | +| df.lookup | 1 | n | +-------------------------+-------+------------------------------------------------+ -| pd.merge_ordered | 1 | | +| pd.merge_ordered | 1 | n | +-------------------------+-------+------------------------------------------------+ -| df.swaplevel | 1 | | +| df.swaplevel | 1 | n | +-------------------------+-------+------------------------------------------------+ -| df.first_valid_index | 1 | | +| df.first_valid_index | 1 | n | +-------------------------+-------+------------------------------------------------+ -| 
df.lt | 1 | | +| df.lt | 1 | n | +-------------------------+-------+------------------------------------------------+ -| df.add_suffix | 1 | | +| df.add_suffix | 1 | n | +-------------------------+-------+------------------------------------------------+ -| pd.rolling_median | 1 | | +| pd.rolling_median | 1 | n | +-------------------------+-------+------------------------------------------------+ -| df.to_dense | 1 | | +| df.to_dense | 1 | n | +-------------------------+-------+------------------------------------------------+ -| df.mad | 1 | | +| df.mad | 1 | n | +-------------------------+-------+------------------------------------------------+ -| df.align | 1 | | +| df.align | 1 | n | +-------------------------+-------+------------------------------------------------+ -| df.__copy__ | 1 | | +| df.__copy__ | 1 | n | +-------------------------+-------+------------------------------------------------+ -| pd.set_eng_float_format | 1 | | +| pd.set_eng_float_format | 1 | n | +-------------------------+-------+------------------------------------------------+ -| df.add_suffix | 1 | | +| df.add_suffix | 1 | n | +-------------------------+-------+------------------------------------------------+ -| pd.rolling_median | 1 | | +| pd.rolling_median | 1 | n | +-------------------------+-------+------------------------------------------------+ -| df.to_dense | 1 | | +| df.to_dense | 1 | n | +-------------------------+-------+------------------------------------------------+ -| df.mad | 1 | | +| df.mad | 1 | n | +-------------------------+-------+------------------------------------------------+ -| df.align | 1 | | +| df.align | 1 | n | +-------------------------+-------+------------------------------------------------+ -| df.__copy__ | 1 | | +| df.__copy__ | 1 | n | +-------------------------+-------+------------------------------------------------+ -| pd.set_eng_float_format | 1 | | +| pd.set_eng_float_format | 1 | n | 
+-------------------------+-------+------------------------------------------------+ +---------------------------+---------------------------------+----------------------------------------------------+ @@ -447,7 +448,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep +---------------------------+---------------------------------+----------------------------------------------------+ | ``add_suffix`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``agg`` | N | | +| ``agg`` | Y | | | ``aggregate`` | | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``align`` | N | | @@ -512,7 +513,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep +---------------------------+---------------------------------+----------------------------------------------------+ | ``corrwith`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``count`` | N | | +| ``count`` | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``cov`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ @@ -524,29 +525,29 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep +---------------------------+---------------------------------+----------------------------------------------------+ | ``cumsum`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``describe`` | N | | +| ``describe`` | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``diff`` | N | | 
+---------------------------+---------------------------------+----------------------------------------------------+ -| ``div`` | N | See ``add`` | +| ``div`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``divide`` | N | See ``add`` | +| ``divide`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``dot`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``drop`` | N | | +| ``drop`` | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``drop_duplicates`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``dropna`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``dtypes`` | N | | +| ``dtypes`` | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``duplicated`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``empty`` | N | | +| ``empty`` | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``eq`` | N | See ``add`` | +| ``eq`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``equals`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ @@ -566,7 +567,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep 
+---------------------------+---------------------------------+----------------------------------------------------+ | ``first_valid_index`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``floordiv`` | N | See ``add`` | +| ``floordiv`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``from_csv`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ @@ -578,9 +579,9 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep +---------------------------+---------------------------------+----------------------------------------------------+ | ``ftypes`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``ge`` | N | See ``add`` | +| ``ge`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``get`` | N | | +| ``get`` | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``get_dtype_counts`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ @@ -592,11 +593,11 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep +---------------------------+---------------------------------+----------------------------------------------------+ | ``groupby`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``gt`` | N | See ``add`` | +| ``gt`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``head`` | Y | | 
+---------------------------+---------------------------------+----------------------------------------------------+ -| ``hist`` | N | | +| ``hist`` | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``iat`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ @@ -608,7 +609,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep +---------------------------+---------------------------------+----------------------------------------------------+ | ``infer_objects`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``info`` | N | | +| ``info`` | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``insert`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ @@ -634,7 +635,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep +---------------------------+---------------------------------+----------------------------------------------------+ | ``join`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``keys`` | N | | +| ``keys`` | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``kurt`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ @@ -644,21 +645,21 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep +---------------------------+---------------------------------+----------------------------------------------------+ | ``last_valid_index`` | N | | 
+---------------------------+---------------------------------+----------------------------------------------------+ -| ``le`` | N | See ``add`` | +| ``le`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``loc`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``lookup`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``lt`` | N | See ``add`` | +| ``lt`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``mad`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``mask`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``max`` | N | | +| ``max`` | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``mean`` | N | | +| ``mean`` | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``median`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ @@ -668,19 +669,19 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep +---------------------------+---------------------------------+----------------------------------------------------+ | ``merge`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``min`` | N | | +| ``min`` | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``mod`` | N | | 
+---------------------------+---------------------------------+----------------------------------------------------+ | ``mode`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``mul`` | N | See ``add`` | +| ``mul`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``multiply`` | N | See ``add`` | +| ``multiply`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``ndim`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``ne`` | N | See ``add`` | +| ``ne`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``nlargest`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ @@ -690,7 +691,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep +---------------------------+---------------------------------+----------------------------------------------------+ | ``nsmallest`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``nunique`` | N | | +| ``nunique`` | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``pct_change`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ @@ -704,7 +705,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep +---------------------------+---------------------------------+----------------------------------------------------+ | ``pop`` | N | | 
+---------------------------+---------------------------------+----------------------------------------------------+ -| ``pow`` | N | See ``add`` | +| ``pow`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``prod`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ @@ -712,13 +713,13 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep +---------------------------+---------------------------------+----------------------------------------------------+ | ``quantile`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``query`` | N | | +| ``query`` | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``radd`` | N | See ``add`` | +| ``radd`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``rank`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``rdiv`` | N | See ``add`` | +| ``rdiv`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``reindex`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ @@ -738,27 +739,27 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep +---------------------------+---------------------------------+----------------------------------------------------+ | ``reset_index`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``rfloordiv`` | N | See ``add`` | +| ``rfloordiv`` | N | | 
+---------------------------+---------------------------------+----------------------------------------------------+ -| ``rmod`` | N | See ``add`` | +| ``rmod`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``rmul`` | N | See ``add`` | +| ``rmul`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``rolling`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``round`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``rpow`` | N | See ``add`` | +| ``rpow`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``rsub`` | N | See ``add`` | +| ``rsub`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``rtruediv`` | N | See ``add`` | +| ``rtruediv`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``sample`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``select`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``select_dtypes`` | N | | +| ``select_dtypes`` | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``sem`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ @@ -768,7 +769,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep 
+---------------------------+---------------------------------+----------------------------------------------------+ | ``set_value`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``shape`` | N | | +| ``shape`` | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``shift`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ @@ -792,11 +793,11 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep +---------------------------+---------------------------------+----------------------------------------------------+ | ``style`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``sub`` | N | See ``add`` | +| ``sub`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``subtract`` | N | See ``add`` | +| ``subtract`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``sum`` | N | | +| ``sum`` | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``swapaxes`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ @@ -808,7 +809,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep +---------------------------+---------------------------------+----------------------------------------------------+ | ``to_clipboard`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``to_csv`` | N | | +| ``to_csv`` | Y | | 
+---------------------------+---------------------------------+----------------------------------------------------+ | ``to_dense`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ @@ -822,7 +823,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep +---------------------------+---------------------------------+----------------------------------------------------+ | ``to_hdf`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``to_html`` | N | | +| ``to_html`` | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``to_json`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ @@ -856,7 +857,7 @@ https://github.com/adgirish/kaggleScape/blob/master/results/annotResults.csv rep +---------------------------+---------------------------------+----------------------------------------------------+ | ``transpose`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ -| ``truediv`` | N | See ``add`` | +| ``truediv`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``truncate`` | N | | +---------------------------+---------------------------------+----------------------------------------------------+ diff --git a/docs/source/implementation/details.rst b/docs/source/implementation/details.rst new file mode 100644 index 0000000..79c33d4 --- /dev/null +++ b/docs/source/implementation/details.rst @@ -0,0 +1,61 @@ +.. 
_implementation/details: + +====================== +Implementation Details +====================== + +The goal of an ``eland.DataFrame`` is to enable users who are familiar with ``pandas.DataFrame`` +to access, explore and manipulate data that resides in Elasticsearch. + +Ideally, all data should reside in Elasticsearch and not reside in memory. +This restricts the API, but allows access to huge data sets that do not fit into memory, and allows +use of powerful Elasticsearch features such as aggregations. + + +Pandas and 3rd Party Storage Systems +------------------------------------ + +Generally, integrations with `3rd party storage systems <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_ +(SQL, Google BigQuery etc.) involve accessing these systems and reading all external data into an +in-core pandas data structure. This also applies to `Apache Arrow <https://arrow.apache.org/docs/python/pandas.html>`_ +structures. + +Whilst this provides access to data in these systems, for large datasets this can require significant +in-core memory, and for systems such as Elasticsearch, bulk export of data can be an inefficient way +of exploring the data. + +An alternative option is to create an API that proxies ``pandas.DataFrame``-like calls to Elasticsearch +queries and operations. This could allow the Elasticsearch cluster to perform operations such as +aggregations rather than exporting all the data and performing this operation in-core. + +Implementation Options +---------------------- + +An option would be to replace the ``pandas.DataFrame`` backend in-core memory structures with Elasticsearch +accessors. This would allow full access to the ``pandas.DataFrame`` APIs. However, this has issues: + +* If a ``pandas.DataFrame`` instance maps to an index, typical manipulation of a ``pandas.DataFrame`` + may involve creating many derived ``pandas.DataFrame`` instances.
Constructing an index per + ``pandas.DataFrame`` may result in many Elasticsearch indexes and a significant load on Elasticsearch. + For example, ``df_a = df['a']`` should not require Elasticsearch indices ``df`` and ``df_a`` + +* Not all ``pandas.DataFrame`` APIs map to things we may want to do in Elasticsearch. In particular, + API calls that involve exporting all data from Elasticsearch into memory e.g. ``df.to_dict()``. + +* The backend ``pandas.DataFrame`` structures are not easily abstractable and are deeply embedded in + the implementation. + +Another option is to create an ``eland.DataFrame`` API that mimics appropriate aspects of +the ``pandas.DataFrame`` API. This resolves some of the issues above as: + +* ``df_a = df['a']`` could be implemented as a change to the Elasticsearch query used, rather + than a new index + +* Instead of supporting the entire ``pandas.DataFrame`` API we can support a subset appropriate for + Elasticsearch. If additional calls are required, we could create an ``eland.DataFrame._to_pandas()`` + method which would explicitly export all data to a ``pandas.DataFrame`` + +* Creating a new ``eland.DataFrame`` API gives us full flexibility in terms of implementation. However, + it does create a large amount of work which may duplicate a lot of the ``pandas`` code - for example, + printing objects etc. - this creates maintenance issues etc. + diff --git a/docs/source/implementation/index.rst b/docs/source/implementation/index.rst new file mode 100644 index 0000000..0d0eab5 --- /dev/null +++ b/docs/source/implementation/index.rst @@ -0,0 +1,11 @@ +.. _implementation: + +==================== +Implementation Notes +==================== + +..
toctree:: + :maxdepth: 2 + + details.rst + dataframe_supported.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index bacd12b..f8ba777 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -23,10 +23,17 @@ In general, the data resides in elasticsearch and not in memory, which allows el :hidden: reference/index + implementation/index * :doc:`reference/index` * :doc:`reference/io` * :doc:`reference/general_utility_functions` * :doc:`reference/dataframe` - * :doc:`reference/index` + * :doc:`reference/indexing` + +* :doc:`implementation/index` + + * :doc:`implementation/details` + * :doc:`implementation/dataframe_supported` + diff --git a/eland/dataframe.py b/eland/dataframe.py index 6532138..92d88a2 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -561,7 +561,7 @@ class DataFrame(NDFrame): See Also -------- - :pandas_api_docs:`to_html` for argument details. + :pandas_api_docs:`pandas.DataFrame.to_html` for argument details. """ # In pandas calling 'to_string' without max_rows set, will dump ALL rows - we avoid this # by limiting rows by default. @@ -621,7 +621,7 @@ class DataFrame(NDFrame): See Also -------- - :pandas_api_docs:`to_string` for argument details. + :pandas_api_docs:`pandas.DataFrame.to_string` for argument details. """ # In pandas calling 'to_string' without max_rows set, will dump ALL rows - we avoid this # by limiting rows by default. @@ -787,7 +787,7 @@ class DataFrame(NDFrame): See Also -------- - :pandas_api_docs:`to_csv` for argument details. + :pandas_api_docs:`pandas.DataFrame.to_csv` for argument details. 
""" kwargs = { "path_or_buf": path_or_buf, diff --git a/eland/ndframe.py b/eland/ndframe.py index a186714..a196af8 100644 --- a/eland/ndframe.py +++ b/eland/ndframe.py @@ -287,7 +287,7 @@ class NDFrame: """ Return mean value for each numeric column - TODO - implement remainder of pandas arguments + TODO - implement remainder of pandas arguments, currently non-numerics are not supported Returns ------- @@ -321,7 +321,7 @@ class NDFrame: """ Return sum for each numeric column - TODO - implement remainder of pandas arguments + TODO - implement remainder of pandas arguments, currently non-numerics are not supported Returns ------- @@ -355,7 +355,7 @@ class NDFrame: """ Return the minimum value for each numeric column - TODO - implement remainder of pandas arguments + TODO - implement remainder of pandas arguments, currently non-numerics are not supported Returns ------- @@ -389,7 +389,7 @@ class NDFrame: """ Return the maximum value for each numeric column - TODO - implement remainder of pandas arguments + TODO - implement remainder of pandas arguments, currently non-numerics are not supported Returns ------- diff --git a/eland/utils.py b/eland/utils.py index 962588e..b9f9d18 100644 --- a/eland/utils.py +++ b/eland/utils.py @@ -8,7 +8,7 @@ from eland import Client from eland import DataFrame from eland import Mappings -_default_chunk_size = 10000 +DEFAULT_CHUNK_SIZE = 10000 def read_es(es_params, index_pattern): @@ -80,7 +80,7 @@ def pandas_to_eland(pd_df, es_params, destination_index, if_exists='fail', chunk eland.eland_to_pandas: Create a pandas.Dataframe from eland.DataFrame """ if chunksize is None: - chunksize = _default_chunk_size + chunksize = DEFAULT_CHUNK_SIZE client = Client(es_params) @@ -99,7 +99,7 @@ def pandas_to_eland(pd_df, es_params, destination_index, if_exists='fail', chunk client.index_delete(index=destination_index) client.index_create(index=destination_index, body=mapping) # elif if_exists == "append": - # TODO validate mapping is compatible + # 
TODO validate mappings are compatible else: client.index_create(index=destination_index, body=mapping) @@ -226,7 +226,7 @@ def read_csv(filepath_or_buffer, **Modifies an Elasticsearch index** - **Note iteration not supported** + **Note pandas iteration options not supported** Parameters ---------- @@ -248,17 +248,17 @@ def read_csv(filepath_or_buffer, es_geo_points: list, default None List of columns to map to geo_point data type iterator - ignored + not supported chunksize number of csv rows to read before bulk index into Elasticsearch Other Parameters ---------------- - Parameters derived from :pandas_api_docs:`read_csv`. + Parameters derived from :pandas_api_docs:`pandas.read_csv`. See Also -------- - :pandas_api_docs:`read_csv` - for all parameters + :pandas_api_docs:`pandas.read_csv` - for all parameters Notes ----- @@ -318,7 +318,7 @@ def read_csv(filepath_or_buffer, ) if chunksize is None: - kwds.update(chunksize=_default_chunk_size) + kwds.update(chunksize=DEFAULT_CHUNK_SIZE) client = Client(es_client)