The following table shows which pandas APIs are implemented or not implemented in pandas API on Spark. Some pandas APIs do not implement the full set of parameters, so the third column lists the missing parameters for each API.
‘Y’ in the second column means it’s implemented, including all of its parameters.
‘N’ means it’s not implemented yet.
‘P’ means it’s partially implemented, with some of its parameters missing.
All APIs in the list below compute the data with distributed execution, except for the ones that require local execution by design. For example, DataFrame.to_numpy() requires collecting the data to the driver side.
If there is a pandas API or parameter you want that is not yet implemented, you can create an Apache Spark JIRA to request it or to contribute it on your own.
The API list is updated based on the pandas 2.0.0 pre-release.
API
Implemented
Missing parameters
add_categories()
Y
argsort
N
as_ordered()
as_unordered()
astype
equals
is_dtype_equal
map()
max
min
reindex
remove_categories()
remove_unused_categories()
rename_categories()
reorder_categories()
searchsorted
set_categories()
take_nd
tolist
add()
P
axis , fill_value , level
axis
fill_value
level
agg()
aggregate()
align()
broadcast_axis , fill_axis , fill_value , level , limit and more. See the pandas.DataFrame.align and pyspark.pandas.DataFrame.align for detail.
broadcast_axis
fill_axis
limit
all()
any()
level , skipna
skipna
append()
apply()
raw , result_type
raw
result_type
applymap()
na_action
asfreq
assign()
bfill
boxplot()
ax , backend , by , column , figsize and more. See the pandas.DataFrame.boxplot and pyspark.pandas.DataFrame.boxplot for detail.
ax
backend
by
column
figsize
clip()
axis , inplace
inplace
combine
combine_first()
compare
corr()
numeric_only
corrwith()
count
cov()
cummax
cummin
cumprod
cumsum
diff()
div()
divide()
dot()
drop()
errors , inplace , level
errors
drop_duplicates()
dropna()
duplicated()
eq()
axis , level
eval()
explode()
ffill
fillna()
downcast
floordiv()
ge()
groupby()
group_keys , level , observed , sort , squeeze
group_keys
observed
sort
squeeze
gt()
hist()
ax , backend , by , column , data and more. See the pandas.DataFrame.hist and pyspark.pandas.DataFrame.hist for detail.
data
idxmax()
numeric_only , skipna
idxmin()
info()
memory_usage , show_counts
memory_usage
show_counts
insert()
interpolate()
axis , downcast , inplace
isetitem
isin()
isna()
isnull()
items()
iteritems()
iterrows()
itertuples()
join()
other , sort , validate
other
validate
kurt
kurtosis
le()
lookup
lt()
mad()
mask()
axis , errors , inplace , level , try_cast
try_cast
mean
median
melt()
col_level , ignore_index
col_level
ignore_index
merge()
copy , indicator , sort , validate
copy
indicator
mod()
mode()
mul()
multiply()
ne()
nlargest()
notna()
notnull()
nsmallest()
nunique()
pivot()
pivot_table()
dropna , margins , margins_name , observed , sort
dropna
margins
margins_name
pop()
pow()
prod
product
quantile()
interpolation , method
interpolation
method
query()
radd()
rdiv()
reindex()
level , limit , method , tolerance
tolerance
rename()
reorder_levels
replace()
resample()
axis , base , convention , group_keys , kind and more. See the pandas.DataFrame.resample and pyspark.pandas.DataFrame.resample for detail.
base
convention
kind
reset_index()
allow_duplicates , names
allow_duplicates
names
rfloordiv()
rmod()
rmul()
round()
rpow()
rsub()
rtruediv()
select_dtypes()
sem
set_axis
set_index()
verify_integrity
shift()
axis , freq
freq
skew
sort_index()
key , sort_remaining
key
sort_remaining
sort_values()
axis , key , kind
stack()
dropna , level
std
sub()
subtract()
sum
swaplevel()
to_dict()
to_feather
to_gbq
to_html()
encoding
to_markdown
to_numpy
to_orc()
engine , engine_kwargs , index
engine
engine_kwargs
index
to_parquet()
engine , index , storage_options
storage_options
to_period
to_records()
to_stata
to_string()
encoding , max_colwidth , min_rows
max_colwidth
min_rows
to_timestamp
to_xml
transform()
transpose()
truediv()
unstack()
fill_value , level
update()
errors , filter_func
filter_func
value_counts
var
where()
errors , inplace , level , try_cast
ceil()
day_name()
floor()
get_loc
indexer_at_time()
indexer_between_time()
isocalendar
month_name()
normalize()
slice_indexer
snap
strftime()
to_julian_date
to_perioddelta
to_pydatetime
to_series
tz_convert
tz_localize
union_many
all
any
argmax()
axis , skipna
argmin()
asof()
asof_locs
copy()
dtype , names
dtype
delete()
difference()
droplevel()
duplicated
equals()
format
get_indexer
get_indexer_for
get_indexer_non_unique
get_level_values()
get_slice_bound
get_value
groupby
holds_integer()
identical()
intersection()
is_
is_boolean()
is_categorical()
is_floating()
is_integer()
is_interval()
is_mixed
is_numeric()
is_object()
is_type_compatible()
isin
isna
isnull
join
max()
min()
notna
notnull
putmask
ravel
repeat()
set_names()
set_value
shift
slice_locs
sort()
key , na_position
na_position
sortlevel
symmetric_difference()
take
to_flat_index
to_frame()
to_native_types
to_series()
union()
unique()
view()
where
append
codes , dtype , levels , name , names
codes
levels
name
delete
equal_levels()
fillna
get_loc_level
get_locs
is_lexsorted
remove_unused_levels
rename
repeat
set_codes
set_levels
set_names
truncate
unique
view
broadcast_axis , fill_axis , fill_value , level , limit and more. See the pandas.Series.align and pyspark.pandas.Series.align for detail.
convert_dtype
argsort()
axis , kind , order
order
autocorr()
between()
compare()
align_axis , result_names
align_axis
result_names
divmod()
axis , errors
how
ax , backend , by , figsize , grid and more. See the pandas.Series.hist and pyspark.pandas.Series.hist for detail.
grid
info
keys()
axis , level , skipna
keep
rdivmod()
axis , copy , errors , inplace , level
inplace , limit , method
axis , base , convention , group_keys , kind and more. See the pandas.Series.resample and pyspark.pandas.Series.resample for detail.
searchsorted()
sorter
ceil
floor
round
to_pytimedelta
total_seconds
array
bdate_range
concat()
copy , keys , levels , names , verify_integrity
keys
crosstab
cut
date_range()
inclusive
eval
factorize
from_dummies
get_dummies()
infer_freq
interval_range
json_normalize
lreshape
copy , indicator , left , sort , validate
left
merge_asof()
merge_ordered
period_range
pivot
pivot_table
qcut
read_clipboard()
read_csv()
cache_dates , chunksize , compression , converters , date_parser and more. See the pandas.read_csv and pyspark.pandas.read_csv for detail.
cache_dates
chunksize
compression
converters
date_parser
read_excel()
decimal , na_filter , storage_options
decimal
na_filter
read_feather
read_fwf
read_gbq
read_hdf
read_html()
extract_links
read_json()
chunksize , compression , convert_axes , convert_dates , date_unit and more. See the pandas.read_json and pyspark.pandas.read_json for detail.
convert_axes
convert_dates
date_unit
read_orc()
read_parquet()
engine , storage_options , use_nullable_dtypes
use_nullable_dtypes
read_pickle
read_sas
read_spss
read_sql()
chunksize , coerce_float , params , parse_dates
coerce_float
params
parse_dates
read_sql_query()
chunksize , coerce_float , dtype , params , parse_dates
read_sql_table()
chunksize , coerce_float , parse_dates
read_stata
read_table()
cache_dates , chunksize , comment , compression , converters and more. See the pandas.read_table and pyspark.pandas.read_table for detail.
comment
read_xml
set_eng_float_format
show_versions
test
timedelta_range()
to_datetime()
cache , dayfirst , exact , utc , yearfirst
cache
dayfirst
exact
utc
yearfirst
to_numeric()
to_pickle
to_timedelta()
wide_to_long
agg
aggregate
apply
corr
count()
cov
kurt()
engine , engine_kwargs , numeric_only
mean()
interpolation , numeric_only
rank
skew()
std()
ddof , engine , engine_kwargs , numeric_only
ddof
sum()
var()
boxplot
filter
idxmax
idxmin
nunique
transform
backfill()
bfill()
cumcount()
cummax()
axis , numeric_only
cummin()
cumprod()
cumsum()
describe
ewm()
expanding()
ffill()
first()
head()
last()
engine , engine_kwargs
median()
ngroup
ohlc
pad()
pct_change
prod()
rank()
axis , na_option , pct
na_option
pct
resample
rolling()
sample
sem()
size()
tail()
engine , engine_kwargs , func
func
value_counts()
bins , normalize
bins
normalize