The following table shows which pandas APIs are implemented and which are not implemented in pandas API on Spark. Some pandas APIs do not implement all parameters, so the third column shows the missing parameters for each API.
‘Y’ in the second column means the API is implemented, including all of its parameters.
‘N’ means it’s not implemented yet.
‘P’ means it’s partially implemented, with some parameters missing.
All APIs in the list below compute the data with distributed execution, except the ones that require local execution by design. For example, DataFrame.to_numpy() requires collecting the data to the driver side.
If there is a non-implemented pandas API or parameter you want, you can create an Apache Spark JIRA to request it or contribute it yourself.
The API list is updated based on the pandas 2.0.0 pre-release.
API
Implemented
Missing parameters
add_categories()
Y
all()
any()
append()
argmax()
P
axis , skipna
axis
skipna
argmin()
argsort
N
as_ordered()
as_unordered()
asof()
asof_locs
astype()
copy
copy()
dtype , names
dtype
names
delete()
difference()
drop()
errors
drop_duplicates()
droplevel()
dropna()
duplicated
equals()
factorize()
use_na_sentinel
fillna()
downcast
format
get_indexer
get_indexer_for
get_indexer_non_unique
get_level_values()
get_loc
get_slice_bound
get_value
groupby
holds_integer()
identical()
insert()
intersection()
sort
is_
is_boolean()
is_categorical()
is_dtype_equal
is_floating()
is_integer()
is_interval()
is_mixed
is_numeric()
is_object()
is_type_compatible()
isin()
level
isna()
isnull()
item()
join
map()
max()
memory_usage
min()
notna()
notnull()
nunique()
putmask
ravel
reindex
remove_categories()
remove_unused_categories()
rename()
rename_categories()
reorder_categories()
repeat()
searchsorted
set_categories()
set_names()
set_value
shift()
freq
slice_indexer
slice_locs
sort()
sort_values()
key , na_position
key
na_position
sortlevel
symmetric_difference()
take()
allow_fill , axis , fill_value
allow_fill
fill_value
take_nd
to_flat_index
to_frame()
to_list()
to_native_types
to_numpy()
na_value
to_series()
index
tolist()
transpose()
union()
unique()
value_counts()
view()
where
abs()
add()
axis , fill_value , level
add_prefix()
add_suffix()
agg()
aggregate()
align()
broadcast_axis , fill_axis , fill_value , level , limit and more. See the pandas.DataFrame.align and pyspark.pandas.DataFrame.align for detail.
broadcast_axis
fill_axis
limit
level , skipna
apply()
raw , result_type
raw
result_type
applymap()
na_action
asfreq
asof
assign()
copy , errors
at_time()
backfill()
between_time()
inclusive
bfill()
bool()
boxplot()
ax , backend , by , column , figsize and more. See the pandas.DataFrame.boxplot and pyspark.pandas.DataFrame.boxplot for detail.
ax
backend
by
column
figsize
clip()
axis , inplace
inplace
combine
combine_first()
compare
convert_dtypes
corr()
numeric_only
corrwith()
count()
cov()
cummax()
cummin()
cumprod()
cumsum()
describe()
datetime_is_numeric , exclude , include
datetime_is_numeric
exclude
include
diff()
div()
divide()
dot()
errors , inplace , level
duplicated()
eq()
axis , level
eval()
ewm()
adjust , axis , method , times
adjust
method
times
expanding()
axis , center , method
center
explode()
ffill()
filter()
first()
first_valid_index()
floordiv()
ge()
get()
groupby()
group_keys , level , observed , sort , squeeze
group_keys
observed
squeeze
gt()
head()
hist()
ax , backend , by , column , data and more. See the pandas.DataFrame.hist and pyspark.pandas.DataFrame.hist for detail.
data
idxmax()
numeric_only , skipna
idxmin()
infer_objects
info()
memory_usage , show_counts
show_counts
interpolate()
axis , downcast , inplace
isetitem
items()
iteritems()
iterrows()
itertuples()
join()
other , sort , validate
other
validate
keys()
kurt()
kurtosis()
last()
last_valid_index()
le()
lookup
lt()
mad()
mask()
axis , errors , inplace , level , try_cast
try_cast
mean()
median()
melt()
col_level , ignore_index
col_level
ignore_index
merge()
copy , indicator , sort , validate
indicator
mod()
mode()
mul()
multiply()
ne()
nlargest()
nsmallest()
pad()
pct_change()
fill_method , freq , limit
fill_method
pipe()
pivot()
pivot_table()
dropna , margins , margins_name , observed , sort
dropna
margins
margins_name
pop()
pow()
prod()
product()
quantile()
interpolation , method
interpolation
query()
radd()
rank()
axis , na_option , pct
na_option
pct
rdiv()
reindex()
level , limit , method , tolerance
tolerance
reindex_like()
limit , method , tolerance
rename_axis()
reorder_levels
replace()
resample()
axis , base , convention , group_keys , kind and more. See the pandas.DataFrame.resample and pyspark.pandas.DataFrame.resample for detail.
base
convention
kind
reset_index()
allow_duplicates , names
allow_duplicates
rfloordiv()
rmod()
rmul()
rolling()
axis , center , closed , method , on and more. See the pandas.DataFrame.rolling and pyspark.pandas.DataFrame.rolling for detail.
closed
on
round()
rpow()
rsub()
rtruediv()
sample()
axis , weights
weights
select_dtypes()
sem()
set_axis
set_flags
set_index()
verify_integrity
axis , freq
skew()
slice_shift
sort_index()
key , sort_remaining
sort_remaining
axis , key , kind
squeeze()
stack()
dropna , level
std()
sub()
subtract()
sum()
swapaxes()
axis1 , axis2
axis1
axis2
swaplevel()
tail()
is_copy
to_clipboard()
to_csv()
chunksize , compression , decimal , doublequote , encoding and more. See the pandas.DataFrame.to_csv and pyspark.pandas.DataFrame.to_csv for detail.
chunksize
compression
decimal
doublequote
encoding
to_dict()
to_excel()
storage_options
to_feather
to_gbq
to_hdf
to_html()
to_json()
date_format , date_unit , default_handler , double_precision , force_ascii and more. See the pandas.DataFrame.to_json and pyspark.pandas.DataFrame.to_json for detail.
date_format
date_unit
default_handler
double_precision
force_ascii
to_latex()
caption , label , position
caption
label
position
to_markdown()
index , storage_options
copy , dtype , na_value
to_orc()
engine , engine_kwargs , index
engine
engine_kwargs
to_parquet()
engine , index , storage_options
to_period
to_pickle
to_records()
to_sql
to_stata
to_string()
encoding , max_colwidth , min_rows
max_colwidth
min_rows
to_timestamp
to_xarray
to_xml
transform()
truediv()
truncate()
tshift
tz_convert
tz_localize
unstack()
fill_value , level
update()
errors , filter_func
filter_func
value_counts
var()
where()
errors , inplace , level , try_cast
xs()
drop_level
ceil()
day_name()
floor()
indexer_at_time()
indexer_between_time()
isocalendar
mean
month_name()
normalize()
snap
std
strftime()
to_julian_date
to_perioddelta
to_pydatetime
index , keep_tz
keep_tz
union_many
codes , dtype , levels , name , names
codes
levels
name
equal_levels()
get_loc_level
get_locs
is_lexsorted
remove_unused_levels
level , names
set_codes
set_levels
truncate
broadcast_axis , fill_axis , fill_value , level , limit and more. See the pandas.Series.align and pyspark.pandas.Series.align for detail.
bool_only , level
bool_only
bool_only , level , skipna
convert_dtype
argsort()
axis , kind , order
order
subset
autocorr()
between()
compare()
align_axis , result_names
align_axis
result_names
divmod()
axis , errors
how
ax , backend , by , figsize , grid and more. See the pandas.Series.hist and pyspark.pandas.Series.hist for detail.
grid
info
axis , level , skipna
keep
rdivmod()
copy , limit , method , tolerance
axis , copy , errors , inplace , level
inplace , limit , method
axis , base , convention , group_keys , kind and more. See the pandas.Series.resample and pyspark.pandas.Series.resample for detail.
axis , center , closed , method , on and more. See the pandas.Series.rolling and pyspark.pandas.Series.rolling for detail.
searchsorted()
sorter
axis , is_copy
chunksize , compression , decimal , doublequote , encoding and more. See the pandas.Series.to_csv and pyspark.pandas.Series.to_csv for detail.
date_format , date_unit , default_handler , double_precision , force_ascii and more. See the pandas.Series.to_json and pyspark.pandas.Series.to_json for detail.
view
axis , drop_level
ceil
floor
median
round
sum
to_pytimedelta
total_seconds
array
bdate_range
concat()
copy , keys , levels , names , verify_integrity
keys
crosstab
cut
date_range()
eval
factorize
from_dummies
get_dummies()
infer_freq
interval_range
json_normalize
lreshape
copy , indicator , left , sort , validate
left
merge_asof()
merge_ordered
period_range
pivot
pivot_table
qcut
read_clipboard()
read_csv()
cache_dates , chunksize , compression , converters , date_parser and more. See the pandas.read_csv and pyspark.pandas.read_csv for detail.
cache_dates
converters
date_parser
read_excel()
decimal , na_filter , storage_options
na_filter
read_feather
read_fwf
read_gbq
read_hdf
read_html()
extract_links
read_json()
chunksize , compression , convert_axes , convert_dates , date_unit and more. See the pandas.read_json and pyspark.pandas.read_json for detail.
convert_axes
convert_dates
read_orc()
read_parquet()
engine , storage_options , use_nullable_dtypes
use_nullable_dtypes
read_pickle
read_sas
read_spss
read_sql()
chunksize , coerce_float , params , parse_dates
coerce_float
params
parse_dates
read_sql_query()
chunksize , coerce_float , dtype , params , parse_dates
read_sql_table()
chunksize , coerce_float , parse_dates
read_stata
read_table()
cache_dates , chunksize , comment , compression , converters and more. See the pandas.read_table and pyspark.pandas.read_table for detail.
comment
read_xml
set_eng_float_format
show_versions
test
timedelta_range()
to_datetime()
cache , dayfirst , exact , utc , yearfirst
cache
dayfirst
exact
utc
yearfirst
to_numeric()
to_timedelta()
unique
wide_to_long
agg
aggregate
apply
corr
cov
engine , engine_kwargs , numeric_only
interpolation , numeric_only
rank
sem
ddof , engine , engine_kwargs , numeric_only
ddof
var
engine , engine_kwargs , func
func
boxplot
cumcount()
axis , numeric_only
get_group()
obj
engine , engine_kwargs
ngroup
ohlc
pct_change
pipe
resample
sample
size()
describe
bins , normalize
bins
normalize