Code Sample, a copy-pastable example if possible
#%%
import matplotlib as ml
import pandas as pd
import re
#%%
# CSV files that this sample loads with pd.read_csv.
dataFile1 = 'bitcoin_cash_price.csv'
dataFile2 = 'dash_price.csv'
#%%
def cleanData(data: str) -> (int,None):
    """Extract every digit from ``data`` and return them as one integer.

    All runs of the characters ``0``-``9`` found in ``data`` are joined in
    order, so separators between them are dropped (``"1,234"`` gives
    ``1234``).

    Args:
        data: Text that may contain digits mixed with other characters.

    Returns:
        The joined digits converted with ``int``, or ``None`` when ``data``
        contains no digits at all.
    """
    # NOTE(review): ``(int,None)`` is a tuple expression, not a valid type
    # hint; the usual spelling is ``Optional[int]``. It is kept as written
    # because the report this sample belongs to discusses that annotation.
    digits = "".join(re.findall(r"[0-9]+", data))
    return int(digits) if digits else None
# %%
# Load both CSV files into DataFrames.
bitcoinDF = pd.read_csv(dataFile1)
dashPriceDF = pd.read_csv(dataFile2)
# %%
# Show the Python type of each "Market Cap" value before cleaning.
# Reported observation: every value is initially a str.
bitcoinDF["Market Cap"].apply(type)
# %%
# Apply cleanData, whose return annotation is (int,None), to every value
# and store the result back in the same column.
bitcoinDF["Market Cap"] = bitcoinDF["Market Cap"].apply(cleanData)
#%%
# Reported observation: the values now come back as float rather than int.
# NOTE(review): presumably cleanData returned None for at least one row of
# this file, which would force a float column - TODO confirm against the data.
bitcoinDF["Market Cap"].apply(type)
# %%
# Same check on the second data set.
# Reported observation: its "Market Cap" values are also str.
dashPriceDF["Market Cap"].apply(type)
#%%
# Apply cleanData to the second data set in the same way.
dashPriceDF["Market Cap"] = dashPriceDF["Market Cap"].apply(cleanData)
#%%
# Reported observation: here the values come back as int, which is
# inconsistent with the result for the first data set.
dashPriceDF["Market Cap"].apply(type)
# Expected behavior (per the reporter): when apply is called, pandas should
# honor the function's type annotation before guessing at the data type.
Problem description
The current behavior is an issue because pandas ignores any type annotations on the function passed to the `apply` method. When `apply` is called, the function's return type annotation should take precedence over pandas' own attempt to guess the data type.
Comment From: mroeschke
I don't think this is in scope for pandas, since type annotations are not used for run-time checking, so closing.