Still new to python. Pardon me for asking a noob equation. Using the mftool library which helps in downloading NAV data for mutual funds. It gives a key Error'Date'.
Would be really grateful on identifying and helping on the error.
Input:
from mftool import Mftool
mf = Mftool()
scheme_codes = mf.get_scheme_codes()
scheme_code_list = [x for x in scheme_codes.keys()]
def HistoricalNav(scheme_code_list, start_date, end_date):
assert (isinstance(scheme_code_list, list) is True), "Argument scheme_code_list should be a list"
assert (isinstance(start_date, str) is True), "start_date must be a str in %d-%m-%Y format" # checks whether start date is present and is in correct format.
assert (isinstance(end_date, str) is True), "end_date must be a str in %d-%m-%Y format" # checks whether end date is present and is in correct format
main_df = pd.DataFrame() #empty dataframe
for schemes in scheme_code_list:
data = mf.get_scheme_historical_nav_for_dates(schemes, start_date, end_date) # requesting NAV data from the api.
df = pd.DataFrame(data['data'])
df['scheme_code'] = pd.Series([data['scheme_code'] for x in range(len(df.index))]) #adding Pandas Series(scheme_code) as a column in Pandas Dataframe.
df['scheme_name'] = pd.Series([data['scheme_name'] for x in range(len(df.index))]) #adding Pandas Series(scheme_name) as a column in Pandas Dataframe.
df = df.sort_values(by = 'date') # sorting the values of every Scheme code based on Date
main_df = main_df.append(df) # appending the data in the main_df dataframe.
main_df = main_df[['scheme_code', 'scheme_name', 'date', 'nav']] #creating names of dataframe columns
main_df.reset_index(drop = True, inplace = True)
return main_df #Returning the required Dataframe.
values_df = HistoricalNav(scheme_code_list = scheme_code_list[0:5], start_date= '01-05-2021', end_date= '01-05-2021')
values_df
Error:
Traceback (most recent call last):
File "C:/Users/am364971/Desktop/Python/Working/amfi.py", line 31, in
values_df = HistoricalNav(scheme_code_list = scheme_code_list[0:5], start_date= '01-05-2021', end_date= '01-05-2021')
File "C:/Users/am364971/Desktop/Python/Working/amfi.py", line 22, in HistoricalNav
df = df.sort_values(by = 'date') # sorting the values of every Scheme code based on Date
File "C:\Users\am364971\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\frame.py", line 5455, in sort_values
k = self._get_label_or_level_values(by, axis=axis)
File "C:\Users\am364971\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\generic.py", line 1684, in _get_label_or_level_values
raise KeyError(key)
KeyError: 'date'
I can see that date is not a column in the given dataframe. You could check by typing df.columns if there is date column.
Related
Getting error :- 'last' only supports a DatetimeIndex index
def create_excel_file():
master_list = []
for name in filelist:
new_path = Path(name).parent
base = os.path.basename(new_path)
final = os.path.splitext(base)[0]
with open(name,"r") as f:
soupObj = bs4.BeautifulSoup(f, "lxml")
df = pd.DataFrame([(x["uri"], *x["t"].split("T"), x["u"], x["desc"])
for x in soupObj.find_all("log")],
columns=["Document", "Date", "Time", "User", "Description"])
df.insert(0, 'Database', f'{final}')
df['Document'] = df['Document'].astype(str)
df['Date'] = pd.to_datetime(df['Date']).dt.date
master_list.append(df)
df = pd.concat(master_list, axis=0, ignore_index=True)
df = df.sort_values(by='Date', ascending=True).set_index('Date').last('3M')
df = df.sort_values(by='Date', ascending=False)
df.to_excel("logfile.xlsx", index=True)
create_excel_file()
suggest me what I am doing wrong
Error message:-
Traceback (most recent call last):
File "C:\Users\Desktop\project\Final test.py", line 40, in <module>
create_excel_file()
File "C:\Users\Desktop\project\Final test.py", line 34, in create_excel_file
df = df.sort_values(by='Date', ascending=True).set_index('Date').last('3M')
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\AppData\Roaming\Python\Python311\site-packages\pandas\core\generic.py", line 9001, in last
raise TypeError("'last' only supports a DatetimeIndex index")
TypeError: 'last' only supports a DatetimeIndex index
Process finished with exit code 1
getting Error as shows above
From documentation
For a DataFrame with a sorted DatetimeIndex, this function selects the last few rows based on a date offset.
So, you need to make sure that you did your sort on a column whose values are of type Datetime
From your code below, make sure the data in column 'Date' are actually datetime
df = df.sort_values(by='Date', ascending=True).set_index('Date').last('3M')
I am having some trouble iterating through Google trends data using the pseudo google trends API PyTrends. When I use the google trends website it is fine with me using multiple words together so long as I use the "+" symbol in between words. For example: "a-kasse+akasse+arbejdsformidling+arbejdsformidlinger+dagpenge+dagpengeperiode". When I use a single word in my code, the program works as expected. But when I use multiple words concatenated it breaks.
When I use the following function, it throws a list index out of range error:
def my_funct(Keyword, Dates, Country, Col_name):
KEYWORDS=[Keyword]
KEYWORDS_CODES=[pytrend.suggestions(keyword=i)[0] for i in KEYWORDS]
df_CODES= pd.DataFrame(KEYWORDS_CODES)
EXACT_KEYWORDS=df_CODES['mid'].to_list()
DATE_INTERVAL= Dates
COUNTRY=[Country] #Use this link for iso country code
CATEGORY=0 # Use this link to select categories
SEARCH_TYPE='' #default is 'web searches',others include 'images','news','youtube','froogle' (google shopping)
Individual_EXACT_KEYWORD = list(zip(*[iter(EXACT_KEYWORDS)]*1))
Individual_EXACT_KEYWORD = [list(x) for x in Individual_EXACT_KEYWORD]
dicti = {}
i = 1
for Country in COUNTRY:
for keyword in Individual_EXACT_KEYWORD:
try:
pytrend.build_payload(kw_list=keyword,
timeframe = DATE_INTERVAL,
geo = Country,
cat = CATEGORY,
gprop = SEARCH_TYPE)
dicti[i] = pytrend.interest_over_time()
i+=1
time.sleep(9)
print(dicti)
except requests.exceptions.Timeout:
print("Timeout occured")
df_trends = pd.concat(dicti, axis=1)
df_trends.columns = df_trends.columns.droplevel(0) #drop outside header
df_trends = df_trends.drop('isPartial', axis = 1) #drop "isPartial"
df_trends.reset_index(level=0,inplace=True) #reset_index
df_trends.columns=['date', Col_name] #change column names
return df_trends
I execute that function through another function as follows:
df_merged3 = excelConcatFunct("a-kasse+akasse+arbejdsformidling+arbejdsformidlinger+dagpenge+dagpengeperiode", 'DK', 'DANISH search terms')
And here is how that function works:
def excelConcatFunct(Word, Country_code, Col_name):
# generic plug-n-chug data
x1 = my_funct(Word, '2004-01-04 2009-01-04', Country_code, Col_name)
x2 = my_funct(Word, '2009-01-05 2014-01-05', Country_code, Col_name)
x3 = my_funct(Word, '2014-01-06 2019-01-06', Country_code, Col_name)
x4 = my_funct(Word, '2019-01-07 {0}'.format(Today), Country_code, Col_name)
# generic plug-n-chug data
df1 = pd.DataFrame(x1)
df2 = pd.DataFrame(x2)
df3 = pd.DataFrame(x3)
df4 = pd.DataFrame(x4)
# Creates an empty dataframe to add to merged df for column spacing
df0 = pd.DataFrame()
df0[''] = ''
# this concats the df horizontally
df_merged = pd.concat([df1, df0, df2, df0, df3, df0, df4], axis=1)
df_merged.reset_index(inplace=True)
# This removes the dangling last column that said "Unamed:0"
df_merged = df_merged.loc[:, ~df_merged.columns.str.contains('^Unnamed')]
# This removes the dangling first index column
df_merged = df_merged.loc[:, ~df_merged.columns.str.contains('^index')]
# returns the merged dataframe
return df_merged
And here is the error message I am getting:
File "C:\Users\JohnReese\Desktop\G_Trends\G_Trender.py", line 111, in <module>
df_merged3 = excelConcatFunct("a-kasse+akasse+arbejdsformidling+arbejdsformidlinger+dagpenge+dagpengeperiode", 'DK', 'DANISH search terms')
File "C:\Users\JohnReese\Desktop\G_Trends\G_Trender.py", line 83, in excelConcatFunct
x1 = my_funct(Word, '2004-01-04 2009-01-04', Country_code, Col_name)
File "C:\Users\JohnReese\Desktop\G_Trends\G_Trender.py", line 31, in my_funct
KEYWORDS_CODES=[pytrend.suggestions(keyword=i)[0] for i in KEYWORDS]
File "C:\Users\JohnReese\Desktop\G_Trends\G_Trender.py", line 31, in <listcomp>
KEYWORDS_CODES=[pytrend.suggestions(keyword=i)[0] for i in KEYWORDS]
IndexError: list index out of range
Please help. Any and all help is welcomed.
Thank you!
In a df comprised of the columns asset_id, event_start_date, event_end_date,
I wish to add a forth column datediff that for each asset_id will capture how many days passed between a end_date and the following start_date for the same asset_id, but in case that following start_date is earlier than the current end_date, I would like to capture the difference between the two start_dates. Dataset is sorted by (asset_id, start_date asc).
In Excel it would look something like:
I tried:
events['datediff'] = df.groupby('asset_id').apply(lambda x: x['event_start_date'].shift(-1)-x['event_end_date'] if
x['event_start_date'].shift(-1)>x['event_end_date'] else x['event_start_date'].shift(-1)-x['event_start_date'] ).\
fillna(pd.Timedelta(seconds=0)).reset_index(drop=True)
But this is:
not working. Throwing ValueError: The truth value of a Series is ambiguous.
so un-elegant.
Thanks!
df = pd.DataFrame({
'asset_id':[0,0,1,1],
'event_start_date':['2019-07-08','2019-07-11','2019-07-15','2019-07-25'],
'event_end_date':['2019-07-08','2019-07-23','2019-07-29','2019-07-25']
})
df['event_end_date'] = pd.to_datetime(df['event_end_date'])
df['event_start_date'] = pd.to_datetime(df['event_start_date'])
df['next_start']=df.groupby('asset_id')['event_start_date'].shift(-1)
df['date_diff'] = np.where(
df['next_start']>df['event_end_date'],
(df['next_start']-df['event_end_date']).dt.days,
(df['next_start']-df['event_start_date']).dt.days
)
df = df.drop(columns=['next_start']).fillna(0)
So I have this script
mport pandas as pd
import numpy as np
PRIMARY_TUMOR_PATIENT_ID_REGEX = '^.{4}-.{2}-.{4}-01.*'
SHORTEN_PATIENT_REGEX = '^(.{4}-.{2}-.{4}).*'
def mutations_for_gene(df):
mutated_patients = df['identifier'].unique()
return pd.DataFrame({'mutated': np.ones(len(mutated_patients))}, index=mutated_patients)
def prep_data(mutation_path):
df = pd.read_csv(mutation_path, low_memory=True, dtype=str, header = 0)#Line 24 reads in a line memory csv file from the given path and parses it based on '\t' delimators, and casts the data to str
df = df[~df['Hugo_Symbol'].str.contains('Hugo_Symbol')] #analyzes the 'Hugo_Symbol' heading within the data and makes a new dataframe where any row that contains 'Hugo_Symbol' is dropped
df['Hugo_Symbol'] = '\'' + df['Hugo_Symbol'].astype(str) # Appends ''\'' to all the data remaining in that column
df['Tumor_Sample_Barcode'] = df['Tumor_Sample_Barcode'].str.strip() #strips away whitespace from the data within this heading
non_silent = df.where(df['Variant_Classification'] != 'Silent') #creates a new dataframe where the data within the column 'Variant_Classification' is not equal to 'Silent'
df = non_silent.dropna(subset=['Variant_Classification']) #Drops all the rows that are missing at least one element
non_01_barcodes = df[~df['Tumor_Sample_Barcode'].str.contains(PRIMARY_TUMOR_PATIENT_ID_REGEX)]
#TODO: Double check that the extra ['Tumor_Sample_Barcode'] serves no purpose
df = df.drop(non_01_barcodes.index)
print(df)
shortened_patients = df['Tumor_Sample_Barcode'].str.extract(SHORTEN_PATIENT_REGEX, expand=False)
df['identifier'] = shortened_patients
gene_mutation_df = df.groupby(['Hugo_Symbol']).apply(mutations_for_gene)
gene_mutation_df.columns = gene_mutation_df.columns.str.strip()
gene_mutation_df.set_index(['Hugo_Symbol', 'patient'], inplace=True)
gene_mutation_df = gene_mutation_df.reset_index()
gene_patient_mutations = gene_mutation_df.pivot(index='Hugo_Symbol', columns='patient', values='mutated')
return gene_patient_mutations.transpose().fillna(0)
This is the csv file that the script reads in:
identifier,Hugo_Symbol,Tumor_Sample_Barcode,Variant_Classification,patient
1,patient,a,Silent,6
22,mutated,d,e,7
1,Hugo_Symbol,f,g,88
The script gives this error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-60-3f9c00f320bc> in <module>
----> 1 prep_data('test.csv')
<ipython-input-59-2a67d5c44e5a> in prep_data(mutation_path)
21 display(gene_mutation_df)
22 gene_mutation_df.columns = gene_mutation_df.columns.str.strip()
---> 23 gene_mutation_df.set_index(['Hugo_Symbol', 'patient'], inplace=True)
24 gene_mutation_df = gene_mutation_df.reset_index()
25 gene_patient_mutations = gene_mutation_df.pivot(index='Hugo_Symbol', columns='patient', values='mutated')
e:\Anaconda3\lib\site-packages\pandas\core\frame.py in set_index(self, keys, drop, append, inplace, verify_integrity)
4546
4547 if missing:
-> 4548 raise KeyError(f"None of {missing} are in the columns")
4549
4550 if inplace:
KeyError: "None of ['Hugo_Symbol', 'patient'] are in the columns"
Previously, I had this is as that line
gene_mutation_df.index.set_names(['Hugo_Symbol', 'patient'], inplace=True)
But that also gave an error that the set_name length expects one argument but got two
Any help would be much appreciated
I would really prefer if the csv data was changed instead of the script and somehow the script could work with set_names instead of set_index
The issue is:
gene_mutation_df = df.groupby(['Hugo_Symbol']).apply(mutations_for_gene)
'Hugo_Symbol is used for a groupby, so now it's in the index, not a column
In the case of the sample data, an empty dataframe, with no columns, has been created.
gene_mutation_df = df.groupby(['Hugo_Symbol']).apply(mutations_for_gene)
print(gene_mutation_df) # print the dataframe to see what it looks like
print(gene_mutation_df.info()) # print the information for the dataframe
gene_mutation_df.columns = gene_mutation_df.columns.str.strip()
gene_mutation_df.set_index(['Hugo_Symbol', 'patient'], inplace=True)
# output
Empty DataFrame
Columns: [identifier, Hugo_Symbol, Tumor_Sample_Barcode, Variant_Classification, patient]
Index: []
Empty DataFrame
Columns: []
Index: []
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Empty DataFrameNone
reset the index
Resetting the index, will make Hugo_Symbol a column again
As long as the dataframe is not empty, the KeyError should be resolved.
gene_mutation_df = gene_mutation_df.reset_index() # try adding this line
gene_mutation_df.set_index(['Hugo_Symbol', 'patient'], inplace=True)
Addition Notes
There are a number of lines of code, that may be resulting in an empty dataframe
non_01_barcodes = df[~df['Tumor_Sample_Barcode'].str.contains(PRIMARY_TUMOR_PATIENT_ID_REGEX)]
shortened_patients = df['Tumor_Sample_Barcode'].str.extract(SHORTEN_PATIENT_REGEX, expand=False)
gene_mutation_df = df.groupby(['Hugo_Symbol']).apply(mutations_for_gene)
Test if the dataframe is empty
Use .empty to determine if a dataframe is empty
def prep_data(mutation_path):
df = pd.read_csv(mutation_path, low_memory=True, dtype=str, header = 0)#Line 24 reads in a line memory csv file from the given path and parses it based on '\t' delimators, and casts the data to str
df.columns = df.columns.str.strip() # clean the column names here if there is leading or trailing whitespace.
df = df[~df['Hugo_Symbol'].str.contains('Hugo_Symbol')] #analyzes the 'Hugo_Symbol' heading within the data and makes a new dataframe where any row that contains 'Hugo_Symbol' is dropped
df['Hugo_Symbol'] = '\'' + df['Hugo_Symbol'].astype(str) # Appends ''\'' to all the data remaining in that column
df['Tumor_Sample_Barcode'] = df['Tumor_Sample_Barcode'].str.strip() #strips away whitespace from the data within this heading
non_silent = df.where(df['Variant_Classification'] != 'Silent') #creates a new dataframe where the data within the column 'Variant_Classification' is not equal to 'Silent'
df = non_silent.dropna(subset=['Variant_Classification']) #Drops all the rows that are missing at least one element
non_01_barcodes = df[~df['Tumor_Sample_Barcode'].str.contains(PRIMARY_TUMOR_PATIENT_ID_REGEX)]
#TODO: Double check that the extra ['Tumor_Sample_Barcode'] serves no purpose
df = df.drop(non_01_barcodes.index)
print(df)
shortened_patients = df['Tumor_Sample_Barcode'].str.extract(SHORTEN_PATIENT_REGEX, expand=False)
df['identifier'] = shortened_patients
gene_mutation_df = df.groupby(['Hugo_Symbol']).apply(mutations_for_gene)
gene_mutation_df = gene_mutation_df.reset_index() # reset the index here
print(gene_mutation_df)
if gene_mutation_df.empty: # check if the dataframe is empty
print('The dataframe is empty')
else:
# gene_mutation_df.set_index(['Hugo_Symbol', 'patient'], inplace=True) # this is not needed, pivot won't work if you do this
# gene_mutation_df = gene_mutation_df.reset_index() # this is not needed, the dataframe was reset already
gene_patient_mutations = gene_mutation_df.pivot(index='Hugo_Symbol', columns='patient', values='mutated') # values needs to be a column in the dataframe
return gene_patient_mutations.transpose().fillna(0)
In Python, I have a DataFrame with column 'Date' (format e.g. 2020-06-26). This column is sorted in descending order: 2020-06-26, 2020-06-25, 2020-06-24...
The other column 'Reviews' is made of text reviews of a website. My data can have multiple reviews on a given date or no reviews on another date. I want to find what dates are missing in column 'Date'. Then, for each missing date, add one row with date in ´´format='%Y-%m-%d'´´, and an empty review on 'Reviews', to be able to plot them. How should I do this?
from datetime import date, timedelta
d = data['Date']
print(d[0])
print(d[-1])
date_set = set(d[-1] + timedelta(x) for x in range((d[0] - d[-1]).days))
missing = sorted(date_set - set(d))
missing = pd.to_datetime(missing, format='%Y-%m-%d')
idx = pd.date_range(start=min(data.Date), end=max(data.Date), freq='D')
#tried this
data = data.reindex(idx, fill_value=0)
data.head()
#Got TypeError: 'fill_value' ('0') is not in this Categorical's categories.
#also tried this
df2 = (pd.DataFrame(data.set_index('Date'), index=idx).fillna(0) + data.set_index('Date')).ffill().stack()
df2.head()
#Got ValueError: cannot reindex from a duplicate axis
This is my code:
for i in range(len(df)):
if i > 0:
prev = df.loc[i-1]["Date"]
current =df.loc[i]["Date"]
for a in range((prev-current).days):
if a > 0:
df.loc[df["Date"].count()] = [prev-timedelta(days = a), None]
df = df.sort_values("Date", ascending=False)
print(df)