I am learning Python and right now I am trying to examine percent change in stock values from a database. However, one of the variables I am trying to examine is coming from a database which is of type Series. And whenever I try to convert into a float to use it for multiplication and division, I receive an error "TypeError: cannot convert the series to ". I have seen solutions that stated to use .astype(float), but that didn't work for me. Any help would be appreciated.
import pandas as pd
import os
import time
from datetime import datetime
path = "C:/Users/andre/AppData/Local/Programs/Python/Python37/SciKit-learn Tutorial/intraQuarter"
def Key_Stats(gather = "Total Debt/Equity (mrq)"):
statspath = path + '/_KeyStats'
stock_list = [x[0] for x in os.walk(statspath)]
counter = 0
df = pd.DataFrame(columns = ['Date','Unix','Folder','DE Ratio','Price',
'Stock_pct_change','SP500','SP500_pct_change', 'Difference'])
sp500_df = pd.read_csv("YAHOO-INDEX_GSPC.csv")
ticker_list = []
for each_dir in stock_list[1:]:
each_file = os.listdir(each_dir)
folder = each_dir.split("\\")[1]
ticker_list.append(folder)
#Reset starting point for each directory
starting_stock_value = False
starting_sp500_value = False
if len(each_file) > 0:
for file in each_file:
date_stamp = datetime.strptime(file, '%Y%m%d%H%M%S.html')
unix_time = time.mktime(date_stamp.timetuple())
full_file_path = each_dir + '/' + file
file_content_source = open(full_file_path, 'r').read()
try:
value = file_content_source.split(gather)[1].split('<td class="yfnc_tabledata1">')[1].split('</td>')[0]
try:
sp500_date = datetime.fromtimestamp(unix_time).strftime('%Y-%m-%d')
row = sp500_df[(sp500_df['Date'] == sp500_date)]
sp500_value = row['Adj Close']
print(type(sp500_value))
print(sp500_value)
except:
sp500_date = datetime.fromtimestamp(unix_time-259200).strftime('%Y-%m-%d')
row = sp500_df[(sp500_df['Date'] == sp500_date)]
sp500_value = row['Adj Close']
try:
stock_price = file_content_source.split('</small><big><b>')[1].split('</b></big>')[0]
if(stock_price.endswith('</span>')):
stock_price = stock_price.split('>')[1].split('</span')[0]
except IndexError:
try:
stock_price = file_content_source.split('</small><big><b>')[1].split('</span>')[0].split('>')[1]
except IndexError:
try:
stock_price = file_content_source.split('<span id="yfs_')
seglist = [] #Created a list to store all the possible numbers that arise
for parts in stock_price:
segment = parts.split('</span>')[0].split('>')[1]
try:
#The numbers are usually 4,5, or 6 characters in length and check if str is a number
if((len(segment) == 4 or len(segment) == 5 or len(segment) == 6) and float(segment) >= 0):
seglist.append(segment) #Add potential number to list
stock_price = seglist[0] #Noticed the first number is usually the correct one
except ValueError:
pass
except IndexError:
print('Error in Folder:', folder, ' File: ', file, ' Stock Price=', stock_price)
#print('Folder:', folder, ' File', file, ' Stock Price: ', stock_price)
if not starting_stock_value:
starting_stock_value = float(stock_price)
if not starting_sp500_value:
starting_sp500_value = float(sp500_value)
#percentage change = (new-old)/old x 100
stock_pct_change = ((float(stock_price) - starting_stock_value) / starting_stock_value) * 100
#-------------------------------------------------------------------------------
#ERROR OCCURS HERE!!!!
sp500_pct_change = ((float(sp500_value) - starting_sp500_value) / starting_sp500_value) * 100
#-------------------------------------------------------------------------------
df = df.append({'Date': date_stamp,'Unix': unix_time,
'Folder': folder,'DE Ratio': value,
'Price': stock_price,
'Stock_pct_change': stock_pct_change,
'SP500': sp500_value,
'SP500_pct_change': sp500_pct_change,
'Difference': stock_pct_change-sp500_pct_change},
ignore_index = True)
except IndexError:
stock_price = file_content_source.split('<span id="yfs_')[5].split('</span>')[0].split('>')[1]
print('Error in Folder:', folder, ' File: ', file, "Value=", value, 'Stock Price=', stock_price)
#Plot
for each_ticker in ticker_list:
try:
plot_df = df[(df['Folder'] == each_ticker)]
plot_df = plot_df.set_index(['Date'])
plot_df['Difference'].plot(label = each_ticker)
plt.legend()
except:
pass
plt.show()
Key_Stats()
Error:
<class 'pandas.core.series.Series'>
2997 1131.130005
Name: Adj Close, dtype: float64
<class 'pandas.core.series.Series'>
2947 1129.439941
Name: Adj Close, dtype: float64
<class 'pandas.core.series.Series'>
2778 1198.680054
Name: Adj Close, dtype: float64
<class 'pandas.core.series.Series'>
Series([], Name: Adj Close, dtype: float64)
Traceback (most recent call last):
File "C:\Users\andre\AppData\Local\Programs\Python\Python37\SciKit-learn Tutorial\Tutorial 6 - Playing with the Data (pct_change).py", line 103, in <module>
Key_Stats()
File "C:\Users\andre\AppData\Local\Programs\Python\Python37\SciKit-learn Tutorial\Tutorial 6 - Playing with the Data (pct_change).py", line 83, in Key_Stats
sp500_pct_change = ((float(sp500_value) - starting_sp500_value) / starting_sp500_value) * 100
File "C:\Users\andre\AppData\Local\Programs\Python\Python37\lib\site-packages\pandas\core\series.py", line 93, in wrapper
"{0}".format(str(converter)))
TypeError: cannot convert the series to <class 'float'>
I guess we are working on the same project and on the same path. Here it is. I hope u understand where to insert this portion of the code
try:
sp500_date = datetime.fromtimestamp(unix_time).strftime('%Y-%m-%d')
row = sp500_df[sp500_df["Date"] == sp500_date]
sp500_value = row["Adj Close"]
sp500_value1 = sp500_value.values[0]
print(sp500_value1)
except:
sp500_date = datetime.fromtimestamp(unix_time-259200).strftime('%Y-%m-%d')
row = sp500_df[sp500_df["Date"] == sp500_date]
sp500_value = row["Adj Close"]
sp500_value1 = sp500_value.values[0]
print(sp500_value1)
Related
I am attempting to create a python script to iterate over all the rows of a specific column in an excel spredsheet. This column contains dates, I need to compare each of these dates in order to find and return the oldest date in the excel sheet. After which, I will need to modify the data in that row.
I have tried to append the dates into a numpy array as datetime objects, this was working but I cannot traverse through the array and compare the dates. I have also tried to reformat the dates in the excel sheet to datetime objects in python and then compare but I get the following error:
AttributeError: type object 'datetime.datetime' has no attribute 'datetime'
I have tried some other unsuccessful methods. These are the ones where I got closest to achieving what I want. I'm quite lost, please help!
import openpyxl
import numpy as np
import datetime
def main():
wb = openpyxl.load_workbook("C:\\Users\\User\\Desktop\\Python Telecom Project.xlsx")
sheet = wb.active
def menuSelection():
while True:
menuChoice = input("Please select one of the following options:\n1. Add User\n2.Delete User\n3.Modify User\n")
if menuChoice not in ('1', '2', '3'):
print("The input entered is invalid, please try again")
continue
else:
break
return menuChoice
def findOldestDate():
wb = openpyxl.load_workbook("C:\\Users\\User\\Desktop\\Python Telecom Project.xlsx")
sheet = wb.active
## startMult = np.empty((0,1000), dtype='datetime64[D]')
## value = datetime.date.strftime("%Y-%m-%d")
for rowNum in range(2, sheet.max_row+1):
status = sheet.cell(row=rowNum, column=5).value
d8 = sheet.cell(row=rowNum, column=6).value
d8_2 = sheet.cell(row=rowNum+1, column=6).value
d8.value = datetime.date.strftime(d8, "%Y-%m-%d")
d8_2.value = datetime.date.strftime(d8_2, "%Y-%m-%d")
d8.number_format = 'YYYY MM DD'
d8_2.number_format = 'YYYY MM DD'
if d8 < d8_2:
oldestDate = d8
elif d8 > d8_2:
oldestDate = d8_2
else:
continue
return oldestDate
## array.append(startMult, date)
##
## while counter < len(array)-1:
##
## if array[counter] < array[counter + 1]:
##
## oldestDate = array[counter]
## counter += 1
##
## elif array[counter] > array[counter + 1]:
##
## oldestDate = array[counter + 1]
## counter += 1
##
## else:
## oldestDate = array[counter]
## continue
##
## return oldestDate
def addUser():
wb = openpyxl.load_workbook("C:\\Users\\User\\Desktop\\Python Telecom Project.xlsx")
sheet = wb.active
dateTimeObj = datetime.date.today()
print("Please enter the following information:\n")
inputName = input("Name: ")
inputNTID = input("NTID: ")
inputRATSID = input("RATSID: ")
inputStatus = input("Status: ")
inputTaskNum = input("Task #: ")
for rowVal in range(2, sheet.max_row+1):
oldestDate = findOldDate()
phoneNum = sheet.cell(row=rowVal, column=1).value
name = sheet.cell(row=rowVal, column=2).value
ntID = sheet.cell(row=rowVal, column=3).value
ratsID = sheet.cell(row=rowVal, column=4).value
status = sheet.cell(row=rowVal, column=5).value
date = sheet.cell(row=rowVal, column=6).value
if date == oldestDate:
name = inputName
ntID = inputNTID
ratsID = inputRATSID
status = inputStatus
date = dateTimeObj
print("\nChanges have been implemented successfully!")
##def deleteUser():
##
##
##
##def modifyUser():
addUser()
This is the current error message:
AttributeError: type object 'datetime.datetime' has no attribute 'datetime'
Prior to this one, I was getting:
can't compare 'str' to 'datetime'
What I want is the oldest date in the column to be returned from this function.
Finding the oldest date can be achieved with a one-liner like the following:
from datetime import datetime as dt
from re import match
def oldest(sheet, column):
"""
Returns the tuple (index, timestamp) of the oldest date in the given sheet at the given column.
"""
return min([(i, dt.strptime(sheet.cell(row=i, column=column).value, '%Y %m %d').timestamp()) for i in range(2, sheet.max_row+1) if isinstance(sheet.cell(row=i, column=column).value, str) and match(r'\d{4}\s\d{2}\s\d{2}', sheet.cell(row=i, column=column).value)], key=lambda x:x[1])
The longer, slower but more readable version follow:
def oldest(sheet, column):
"""
Returns the tuple (index, timestamp) of the oldest date in the given sheet at the given column.
"""
format = '%Y %m %d'
values = list()
for i in range(2, sheet.max_row+1):
if isinstance(sheet.cell(row=i, column=column).value, str) and match(r'\d{4}\s\d{2}\s\d{2}', sheet.cell(row=i, column=column).value):
values.append((i, dt.strptime(sheet.cell(row=i, column=column).value, format).timestamp()))
return min(values, key=lambda x: x[1])
If you need that, you can convert the retrieved timestamp back in the date format you had as shown in this sample session at the python REPL:
>>> row, timestamp = oldest(sheet, 1)
>>> date = dt.utcfromtimestamp(timestamp[1]).strftime('%Y %m %d')
>>> date
'2019 10 31'
>>> row
30
I have developed a code in Python in which -in order to run the program- I need to take some arguments from the command line. But I am getting continuously the same error:
Traceback (most recent call last):
File "<string>", line 1, in <fragment>
invalid syntax: <string>, line 1, pos 16
I have the faintest idea what is wrong with my code. So, I present my code below in case someone could help me:
import QSTK.qstkutil.qsdateutil as du
import QSTK.qstkutil.tsutil as tsu
import QSTK.qstkutil.DataAccess as da
import datetime as dt
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
import math
import copy
import QSTK.qstkstudy.EventProfiler as ep
import csv
import sys
import argparse
def readData(li_startDate, li_endDate, ls_symbols):
#Create datetime objects for Start and End dates (STL)
dt_start = dt.datetime(li_startDate[0], li_startDate[1], li_startDate[2])
dt_end = dt.datetime(li_endDate[0], li_endDate[1], li_endDate[2])
#Initialize daily timestamp: closing prices, so timestamp should be hours=16 (STL)
dt_timeofday = dt.timedelta(hours=16)
#Get a list of trading days between the start and end dates (QSTK)
ldt_timestamps = du.getNYSEdays(dt_start, dt_end, dt_timeofday)
#Create an object of the QSTK-dataaccess class with Yahoo as the source (QSTK)
c_dataobj = da.DataAccess('Yahoo', cachestalltime=0)
#Keys to be read from the data
ls_keys = ['open', 'high', 'low', 'close', 'volume', 'actual_close']
#Read the data and map it to ls_keys via dict() (i.e. Hash Table structure)
ldf_data = c_dataobj.get_data(ldt_timestamps, ls_symbols, ls_keys)
d_data = dict(zip(ls_keys, ldf_data))
return [d_data, dt_start, dt_end, dt_timeofday, ldt_timestamps]
def marketsim(cash,orders_file,values_file):
orders = pd.read_csv(orders_file,index_col='Date',parse_dates=True,header=None)
ls_symbols = list(set(orders['X.4'].values))
df_lastrow = len(orders) - 1
dt_start = dt.datetime(orders.get_value(0, 'X.1'),orders.get_value(0, 'X.2'),orders.get_value(0, 'X.3'))
dt_end = dt.datetime(orders.get_value(df_lastrow, 'X.1'),orders.get_value(df_lastrow, 'X.2'),orders.get_value(df_lastrow, 'X.3') + 1 )
#d_data = readData(dt_start,dt_end,ls_symbols)
#Initialize daily timestamp: closing prices, so timestamp should be hours=16 (STL)
dt_timeofday = dt.timedelta(hours=16)
#Get a list of trading days between the start and end dates (QSTK)
ldt_timestamps = du.getNYSEdays(dt_start, dt_end, dt_timeofday)
#Create an object of the QSTK-dataaccess class with Yahoo as the source (QSTK)
c_dataobj = da.DataAccess('Yahoo', cachestalltime=0)
#Keys to be read from the data
ls_keys = ['open', 'high', 'low', 'close', 'volume', 'actual_close']
#Read the data and map it to ls_keys via dict() (i.e. Hash Table structure)
df_data = c_dataobj.get_data(ldt_timestamps, ls_symbols, ls_keys)
d_data = dict(zip(ls_keys, ldf_data))
ls_symbols.append("_CASH")
trades = pd.Dataframe(index=list(ldt_timestamps[0]),columns=list(ls_symbols))
current_cash = cash
trades["_CASH"][ldt_timestamps[0]] = current_cash
current_stocks = dict()
for symb in ls_symbols:
current_stocks[symb] = 0
trades[symb][ldt_timestamps[0]] = 0
for row in orders.iterrows():
row_data = row[1]
current_date = dt.datetime(row_data['X.1'],row_data['X.2'],row_data['X.3'],16)
symb = row_data['X.4']
stock_value = d_data['close'][symb][current_date]
stock_amount = row_data['X.6']
if row_data['X.5'] == "Buy":
current_cash = current_cash - (stock_value*stock_amount)
trades["_CASH"][current_date] = current_cash
current_stocks[symb] = current_stocks[symb] + stock_amount
trades[symb][current_date] = current_stocks[symb]
else:
current_cash = current_cash + (stock_value*stock_amount)
trades["_CASH"][current_date] = current_cash
current_stocks[symb] = current_stocks[symb] - stock_amount
trades[symb][current_date] = current_stocks[symb]
#trades.fillna(method='ffill',inplace=True)
#trades.fillna(method='bfill',inplace=False)
trades.fillna(0)
#alt_cash = current_cash
#alt_cash = trades.cumsum()
value_data = pd.Dataframe(index=list(ldt_timestamps),columns=list("V"))
value_data = value_data.fillna(0)
value_data = value_data.cumsum(axis=0)
for day in ldt_timestamps:
value = 0
for sym in ls_symbols:
if sym == "_CASH":
value = value + trades[sym][day]
else:
value = calue + trades[sym][day]*d_data['close'][sym][day]
value_data["V"][day] = value
fileout = open(values_file,"w")
for row in value_data.iterrows():
file_out.writelines(str(row[0].strftime('%Y,%m,%d')) + ", " + str(row[1]["V"].round()) + "\n" )
fileout.close()
def main(argv):
if len(sys.argv) != 3:
print "Invalid arguments for marketsim.py. It should be of the following syntax: marketsim.py orders_file.csv values_file.csv"
sys.exit(0)
#initial_cash = int (sys.argv[1])
initial_cash = 1000000
ordersFile = str(sys.argv[1])
valuesFile = str(sys.argv[2])
marketsim(initial_cash,ordersFile,valuesFile)
if __name__ == "__main__":
main(sys.argv[1:])
The input I gave to the command line was:
python marketsim.py orders.csv values.csv
I guess that the problem lies either into the imports or probably into the main function(incl. the if below the def main(argv)
I have to point out that the files orders.csv and values.csv exist and are located into the same folder.
I hope have made everything clear.
So, I am looking forward to reading your answers community-mates! :D
Thank you!
When I run it, it keeps telling me the dataframe object is not callable.
class OptionDataWebGleaner():
def __init__(self):
ticker = pd.read_csv('Yahoo_ticker_List.csv')['AUB.AX'].values
stock = raw_input('Please give the ticker of your selected option?\n')
if stock in ticker:
self.stock = stock
else:
raise TypeError('Your option is not available here.')
date_norm = raw_input('Please give your maturity date in the format of mm/dd/yyyy\n')
maturity_date = datetime.strptime(date_norm, '%m/%d/%Y').date()
self.maturity_date = maturity_date
self.today = date.today()
dates = ['1481846400', '1484870400', '1487289600']
maturity_dates = [date(2016, 12, 16), date(2017, 1, 20), date(2017, 2, 17)]
date_dict = {}
for v in zip(dates, maturity_dates):
date_dict[v[1]] = v[0]
try:
self.d = date_dict[self.maturity_date]
except:
print('Your maturuity date is not available')
option = raw_input('Please give the type of your option, either call or put\n')
self.option_type = option + 's'
#property
def crawl_data(self): # self #option_type: calls or puts. str
stock = self.stock
option_type = self.option_type
maturity_date = self.maturity_date
d = self.d
chromedriver = "/Users/Miya/Downloads/chromedriver.exe"
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)
today = self.today
## Get the url
url = 'http://finance.yahoo.com/quote/' + stock + '/options?date=' + d
## Crawl data
driver.get(url)
html_source = driver.page_source
## Beautifulsoup
soup = BeautifulSoup(html_source, 'html.parser')
if soup.find('table', option_type) is not None:
stock_price = [float(i.text) for i in soup.findAll('span', 'Fz(36px)')]
title = [i.text for i in soup.find('table', option_type).find_all('th')]
text = [i.text for i in soup.find('table', option_type).find_all('td')]
rows = [row for row in soup.find('table', option_type).find_all("tr")]
l_table = len(rows) - 1
## call/put data
dictionary = {}
dictionary['maturity_date'] = [maturity_date] * l_table
dictionary['date'] = [today] * l_table
dictionary['stock_price'] = stock_price * l_table
for j in range(10):
key = title[j]
dictionary[key] = []
for i in range(l_table):
dictionary[key].append(text[10 * i + j])
## write into dataframe
dataframe = pd.DataFrame(dictionary)
return dataframe
def clean_data(self):
dataframe = self.crawl_data()
print('Remove unexpected symbols...')
columns_to_set = ['Last Price', 'Open Interest', 'Strike', 'Volume', 'Implied Volatility']
for i in columns_to_set:
series = dataframe[i]
series_new = []
for j in series:
j = str(j)
j_new = ''.join(ch for ch in j if (ch != '%') and (ch != ','))
series_new.append(j_new)
dataframe[i] = series_new
print('Change the data type...')
## change the dtype
columns_to_change = ['Last Price', 'Open Interest', 'Strike', 'Volume', 'stock_price', 'Implied Volatility']
for i in columns_to_change:
dataframe_cleaned[i] = dataframe[i].astype(float)
print("Remove missing values...")
dataframe_cleaned = dataframe_cleaned.dropna()
# print("Clean Outliers...")
# dataframe = dataframe.loc[dataframe['Implied Volatility'] <= 2]
return dataframe_cleaned
def save_file(self):
save_file = raw_input("Do you want to save the file into csv? Type Y for yes, N or no\n ")
d = self.d
stock = self.stock
df_option = self.clean_data()
if save_file == 'Y':
csv_name = stock + d + '.csv'
df_option.to_csv(csv_name)
print("File Saved!")
def viz(self):
dataframe = self.clean_data()
stock = self.stock
time_to_maturity = []
dataframe = dataframe.sort_values(by='Strike')
## grab dataframe, then relevant data
for i, j in zip(dataframe.maturity_date, dataframe.date):
time_to_maturity.append((i - j).days / 365)
strike_price = dataframe['Strike']
# generate pseudo-implied volatility by using strike price and time-to-maturity as parameters
implied_vol = dataframe['Implied Volatility'].values
strike_price, time_to_maturity = np.meshgrid(strike_price, time_to_maturity)
fig = plot.figure(figsize=(10, 5)) ## a plot object
ax = Axes3D(fig) # create a 3D object/handle
##plot surface: array row/column stride(step size:2)
##plot surface: array row/column stride(step size:2)
surf = ax.plot_surface(strike_price, time_to_maturity, implied_vol, rstride=2, cstride=2, cmap=cm.coolwarm,
linewidth=0.5, antialiased=False)
# set x,y,a labels
ax.set_xlabel('Strike Price')
ax.set_ylabel('time to maturity')
ax.set_zlabel('implied volatility%')
plot.suptitle(stock)
plot.show()
def summary(self):
dataframe = self.clean_data
print(dataframe.describe())
OptionDataWebGleaner().viz()
The problem is the property decorator on crawl_data. This answer explains how the property decorator actually works, but basically, dataframe.crawl_data is the dataframe returned by the function, not the function. So dataframe.crawl_data() in the first line of clean_data is trying to call the dataframe, not the function.
Here's an example:
>>> class Test(object):
... #property
... def example(self):
... return 1
...
>>> t = Test()
>>> t.example
1
>>> t.example()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: 'int' object is not callable
This question really could have done with the stacktrace. It would have lead us right to line with the problematic call.
I get this error
Traceback (most recent call last):
File "C:\Users\User1\Desktop\cellh5_scripts\ewa_pnas_fate.py", line 90, in <module>
ec.combine_classifiers("Event labels combined")
File "C:\Users\User1\Desktop\cellh5_scripts\ewa_pnas_fate.py", line 53, in combine_classifiers
pnas_class[pnas_class==3] = 1
TypeError: 'numpy.int32' object does not support item assignment
by runing the code
def combine_classifiers(self, output_name):
all_combined_classes = []
for _, (plate_name, w, p, t1, t2, track_ids, track_labels) in self.mapping[['Plate',
'Well',
'Site',
'Gene Symbol',
'siRNA ID',
'Event track ids',
'Event track labels']].iterrows():
combined_classes = []
ch5_file_handle = self.cellh5_handles[plate_name]
ch5_pos = ch5_file_handle.get_position(w, str(p))
for track_id, track_label in zip(track_ids, track_labels):
h2b_class = track_label.copy()
print(track_id)
pnas_class = ch5_pos.get_class_prediction('secondary__expanded')[track_id]['label_idx'] + 1
print(pnas_class)
inter_idx = h2b_class == 1
pnas_class[pnas_class==3] = 1
pnas_class[pnas_class==2]+=2
combined_class = h2b_class
combined_class[inter_idx] = pnas_class[inter_idx]
combined_classes.append(combined_class)
all_combined_classes.append(combined_classes)
self.mapping[output_name] = pandas.Series(all_combined_classes)
I print pnas_class which is 1, and track_id which is 50708. I'm wondering what the designer of code want to do in the part:
inter_idx = h2b_class == 1
pnas_class[pnas_class==3] = 1
pnas_class[pnas_class==2]+=2
combined_class = h2b_class
combined_class[inter_idx] = pnas_class[inter_idx]
How can I change that to have the same meaning?
pnas_class is a an integer so you can't select item from an integer by [pnas_class==3] = 1.
Maybe you are trying to affect 1 to pnas_class if it's equal to 3. In this case try this:
pnas_class= 1*(pnas_class == 3) + pnas_class*(pnas_class != 3 )
Ok I found the mistake. You arer right the pnas_class should not be an integer and I know why is it integer instead of array.
I am taking an Udacity programming course and have been sitting on the same problem for a week. I finally think I am close to getting it right, but I don't get the last objection. Here is my code:
def process_file(f):
# This is example of the datastructure you should return
# Each item in the list should be a dictionary containing all the relevant data
# Note - year, month, and the flight data should be integers
# You should skip the rows that contain the TOTAL data for a year
# data = [{"courier": "FL",
# "airport": "ATL",
# "year": 2012,
# "month": 12,
# "flights": {"domestic": 100,
# "international": 100}
# },
# {"courier": "..."}
# ]
data = []
info = {}
info["courier"], info["airport"] = f[:6].split("-")
with open("{}/{}".format(datadir, f), "r") as html:
soup = BeautifulSoup(html)
car = str(html)[17:19]
airp = str(html)[20:23]
mydict = {}
x = 0
table = soup.find("table", {"class": "dataTDRight"})
rows = table.find_all('tr')
for row in rows:
cells = row.find_all('td')
year = cells[0].get_text()
year = (year.encode('ascii'))
Month = cells[1].get_text()
Month = (Month.encode('ascii'))
domestic = cells[2].get_text()
domestic = (domestic.encode('ascii'))
international = cells[3].get_text()
international = (international.encode('ascii'))
if Month != "Month" and Month != "TOTAL":
Month = int(Month)
year = int(year)
domestic = int(domestic.replace(',', ''))
international = int(international.replace(',', ''))
mydict['courier'] = car
mydict['airport'] = airp
mydict['year'] = year
mydict['month'] = Month
mydict['flights'] = (domestic, international)
data.append(mydict.copy())
#print type(domestic)
#print mydict
print data
return data
def test():
print "Running a simple test..."
open_zip(datadir)
files = process_all(datadir)
data = []
for f in files:
data += process_file(f)
assert len(data) == 399
for entry in data[:3]:
assert type(entry["year"]) == int
assert type(entry["month"]) == int
assert type(entry["flights"]["domestic"]) == int
assert len(entry["airport"]) == 3
assert len(entry["courier"]) == 2
assert data[-1]["airport"] == "ATL"
assert data[-1]["flights"] == {'international': 108289, 'domestic': 701425}
print "... success!"
The error message I get is:
Traceback (most recent call last):
File "vm_main.py", line 33, in <module>
import main
File "/tmp/vmuser_elbzlfkcpw/main.py", line 2, in <module>
import studentMain
File "/tmp/vmuser_elbzlfkcpw/studentMain.py", line 2, in <module>
process.test()
File "/tmp/vmuser_elbzlfkcpw/process.py", line 114, in test
assert type(entry["flights"]["domestic"]) == int
TypeError: tuple indices must be integers, not str
I am a total beginner, I checked both the type of domestic, and international, they are both int.
Can anybody tell me where I can look up or what I did wrong?
You created a tuple here:
mydict['flights'] = (domestic, international)
so mydict['flights'] is a tuple. But you try to treat it as a dictionary here:
assert type(entry["flights"]["domestic"]) == int
That won't work; you'll need to use integer indices here:
assert type(entry["flights"][0]) == int
or better still, use isinstance() to test for types:
assert isinstance(entry["flights"][0], int)
Here you assign your data mydict['flights'] as a tuple.
def process_file(f):
# Omitted code...
mydict['flights'] = (domestic, international)
Your error then comes from an illegal access to that data type. You are attempting to access the first item of that tuple by the name of variable you used in assignment:
assert type(entry["flights"]["domestic"]) == int
You either need to access your data via an integer index:
assert type(entry["flights"][0]) == int
Or you need to change your assignment to:
mydict['flights'] = {"domestic":domestic, "international":international}
tuples are immutable data types which are indexed by integers. The type of access you are attempting is typical of a dictionary, where indexes can be of any type.