Getting financial data into data frames for multiple tickers - python

here is the code :
tickers = ['AMZN','AAPL','MSFT','DIS','GOOG']
# Created individual dataframes for each category of data and tickers
# For every ticker three frames are fetched: balance sheet (BS), income
# statement (IS) and cash flow (CF). Each frame's four columns are renamed to
# Period0..Period3 and the column axis is labelled with the ticker symbol.
# NOTE(review): `yfs` is not imported anywhere in this snippet; it must already
# be in scope and provide get_balance_sheet / get_income_statement / get_cash_flow.

# --- tickers[0] ---
BS0=yfs.get_balance_sheet(tickers[0])
IS0=yfs.get_income_statement(tickers[0])
CF0=yfs.get_cash_flow(tickers[0])
BS0.columns = ['Period0','Period1','Period2','Period3']
IS0.columns = ['Period0','Period1','Period2','Period3']
CF0.columns = ['Period0','Period1','Period2','Period3']
BS0.columns.name = tickers[0]
IS0.columns.name = tickers[0]
CF0.columns.name = tickers[0]
# --- tickers[1] ---
BS1=yfs.get_balance_sheet(tickers[1])
IS1=yfs.get_income_statement(tickers[1])
CF1=yfs.get_cash_flow(tickers[1])
BS1.columns = ['Period0','Period1','Period2','Period3']
IS1.columns = ['Period0','Period1','Period2','Period3']
CF1.columns = ['Period0','Period1','Period2','Period3']
BS1.columns.name = tickers[1]
IS1.columns.name = tickers[1]
CF1.columns.name = tickers[1]
# --- tickers[2] ---
BS2=yfs.get_balance_sheet(tickers[2])
IS2=yfs.get_income_statement(tickers[2])
CF2=yfs.get_cash_flow(tickers[2])
BS2.columns = ['Period0','Period1','Period2','Period3']
IS2.columns = ['Period0','Period1','Period2','Period3']
CF2.columns = ['Period0','Period1','Period2','Period3']
BS2.columns.name = tickers[2]
IS2.columns.name = tickers[2]
CF2.columns.name = tickers[2]
# --- tickers[3] ---
BS3=yfs.get_balance_sheet(tickers[3])
IS3=yfs.get_income_statement(tickers[3])
CF3=yfs.get_cash_flow(tickers[3])
BS3.columns = ['Period0','Period1','Period2','Period3']
IS3.columns = ['Period0','Period1','Period2','Period3']
CF3.columns = ['Period0','Period1','Period2','Period3']
BS3.columns.name = tickers[3]
IS3.columns.name = tickers[3]
CF3.columns.name = tickers[3]
# --- tickers[4] ---
BS4=yfs.get_balance_sheet(tickers[4])
IS4=yfs.get_income_statement(tickers[4])
CF4=yfs.get_cash_flow(tickers[4])
BS4.columns = ['Period0','Period1','Period2','Period3']
IS4.columns = ['Period0','Period1','Period2','Period3']
CF4.columns = ['Period0','Period1','Period2','Period3']
BS4.columns.name = tickers[4]
IS4.columns.name = tickers[4]
CF4.columns.name = tickers[4]
I have tried writing this with a `for ticker in tickers` loop and then converting the result to a data frame with pandas, but this gives me one huge data frame with all the information packed into individual cells instead of columns, and I have no idea how to spread it out in a way that makes sense for referencing.
Maybe there is a way to do this, or simply a way to create a loop that saves separate data frames like the code above but with fewer lines.
Thanks in advance

You can try maintaining a dictionary of dataframes
import pandas as pd
tickers = ['AMZN', 'AAPL', 'MSFT', 'DIS', 'GOOG']
column_names = ['Period0', 'Period1', 'Period2', 'Period3']

# One fetcher per statement type, keyed by the prefix used in the result keys.
# NOTE(review): `yfs` is not imported in this snippet; it must already be in
# scope and provide these three functions.
statement_fetchers = {
    'BS': yfs.get_balance_sheet,
    'IS': yfs.get_income_statement,
    'CF': yfs.get_cash_flow,
}

# Maps 'BS0', 'IS0', 'CF0', 'BS1', ... to the corresponding dataframe, where
# the number is the ticker's position in `tickers`.
ticker_dfs = {}
for index, ticker in enumerate(tickers):
    for prefix, fetch in statement_fetchers.items():
        frame = fetch(ticker)
        # Fresh list per frame so the frames never share one columns object.
        frame.columns = list(column_names)
        # Label the column axis with the ticker so each frame identifies itself.
        frame.columns.name = ticker
        ticker_dfs[prefix + str(index)] = frame

Related

Creating new df from series of widget boxes

I have created an "input form" with several ipywidget boxes. I want to be able to reference all the values to create a new dataframe.
I'm currently doing this in a horrible way.
# First row: the value of child 0 from each of the four VBoxes, one per column.
portfolio_df = pd.DataFrame([[VBox1.children[0].value, VBox2.children[0].value, VBox3.children[0].value, VBox4.children[0].value]],
                            columns=['Product Name','Units','Price', 'Invested Amount'])
# Every further row i takes child i from each VBox and is appended at the next
# integer label (len(portfolio_df)). All four boxes must use the same index.
row_2 = [VBox1.children[1].value, VBox2.children[1].value, VBox3.children[1].value, VBox4.children[1].value]
portfolio_df.loc[len(portfolio_df)] = row_2
row_3 = [VBox1.children[2].value, VBox2.children[2].value, VBox3.children[2].value, VBox4.children[2].value]
portfolio_df.loc[len(portfolio_df)] = row_3
row_4 = [VBox1.children[3].value, VBox2.children[3].value, VBox3.children[3].value, VBox4.children[3].value]
portfolio_df.loc[len(portfolio_df)] = row_4
and so on up to row 23 in this instance! (The length will vary with the number of children within a VBox.)
I suspect I can do this more pythonically using a for loop, but I can't figure it out.
Full code as per requests (I've edited columns so my live data is different but this is exact replica of the set up)
import pandas as pd
import numpy as np
import datetime as dt
import ipywidgets as ipw
from ipywidgets import *
# Demo data: 24 rows of random numbers, converted to strings.
barrier_list = pd.DataFrame(np.random.randn(24, 4),
                            columns=('Product Name', 'ISIN', 'A', 'B'))
barrier_list = barrier_list.astype(str)
row_count = len(barrier_list['Product Name'])

# Column 1 of the form: one input per row, labelled with that row's 'ISIN'
# value and pre-filled with its 'Product Name' value.
dd_list = []
for i in range(row_count):
    dropdown = ipw.FloatText(description=barrier_list['ISIN'][i],
                             value=barrier_list['Product Name'][i],
                             disabled=False,
                             layout={'width': '350px'})
    dropdown.style.description_width = 'initial'
    dd_list.append(dropdown)

# Columns 2-4: zero-initialised numeric inputs, one per row.
dd_list1 = [ipw.FloatText(description='Units', value=0, layout={'width': '200px'})
            for _ in range(row_count)]
dd_list2 = [ipw.FloatText(description='Price', value=0, layout={'width': '200px'})
            for _ in range(row_count)]
dd_list3 = [ipw.FloatText(description='Value', value=0, layout={'width': '200px'})
            for _ in range(row_count)]

# Stack each column vertically, then lay the four columns out side by side.
VBox1 = ipw.VBox(dd_list)
VBox2 = ipw.VBox(dd_list1)
VBox3 = ipw.VBox(dd_list2)
VBox4 = ipw.VBox(dd_list3)
HBox = ipw.HBox([VBox1, VBox2, VBox3, VBox4])
I solved this one by looping through the VBoxes one by one and then combining the results into one main dataframe.
# Read the current value of every widget, one list per VBox, and build the
# frame in a single step. Each VBox becomes one column; row i holds child i of
# every box. Building from lists avoids growing a frame row by row with
# DataFrame.append, which no longer exists in pandas 2.x.
portfolio_df = pd.DataFrame({
    'Product Name': [child.value for child in VBox1.children],
    'Units': [child.value for child in VBox2.children],
    'Price': [child.value for child in VBox3.children],
    'Value': [child.value for child in VBox4.children],
})
portfolio_df

How to convert a CSV table into COCO format in python?

I have a CSV table with the following columns:
column_names = ['image_id', 'xmin', 'ymin', 'width', 'height', 'xmax','ymax']
where xmin, ymin, xmax and ymax represent the bounding box that encloses some object; width and height are the image dimensions; and image_id is the file name (.JPG file). Since I want to do object detection, I need to convert this table into COCO format. Amazingly enough, I can't find any answer to this question on the internet.
I had the same issue before and found the code below very helpful.
You will need to rename the columns in your CSV file to the following:
column_names =['filename','class','width', 'height','xmin','ymin','xmax','ymax']
then try this code
import numpy as np
import json
import pandas as pd
# Input CSV and output COCO JSON locations.
path = 'annotations.csv'
save_json_path = 'traincoco.json'

data = pd.read_csv(path)

# Placeholder category with id 0; the ids derived from the CSV start at 1.
category = {"supercategory": 'none', "id": 0, "name": 'None'}
images, annotations = [], []
categories = [category]

# Integer ids for COCO: one per distinct file name, one per distinct class
# (shifted by one so that 0 stays reserved), and one per CSV row.
data['fileid'] = data['filename'].astype('category').cat.codes
data['categoryid'] = pd.Categorical(data['class'], ordered=True).codes + 1
data['annid'] = data.index
def image(row):
    """Build the COCO ``images`` entry for one row.

    Args:
        row: object exposing ``height``, ``width``, ``fileid`` and
            ``filename`` attributes (e.g. a row from ``itertuples()``).

    Returns:
        dict with the keys ``height``, ``width``, ``id`` and ``file_name``.
    """
    return {
        "height": row.height,
        "width": row.width,
        "id": row.fileid,
        "file_name": row.filename,
    }
def category(row):
    """Build the COCO ``categories`` entry for one row.

    Args:
        row: indexable row exposing a ``categoryid`` attribute, with the
            class name at position 2.

    Returns:
        dict with the keys ``supercategory``, ``id`` and ``name``.
    """
    return {
        "supercategory": 'None',
        "id": row.categoryid,
        # Read by position because `class` is a Python keyword and cannot be
        # used as an attribute name. Position 2 presumably matches the
        # documented column order (index, filename, class, ...) - verify
        # against the CSV if the columns are reordered.
        "name": row[2],
    }
def annotation(row):
    """Build the COCO ``annotations`` entry for one bounding-box row.

    Args:
        row: object exposing ``xmin``, ``ymin``, ``xmax``, ``ymax``,
            ``fileid``, ``categoryid`` and ``annid`` attributes.

    Returns:
        dict with an empty ``segmentation``, ``iscrowd`` set to 0, the box
        ``area``, and ``bbox`` as ``[xmin, ymin, box width, box height]``.
    """
    # The row stores corner coordinates; the bbox here is corner + extent.
    box_width = row.xmax - row.xmin
    box_height = row.ymax - row.ymin
    return {
        "segmentation": [],
        "iscrowd": 0,
        "area": box_width * box_height,
        "image_id": row.fileid,
        "bbox": [row.xmin, row.ymin, box_width, box_height],
        "category_id": row.categoryid,
        "id": row.annid,
    }
# One annotation per CSV row.
annotations.extend(annotation(row) for row in data.itertuples())

# One image entry per distinct file id, in ascending id order.
imagedf = data.drop_duplicates(subset=['fileid']).sort_values(by='fileid')
images.extend(image(row) for row in imagedf.itertuples())

# One category entry per distinct category id, appended after whatever
# `categories` already holds.
catdf = data.drop_duplicates(subset=['categoryid']).sort_values(by='categoryid')
categories.extend(category(row) for row in catdf.itertuples())

data_coco = {
    "images": images,
    "categories": categories,
    "annotations": annotations,
}
# Use a context manager so the output file is flushed and closed.
with open(save_json_path, "w") as json_file:
    json.dump(data_coco, json_file, indent=4)

Webscraping data from a json source, why i get only 1 row?

I'm trying to get some information from a webshop's website with Python.
I tried this one:
# Fetch the shop's product listing as JSON, collect name / gross price / url
# for each product, and write the collected rows to a CSV file.
# NOTE(review): indentation was lost in this paste, so it cannot be told from
# the text which statements belong to the `for` loop or to the function body.
def proba():
my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
data = my_url.json()
results = []
# The product list sits a few levels down in the response.
products = data['MainContent'][0]['contents'][0]['productList']['products']
for product in products:
name = product['productModel']['displayName']
try:
priceGross = product['priceInfo']['priceItemSale']['gross']
except:
# Fallback used whenever the 'priceItemSale' lookup above raises.
priceGross = product['priceInfo']['priceItemToBase']['gross']
url = product['productModel']['url']
results.append([name, priceGross, url])
df = pd.DataFrame(results, columns = ['Name', 'Price', 'Url'])
# print(df) ## print df
df.to_csv(r'/usr/src/Python-2.7.13/test.csv', sep=',', encoding='utf-8-sig',index = False )
# Polling loop: calls proba() while the HH:MM:SS clock string compares below
# "23:59:59" (a string comparison, not a time comparison).
while True:
mytime=datetime.now().strftime("%H:%M:%S")
while mytime < "23:59:59":
# Python 2 print statement.
print mytime
proba()
mytime=datetime.now().strftime("%H:%M:%S")
In this webshop there are 9 items, but I see only 1 row in the CSV file.
Not entirely sure what you intend as end result. Are you wanting to update an existing file? Get data and write out all in one go? Example of latter shown below where I add each new dataframe to an overall dataframe and use a Return statement for the function call to provide each new dataframe.
import requests
from datetime import datetime
import pandas as pd
def proba():
    """Fetch the shop's product listing and return it as a DataFrame.

    Returns:
        pandas.DataFrame with the columns ``Name``, ``Price`` and ``Url``,
        one row per product in the JSON response.

    Raises:
        KeyError: if the response lacks the expected structure, or a product
            has neither a sale price nor a base price.
    """
    my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
    data = my_url.json()
    products = data['MainContent'][0]['contents'][0]['productList']['products']

    results = []
    for product in products:
        model = product['productModel']
        price_info = product['priceInfo']
        try:
            priceGross = price_info['priceItemSale']['gross']
        except (KeyError, TypeError):
            # No usable sale price (key missing or value not subscriptable):
            # fall back to the base price instead of hiding unrelated errors.
            priceGross = price_info['priceItemToBase']['gross']
        results.append([model['displayName'], priceGross, model['url']])

    return pd.DataFrame(results, columns=['Name', 'Price', 'Url'])
# Accumulate the frame returned by each proba() call into one overall frame
# and write that frame to CSV.
# NOTE(review): indentation was lost in this paste; which statements sit inside
# the inner loop, and whether to_csv is meant to run inside or after the
# loops, cannot be determined from the text - confirm before running.
headers = ['Name', 'Price', 'Url']
# Empty frame with the final column layout; each fetch is concatenated onto it.
df = pd.DataFrame(columns = headers)
while True:
mytime = datetime.now().strftime("%H:%M:%S")
# String comparison on the zero-padded HH:MM:SS clock text.
while mytime < "23:59:59":
print(mytime)
dfCurrent = proba()
mytime=datetime.now().strftime("%H:%M:%S")
df = pd.concat([df, dfCurrent])
df.to_csv(r"C:\Users\User\Desktop\test.csv", encoding='utf-8')

Added a not desired column in csv

I have this code
from sklearn import tree
# Train a decision tree on the training CSV, predict on the test CSV and write
# the predictions to 5.csv.
# NOTE(review): `pd` and `np` are used below but not imported in this snippet.
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url)
# Encode Sex as 0 (male) / 1 (female) using chained indexing assignment.
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1
# Fill missing Embarked with "S" and missing Age with the column median.
train["Embarked"] = train["Embarked"].fillna("S")
train["Age"] = train["Age"].fillna(train["Age"].median())
# Encode Embarked as S=0, C=1, Q=2.
train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2
# Target is Survived; the features are the four columns listed below.
target = train["Survived"].values
features_one = train[["Pclass", "Sex", "Age", "Fare"]].values
my_tree_one = tree.DecisionTreeClassifier()
my_tree_one = my_tree_one.fit(features_one, target)
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url)
# Overwrite the Fare at label 152 with the column median
# (presumably a missing value in the test set - verify).
test.Fare[152] = test["Fare"].median()
# Same encoding and filling steps as for the training data.
test["Sex"][test["Sex"] == "male"] = 0
test["Sex"][test["Sex"] == "female"] = 1
test["Embarked"] = test["Embarked"].fillna("S")
test["Age"] = test["Age"].fillna(test["Age"].median())
test["Embarked"][test["Embarked"] == "S"] = 0
test["Embarked"][test["Embarked"] == "C"] = 1
test["Embarked"][test["Embarked"] == "Q"] = 2
test_features = test[["Pclass", "Sex", "Age", "Fare"]].values
my_prediction = my_tree_one.predict(test_features)
# The predictions become the single data column; PassengerId becomes the index.
PassengerId = np.array(test["PassengerId"]).astype(int)
my_solution = pd.DataFrame(my_prediction, PassengerId)
# Two index labels are passed here although the frame has only one index
# level; this is the line the question is about.
my_solution.to_csv("5.csv", index_label = ["PassangerId", "Survived"])
As you can see, I only want to save a CSV with two columns, but when I look at the file 5.csv another column called 0 has been added. Does anybody know why?
You're seeing this behaviour because you're adding two index_labels when there is only one index.
You can instead name your one column as such:
my_solution.columns = ['Survived']
And then label your index like so:
my_solution.to_csv("5.csv", index_label=["PassengerId"])
Try this slightly optimized solution:
from sklearn import tree
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
# Feature columns used for both training and prediction.
cols = ["Pclass", "Sex", "Age", "Fare"]
# Per-column value mappings for the non-numeric columns.
mappings = {
    'Sex': {'male': 0, 'female': 1},
}


def cleanup(df, mappings=mappings):
    """Map categorical columns to numbers and fill missing values, in place.

    Args:
        df: DataFrame to clean; it is modified and also returned.
        mappings: dict of ``{column name: {old value: new value}}``. Every
            listed column must exist in ``df``; values without a mapping
            entry become NaN and are then filled like any other NaN.

    Returns:
        The same ``df`` object, with each mapped column replaced and every
        column that contained NaN filled with that column's mean.
    """
    # map non-numeric columns
    for column, mapping in mappings.items():
        df[column] = df[column].map(mapping)
    # replace NaN's with average value
    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column, which may act on a temporary and leave df unchanged.
    for column in df.columns[df.isnull().any()]:
        df[column] = df[column].fillna(df[column].mean())
    return df
# parse train data set
train = cleanup(pd.read_csv(train_url, usecols=cols + ['Survived']))
my_tree_one = tree.DecisionTreeClassifier()
# Select the features by name so training and prediction use the same column
# order, whatever order the CSV files store them in.
my_tree_one.fit(train[cols], train['Survived'])
# parse test data set
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url, usecols=cols + ['PassengerId'])
# Move PassengerId out of the feature frame into the result frame.
result = test.pop('PassengerId').to_frame('PassengerId')
test = cleanup(test)
result['Survived'] = my_tree_one.predict(test[cols])
# index=False keeps the output to exactly two columns: PassengerId, Survived.
result.to_csv("5.csv", index=False)

I want to create a time series of monthly means in Pandas

I have a dataframe that consists of hourly data for a whole year. I want to calculate the monthly means and show them in a time series plot. I have one variable which is NO2 values.
#Cleaning data
# Read the first three columns of the CSV, skipping 4 leading rows and the
# last row, and treat the text 'No data' as missing.
ck_2000 = pd.read_csv('2000-CamdenKerbside.csv', header=0,skiprows=4,usecols=range(0,3),skipfooter = 1, na_values = 'No data',engine = 'python')
colnames = ['Date', 'Time', 'NO2']
ck_2000.columns = colnames
#Reformat date/time
# '24:00:00' is rewritten to '00:00:00' before parsing; the Date value is left unchanged.
ck_2000.Time.replace(to_replace = '24:00:00', value = '00:00:00', inplace = True)
dtw = pd.to_datetime(ck_2000.Date + ck_2000.Time,format='%d/%m/%Y%H:%M:%S')
ck_2000.index = dtw
#Index dataframe by date
firstDate = ck_2000.index[0]
lastDate = ck_2000.index[len(ck_2000.Date) - 1]
# Reindex onto an hourly range from the first to the last timestamp.
ck2000 = ck_2000.reindex(index=pd.date_range(start = firstDate, end =lastDate, freq = '1H'), fill_value= None)
#Cast the non-missing NO2 values to int64 (the code casts to int, not float)
ck2000['NO2'] = ck2000['NO2'].dropna().astype('int64')
#Interpolation
# NOTE(review): this interpolates ck_2000 (the original frame), not the
# reindexed ck2000 built above - confirm which one was intended.
ck_2000_int = ck_2000.interpolate()
#df's for all months
# One slice per calendar month of 2000, selected by 'YYYY-MM' string.
ck_2000_jan = ck_2000_int['2000-01']
ck_2000_feb = ck_2000_int['2000-02']
ck_2000_mar = ck_2000_int['2000-03']
ck_2000_apr = ck_2000_int['2000-04']
ck_2000_may = ck_2000_int['2000-05']
ck_2000_jun = ck_2000_int['2000-06']
ck_2000_jul = ck_2000_int['2000-07']
ck_2000_aug = ck_2000_int['2000-08']
ck_2000_sept = ck_2000_int['2000-09']
ck_2000_oct = ck_2000_int['2000-10']
ck_2000_nov = ck_2000_int['2000-11']
ck_2000_dec = ck_2000_int['2000-12']
You should be able to use resample.
Consider the following example:
# Hourly timestamps from the first to the last hour of 2000.
tidx = pd.date_range(start='2000-01-01', end='2000-12-31 23:00', freq='H')
# Random stand-in for the interpolated NO2 series, indexed by those timestamps.
ck_2000_int = pd.DataFrame({'NO2': np.random.randn(len(tidx))}, index=tidx)
# Group the hourly rows into calendar months, average each month, and plot.
(
    ck_2000_int
    .resample('M')
    .mean()
    .plot()
)

Categories