Using .pivot() after saving and loading from CSV causes KeyError - python

I am trying to pull data from Yahoo! Finance for analysis and am having trouble when I want to read from a CSV file instead of downloading from Yahoo! every time I run the program.
import pandas_datareader as pdr
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
def get(tickers, startdate, enddate):
    """Download Yahoo! Finance history for several tickers as one frame.

    Each ticker is fetched with ``pdr.get_data_yahoo`` and the per-ticker
    frames are stacked with ``pd.concat``, using the ticker symbols as the
    outer index level.

    Args:
        tickers: Iterable of ticker symbols.
        startdate: First date to request, forwarded as ``start``.
        enddate: Last date to request, forwarded as ``end``.

    Returns:
        The concatenated frame, with index levels named ``'Ticker'`` and
        ``'Date'``.

    Raises:
        ValueError: Raised by ``pd.concat`` when ``tickers`` is empty.
    """
    # Materialize once: the symbols are needed twice (to download and as
    # concat keys), so a one-shot iterable would otherwise be exhausted
    # before the keys are read.
    symbols = list(tickers)
    frames = [
        pdr.get_data_yahoo(symbol, start=startdate, end=enddate)
        for symbol in symbols
    ]
    return pd.concat(frames, keys=symbols, names=['Ticker', 'Date'])
# Symbols to download and compare.
tickers = ['AAPL', 'MSFT', 'GOOG']
# Fetch every ticker through get(); get() names the resulting index levels 'Ticker' and 'Date'.
all_data = get(tickers, datetime.datetime(2006, 10,1), datetime.datetime(2018, 1, 7))
# Save the download so later runs can read the file instead of hitting Yahoo! again.
all_data.to_csv('data/alldata.csv')
#Open file
# Only 'Date' is restored as the index here; the frame read back is therefore not indexed the same way as all_data.
all_data_csv = pd.read_csv('data/alldata.csv', header = 0, index_col = 'Date', parse_dates = True)
# NOTE: this pivots the in-memory all_data, not the all_data_csv that was just loaded.
daily_close = all_data[['Adj Close']].reset_index().pivot('Date', 'Ticker', 'Adj Close')
I'm having problems with the 'daily_close' section. The above code works as it is using 'all_data' which comes directly from the web. How do I alter the bottom line of code so that the data is being pulled from my csv file? I have tried daily_close = all_data_csv[['Adj Close']].reset_index().pivot('Date', 'Ticker', 'Adj Close') however this results in a KeyError due to 'Ticker'.
The csv data is in the following format, with the first column containing all of the tickers:

Your current code for all_data_csv will not work as it did for all_data. This is a consequence of the fact that all_data contains a MultiIndex with all the information needed to carry out the pivot.
However, in the case of all_data_csv, the only index is Date. So, we'd need to do a little extra in order to get this to work.
First, reset the Date index
Select only the columns you need - ['Date', 'Ticker', 'Adj Close']
Now, pivot on these columns
c = ['Date', 'Ticker', 'Adj Close']
# Move 'Date' out of the index, keep just the three columns needed, then
# pivot. The arguments are named because pandas 2.0 made pivot()'s
# parameters keyword-only, so unpacking the list positionally raises
# TypeError there.
daily_close = all_data_csv.reset_index('Date')[c].pivot(
    index='Date', columns='Ticker', values='Adj Close'
)
daily_close.head()
Ticker AAPL GOOG MSFT
Date
2006-10-02 9.586717 199.422943 20.971155
2006-10-03 9.486828 200.714539 20.978823
2006-10-04 9.653308 206.506866 21.415722
2006-10-05 9.582876 204.574448 21.400393
2006-10-06 9.504756 208.891357 21.362070

Related

Is it possible to create a master dataframe for many companies from yfinance?

My objectives are to:
Get Yahoo finance OHLC (Open, High, Low, and Close) data into Postgres.
Being able to update the data easily.
Being able to easily add or remove tickers.
My current methodology:
Create pandas dataframe.
Dump data to .csv
From Postgres COPY
ISSUE:
I do not know how to create a dataframe for company A, then append (merge, join, concat, etc.) the dataframes for the other companies (~150 companies so far) and dump the result to .csv.
Below are my actual code and a workaround that provides for the desired result but is clunky.
Let me know what you think.
ACTUAL (not working as expected)
import pandas as pd
import yfinance as yf
# Symbols whose history should end up in one CSV.
tickers = ['VIR','PATH']
#ticker = ['VIR']
for ticker in tickers:
# One download per symbol; the date bounds are commented out, so yfinance's defaults apply.
df_yahoo = yf.download(ticker,
#start='2000-01-01',
#end='2010-12-31',
progress='True')
# NOTE(review): df is reassigned rather than accumulated, so it only ever holds one ticker's frame.
df = pd.DataFrame(df_yahoo)
# Tag the rows with their symbol as the first column.
df.insert(0, 'TICKER', ticker)
file_name = "/Users/kevin/Dropbox/Programming/Python/test_data/deleteme.csv"
# Writes the current df only; nothing from another ticker is combined into it before this call.
df.to_csv(file_name)
print(df)
WORKAROUND (working)
import pandas as pd
import yfinance as yf
import pickle
# First pass: download VIR into df.
tickers = ['VIR']
#ticker = ['VIR']
for ticker in tickers:
df_yahoo = yf.download(ticker,
#start='2000-01-01',
#end='2010-12-31',
progress='True')
df = pd.DataFrame(df_yahoo)
# Tag the rows with their symbol as the first column.
df.insert(0, 'TICKER', ticker)
# Second pass: the same steps repeated for PATH, stored in df1.
tickers = ['PATH']
#ticker = ['VIR']
for ticker in tickers:
df_yahoo = yf.download(ticker,
#start='2000-01-01',
#end='2010-12-31',
progress='True')
df1 = pd.DataFrame(df_yahoo)
df1.insert(0, 'TICKER', ticker)
# Stack the two per-ticker frames row-wise (PATH first, then VIR) and write the combined frame.
frames = [df1, df]
result = pd.concat(frames)
file_name = "/Users/kevin/Dropbox/Programming/Python/test_data/deleteme.csv"
result.to_csv(file_name)
# NOTE: this prints df (the VIR frame), not the combined result.
print(df)
Given what I think you want to accomplish, this is how I would do it:
# Create a function to load the data and create the frame
def build_df(tickers, **download_kwargs):
    """Download each ticker with yfinance and stack the results row-wise.

    Every per-ticker frame gets a leading ``'TICKER'`` column holding its
    symbol, and the frames are concatenated in the order given.

    Args:
        tickers: Iterable of ticker symbols. May be empty.
        **download_kwargs: Extra keyword arguments forwarded to
            ``yf.download``, e.g. ``start='2000-01-01'`` and
            ``end='2010-12-31'``. ``progress`` defaults to ``True``.

    Returns:
        One DataFrame containing the rows of every ticker, or an empty
        frame with only the ``'TICKER'`` column when no tickers are given.
    """
    download_kwargs.setdefault('progress', True)

    frames = []
    for ticker in tickers:
        frame = pd.DataFrame(yf.download(ticker, **download_kwargs))
        frame.insert(0, 'TICKER', ticker)
        frames.append(frame)

    # pd.concat refuses an empty list, so handle "no tickers" explicitly.
    if not frames:
        return pd.DataFrame(columns=['TICKER'])

    # Concatenate once at the end; concatenating inside the loop re-copies
    # all previously collected rows on every iteration.
    return pd.concat(frames)
Then Call the function to assemble the desired DF as follows:
result = build_df(tickers)
Finally, output the completed frame to CSV
file_name = "/Users/kevin/Dropbox/Programming/Python/test_data/deleteme.csv"
result.to_csv(file_name)

How to put stock prices from csv file into one single dataframe

So I am gathering data from the S&P 500,from a csv file. My question is how would I create one large dataframe, that has 500 columns and with all of the prices. The code is currently:
import pandas as pd
import pandas_datareader as web
import datetime as dt
from datetime import date
import numpy as np
def get_data():
# Date window for the download: 2020-05-30 up to the moment the function runs.
start = dt.datetime(2020, 5, 30)
end = dt.datetime.now()
# NOTE(review): os is not among the imports shown with this snippet, and csv_file is not used again below.
csv_file = pd.read_csv(os.path.expanduser("/Users/benitocano/Downloads/copyOfSandP500.csv"), delimiter = ',')
# Re-read the same file with explicit column names so the symbols can be selected by name.
tickers = pd.read_csv("/Users/benitocano/Downloads/copyOfSandP500.csv", delimiter=',', names = ['Symbol', 'Name', 'Sector'])
# Only the first five entries of the Symbol column are processed.
for i in tickers['Symbol'][:5]:
# NOTE(review): df is reassigned for each symbol and never collected or returned.
df = web.DataReader(i, 'yahoo', start, end)
# Drop the listed columns in place; whatever other columns the download returned are kept.
df.drop(['High', 'Low', 'Open', 'Close', 'Volume'], axis=1, inplace=True)
get_data()
So, as the code shows, right now it is just going to create 500 individual dataframes, and so I am asking how to make them into one large dataframe. Thanks!
EDIT:
The CSV file link is:
https://datahub.io/core/s-and-p-500-companies
I have tried this to the above code:
# NOTE(review): neither data nor ticker is defined in the snippet shown.
for stock in data:
# Take the 'Adj Close' values of the current item as a Series.
series = pd.Series(stock['Adj Close'])
# NOTE(review): a brand-new empty DataFrame is created here; if this line sits inside the loop body, every pass discards the columns added before it.
df = pd.DataFrame()
df[ticker] = series
print(df)
Though the output is only one column like so:
ADM
Date
2020-06-01 38.574604
2020-06-02 39.348278
2020-06-03 40.181465
2020-06-04 40.806358
2020-06-05 42.175167
... ...
2020-11-05 47.910000
2020-11-06 48.270000
2020-11-09 49.290001
2020-11-10 50.150002
2020-11-11 50.090000
Why is it printing only one column, rather than all of them?
The answer depends on the structure of the dataframes that your current code produces. As the code depends on files on your local drive, we cannot run it, so it is hard to be specific here. In general there are many options; among the most common, I would say, are:
Put dfs into a list and use pandas.concat(..., axis=1) on that list to concatenate dfs column by column, see here
Merge (merge or join) your dfs on the Date column that I assume each df has, see here

adding a column to my dataframe with yFinance

Beginner question coming up; I can't seem to connect the dots.
I have a portfolio data frame called my_pf which includes the tickers that I use for collecting the opening price. I succeed in collecting the opening data via the next two steps.
#create a list from the column 'ticker'
my_tickers = my_pf['ticker'].tolist()
#collect the opening data per ticker
for ticker in my_tickers:
# Look up the 'open' entry of the ticker's info mapping; .get() yields None when that key is absent.
open_price = yf.Ticker(ticker).info.get('open')
# The value is only printed here; it is not written back to my_pf.
print(ticker, open_price)
The next step is adding the extracted data to my initial data frame. But how would I go about this?
Thank you for your help in advance.
There are many ways to add data to a column, such as df.append() and pd.concat(), but we created our code with df.append(). We start with an empty data frame to create the stock column and the opening price column. Once we have the opening price, we add the brand name and opening price to the data frame we just created.
import pandas as pd
import yfinance as yf

# my_tickers = my_pf['ticker'].tolist()
my_tickers = ['msft', 'aapl', 'goog']

# Start from an empty frame that already has the two target columns.
df = pd.DataFrame(index=[], columns=['ticker', 'Open'])
for ticker in my_tickers:
    open_price = yf.Ticker(ticker).info.get('open')
    # DataFrame.append() was removed in pandas 2.0. Assigning to the next
    # unused integer label adds the row in place and yields the same
    # 0, 1, 2, ... row labels that append(..., ignore_index=True) produced.
    df.loc[len(df)] = [ticker, open_price]
print(df)
ticker Open
0 msft 204.07
1 aapl 112.37
2 goog 1522.36

how to overrule / skip items in a list that cause error

Below is the Python script I'm working on to pick out the stocks that meet certain price criteria (as written, tickerlist=[] collects the tickers of the stocks whose max price and min price were >30 and <2 respectively).
import matplotlib.pyplot as plt
import math
import csv
import pandas as pd
import datetime
import pandas.io.data as web
from filesortfunct import filesort
from scipy import stats
from scipy.stats.stats import pearsonr
import numpy as np
import math
dataname= 'NASDAQ.csv' #csv file from which to extract stock tickers
df = pd.read_csv(dataname, sep=',')
# Keep only the Symbol column and write it to a second file.
df = df[['Symbol']]
# NOTE(review): new is not defined anywhere in the snippet shown; presumably a filename prefix string.
df.to_csv(new+dataname, sep=',', index=False)
# NOTE(review): the file handle x is never closed in the code shown.
x=open(new+dataname,'rb') #convert it into a more manageable form
f = csv.reader(x) # csv is binary
# Transpose the rows so that each CSV column becomes one tuple.
Symbol = zip(*f)
print type(Symbol) #list format
Symbol=Symbol[0] #pick out the first column
Symbol = Symbol[1:len(Symbol)] #remove the first row "symbol" header
Symbol= Symbol[0:2] #slicing to choose which stocks to look at
#decide the two dates between which to look at stock prices
start = datetime.datetime.strptime('2/10/2016', '%m/%d/%Y')
end = datetime.datetime.strptime('2/24/2016', '%m/%d/%Y')
#intended to collect indices and min/max prices
tickerlist=[]
maxpricelist = []
minpricelist =[]
for item in Symbol:
# One request per symbol; this is the call that the question reports raising RemoteDataError.
serious=web.DataReader([item], 'yahoo', start, end)['Adj Close']
serious2=serious.loc[:, item].tolist() #extract the column of 'Adj Close'
plt.figure()
ap = plt.plot(serious2)
# NOTE(review): enumerate yields (index, value) pairs and tuples compare by index first, so max/min here return the last and first entries rather than the highest and lowest prices.
indexmax, valuemax = max(enumerate(serious2))
indexmin, valuemin = min(enumerate(serious2))
# Record the symbol and both values when the thresholds are met.
if valuemax>30 and valuemin<2:
tickerlist.append(item)
maxpricelist.append(valuemax)
minpricelist.append(valuemin)
plt.show()
The issue that i have right now is that some of the stocks on the list are discontinued? or YAHOO does not have their stock prices listed I suppose. So, when those stock tickers are included in the slicing, i get the following error message.
RemoteDataError: No data fetched using '_get_hist_yahoo'
Is there a way to bypass that?
Thanks in advance!
-------------Add------------------------
I added except RemoteDataError: as suggested, but I get either an invalid syntax error or an unexpected indentation error.
for item in Symbol:
print item
serious=web.DataReader([item], 'yahoo', start, end)['Adj Close']
# NOTE(review): there is no try: before this except, and no handler body is distinguishable (indentation was lost); either would produce the reported syntax/indentation errors.
# NOTE(review): RemoteDataError is also not among the imports shown with the original script.
except RemoteDataError:
serious2=serious.loc[:, item].tolist() #extract the column of 'Adj Close'
plt.figure()
ap = plt.plot(serious2)
indexmax, valuemax = max(enumerate(serious2))
indexmin, valuemin = min(enumerate(serious2))
# The lower threshold here is 100, not the 2 used in the first version of the script.
if valuemax>30 and valuemin<100:
tickerlist.append(item)
maxpricelist.append(valuemax)
minpricelist.append(valuemin)
plt.show()
print tickerlist

For loop after for loop produces wrong output Python

I am trying to use for loops to iterate through some Yahoo Finance data and calculate the returns of the papers. The problem is that I want to do this for different time periods, and I have a document containing the different start and end dates. This is the code I have been using:
import pandas as pd
import numpy as np
from pandas.io.data import DataReader
from datetime import datetime
# This function is just used to download the data I want and save
# it to a csv file.
def downloader():
start = datetime(2005,1,1)
end = datetime(2010,1,1)
tickers = ['VIS', 'VFH', 'VPU']
stock_data = DataReader(tickers, "yahoo", start, end)
# Keep only the adjusted close prices and write them to data.csv.
price = stock_data['Adj Close']
price.to_csv('data.csv')
downloader()
# Reads the data into a pandas DataFrame.
price = pd.read_csv('data.csv', index_col = 'Date', parse_dates = True)
# Creates a pandas DataFrame that holds multiple date ranges. The format is the same as the format of the dates when I load the full csv file of dates.
inp = [{'start' : datetime(2005,1,3), 'end' : datetime(2005,12,30)},
{'start' : datetime(2005,2,1), 'end' : datetime(2006,1,31)},
{'start' : datetime(2005,3,1), 'end' : datetime(2006,2,28)}]
df = pd.DataFrame(inp)
#Everything above this is not part of the original script, but this
#is just used to replicate the problem I am having.
# Accumulator for the per-ticker momentum rows.
results = pd.DataFrame()
# One pass per (start, end) row of df.
for index, row in df.iterrows():
start = row['start']
end = row['end']
# Restrict the prices to the current date window.
price_initial = price.ix[start:end]
# Iterating a DataFrame yields its column labels, i.e. one pass per ticker column.
for column1 in price_initial:
price1 = price_initial[column1]
# NOTE: the names are swapped relative to the dates: startprice is the price at end, endprice the price at start.
startprice = price1.ix[end]
endprice = price1.ix[start]
# Relative change over the window: (price at end / price at start) - 1.
momentum_value = (startprice / endprice)-1
results = results.append({'Ticker' : column1, 'Momentum' : momentum_value}, ignore_index=True)
# NOTE(review): results itself is replaced by its single highest-momentum row, so if this runs inside the outer loop, later windows append to that surviving row instead of starting fresh.
results = results.sort(columns = "Momentum", ascending = False).head(1)
print(results.to_csv(sep= '\t', index=False))
I am not sure what I am doing wrong here. But I suspect there is something about the way I iterate over or the way I save the output from the script.
The output I get is this:
Momentum Ticker
0.16022263953253435 VPU
Momentum Ticker
0.16022263953253435 VPU
Momentum Ticker
0.16022263953253435 VPU
That is clearly not correct. Hope someone can help me get this right.

Categories