import urllib.request
import re
import csv
import pandas as pd
from bs4 import BeautifulSoup
# Scrape the Yahoo Finance quote page of every symbol listed in
# companylist.csv and write one row per symbol to test.csv.
columns = []  # header labels, taken from the first symbol that is scraped
data = []     # one list of cell values per symbol

# `with` closes the input file even if one of the downloads raises.
with open('companylist.csv', newline='') as f:
    csv_f = csv.reader(f)
    for stocklist in csv_f:
        print(stocklist)
        for s in stocklist:
            url = 'http://finance.yahoo.com/q?s=' + s
            print(url)
            # Close the HTTP response as soon as the page has been read.
            with urllib.request.urlopen(url) as response:
                optionsUrl = response.read()
            soup = BeautifulSoup(optionsUrl, "html.parser")
            stocksymbol = ['Symbol:', s]
            # For every matching <td>, collect the text of all nodes in its
            # parent element; index 0 is used as the label and index 1 as
            # the value further down.
            optionsTable = [stocksymbol] + [
                [x.text for x in y.parent.contents]
                for y in soup.findAll(
                    'td', attrs={'class': 'yfnc_tabledata1', 'rtq_table': ''})
            ]
            if not columns:
                columns = [o[0] for o in optionsTable]
            # Store a concrete list rather than a lazy generator expression.
            data.append([o[1] for o in optionsTable])

# create DataFrame from data
df = pd.DataFrame(data, columns=columns)
df.to_csv('test.csv', index=False)
The script works fine when I have about 200 to 300 stocks, but my company list has around 6000 symbols.
Is there a way I can download the data in chunks, say 200 stocks at a time, pause for a while, and then resume the download?
The export is one stock at a time; how do I write 200 at a time, and append the next batch to the initial batch (for the CSV)?
As @Merlin has recommended, take a closer look at the pandas_datareader module - you can do a LOT with this tool. Here is a small example:
import csv
import pandas_datareader.data as data
from pandas_datareader.yahoo.quotes import _yahoo_codes
stocklist = ['aapl', 'goog', 'fb', 'amzn', 'COP']

# Extra label -> Yahoo field code entries to add to `_yahoo_codes`.
# Code lists:
#   http://www.jarloo.com/yahoo_finance/
#   https://greenido.wordpress.com/2009/12/22/yahoo-finance-hidden-api/
extra_codes = {
    'Market Cap': 'j1',
    'Div Yield': 'y',
    'Bid': 'b',
    'Ask': 'a',
    'Prev Close': 'p',
    'Open': 'o',
    '1 yr Target Price': 't8',
    'Earnings/Share': 'e',
    "Day’s Range": 'm',
    '52-week Range': 'w',
    'Volume': 'v',
    'Avg Daily Volume': 'a2',
    'EPS Est Current Year': 'e7',
    'EPS Est Next Quarter': 'e9',
}
_yahoo_codes.update(extra_codes)

# Fetch the quotes for all symbols and export them, quoting every
# non-numeric field in the CSV.
quotes = data.get_quote_yahoo(stocklist)
quotes.to_csv('test.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)
Output: I've intentionally transposed the result set, because there are too many columns to show them here.
In [2]: data.get_quote_yahoo(stocklist).transpose()
Out[2]:
aapl goog fb amzn COP
1 yr Target Price 124.93 924.83 142.87 800.92 51.23
52-week Range 89.47 - 132.97 515.18 - 789.87 72.000 - 121.080 422.6400 - 731.5000 31.0500 - 64.1300
Ask 97.61 718.75 114.58 716.73 44.04
Avg Daily Volume 3.81601e+07 1.75567e+06 2.56467e+07 3.94018e+06 8.94779e+06
Bid 97.6 718.57 114.57 716.65 44.03
Day’s Range 97.10 - 99.12 716.51 - 725.44 113.310 - 115.480 711.1600 - 721.9900 43.8000 - 44.9600
Div Yield 2.31 N/A N/A N/A 4.45
EPS Est Current Year 8.28 33.6 3.55 5.39 -2.26
EPS Est Next Quarter 1.66 8.38 0.87 0.96 -0.48
Earnings/Share 8.98 24.58 1.635 2.426 -4.979
Market Cap 534.65B 493.46B 327.71B 338.17B 54.53B
Open 98.6 716.51 115 713.37 43.96
PE 10.87 29.25 70.074 295.437 N/A
Prev Close 98.83 719.41 116.62 717.91 44.51
Volume 3.07086e+07 868366 2.70182e+07 2.42218e+06 5.20412e+06
change_pct -1.23% -0.09% -1.757% -0.1644% -1.0782%
last 97.61 718.75 114.571 716.73 44.0301
short_ratio 1.18 1.41 0.81 1.29 1.88
time 3:15pm 3:15pm 3:15pm 3:15pm 3:15pm
If you need more fields (codes for Yahoo Finance API) you may want to check the following links:
http://www.jarloo.com/yahoo_finance/
https://greenido.wordpress.com/2009/12/22/yahoo-finance-hidden-api/
Use pandas_datareader for this.
In [1]: import pandas_datareader.data as web
In [2]: import datetime
In [3]: start = datetime.datetime(2010, 1, 1)
In [4]: end = datetime.datetime(2013, 1, 27)
In [5]: f = web.DataReader("F", 'yahoo', start, end)
In [6]: f.ix['2010-01-04']
Out[6]:
Open 10.170000
High 10.280000
Low 10.050000
Close 10.280000
Volume 60855800.000000
Adj Close 9.151094
Name: 2010-01-04 00:00:00, dtype: float64
To pause after every 200 downloads (this also works when you use pandas_datareader), you could do the following:
import time
for i, s in enumerate(stocklist):
    # Pause once each batch of 200 symbols is complete. `i` is 0 on the first
    # pass, so the truthiness check avoids sleeping before anything has been
    # downloaded.
    if i and i % 200 == 0:
        time.sleep(5)  # in seconds
    # ... download the data for symbol `s` here ...
To save all data into a single file (IIUC):
stocks = pd.DataFrame() # to collect all results
In every iteration:
# Append this iteration's rows to the accumulated result.
stocks = pd.concat([stocks, pd.DataFrame(data, columns=columns)])
Finally:
stocks.to_csv(path, index=False)
Related
Good day, I am a student taking python classes. We are now learning about Beautiful Soup and I am having trouble extracting data from 2 tables as you will see in the code below:
import pandas as pd
import requests
list_of_urls = ['https://tradingeconomics.com/albania/gdp-growth-annual',
                'https://tradingeconomics.com/south-africa/gdp-growth-annual']

# Collect the 'GDP Annual Growth Rate' rows from every table on every page.
matching_rows = []
for url in list_of_urls:
    # read_html returns a list of DataFrames, one per table containing the
    # text 'Related'.
    for table in pd.read_html(url, match='Related'):
        # Filter with a boolean mask on the 'Related' column.
        matching_rows.append(
            table.loc[table['Related'] == 'GDP Annual Growth Rate'])

# Combine once at the end; fall back to an empty frame if nothing was read.
final_df = (pd.concat(matching_rows, ignore_index=True)
            if matching_rows else pd.DataFrame())
You need neither requests nor bs4; pd.read_html does the job.
list_of_urls = [
    'https://tradingeconomics.com/albania/gdp-growth-annual',
    'https://tradingeconomics.com/south-africa/gdp-growth-annual',
]

# Map each country (taken from the URL path) to its matching table rows.
data = {}
for url in list_of_urls:
    # 'https://tradingeconomics.com/albania/...'.split('/')[3] -> 'albania'
    country = url.split('/')[3]
    related = pd.read_html(url, match='Related')[0]
    is_gdp_growth = related['Related'] == 'GDP Annual Growth Rate'
    data[country] = related.loc[is_gdp_growth]

# Concatenating a dict uses its keys as the outer index level.
df = pd.concat(data)
Output:
>>> df
Related Last Previous Unit Reference
albania 1 GDP Annual Growth Rate 6.99 18.38 percent Sep 2021
south-africa 1 GDP Annual Growth Rate 1.70 2.90 percent Dec 2021
Goal:
Calculate the 50-day moving average for each day, based on the past 50 days. I can calculate the mean for the entire dataset, but I am trying to continuously calculate the mean based on the past 50 days... with it changing each day, of course!
import numpy as np
import pandas_datareader.data as pdr
import pandas as pd
# Define the instruments to download. We would like to see Apple, Microsoft and the S&P500 index.
# Define the instrument to download (Apple only).
ticker = ['AAPL']

# Define the data period that you would like
start_date = '2017-07-01'
end_date = '2019-02-08'

# Use pandas_datareader.data.DataReader to load the stock prices from Yahoo Finance.
df = pdr.DataReader(ticker, 'yahoo', start_date, end_date)

# Yahoo Finance gives 'High', 'Low', 'Open', 'Close', 'Volume', 'Adj Close'.
# Export Close Price, Volume, and Date from Yahoo Finance
CloseP = df['Close']
CloseP.head()
Volm = df['Volume']
Volm.head()
Date = df["Date"] = df.index

# Create a table with Date, Close Price, and Volume
Table = pd.DataFrame(np.array(Date), columns=['Date'])
Table['Close Price'] = np.array(CloseP)
Table['Volume'] = np.array(Volm)
print(Table)

# Create a column that continuously calculates the 50-day moving average.
# rolling(window=50) averages each row together with the 49 rows before it,
# so the value changes every day; rows without 50 observations yet are NaN.
MA = Table['Close Price'].rolling(window=50).mean()
Table['Moving Average'] = MA
print(Table)
First of all, please don't use CamelCase to name your variables; otherwise they look like class names.
Next, use merge() to join your data frames instead of going through np.array:
>>> table = CloseP.merge(Volm, left_index=True, right_index=True)
>>> table.columns = ['close', 'volume'] # give names to columns
>>> table.head(10)
close volume
Date
2017-07-03 143.500000 14277800.0
2017-07-05 144.089996 21569600.0
2017-07-06 142.729996 24128800.0
2017-07-07 144.179993 19201700.0
2017-07-10 145.059998 21090600.0
2017-07-11 145.529999 19781800.0
2017-07-12 145.740005 24884500.0
2017-07-13 147.770004 25199400.0
2017-07-14 149.039993 20132100.0
2017-07-17 149.559998 23793500.0
Finally, use a combination of rolling(), mean() and dropna() to calculate the moving average:
>>> ma50 = table.rolling(window=50).mean().dropna()
>>> ma50.head(10)
close volume
Date
2017-09-12 155.075401 26092540.0
2017-09-13 155.398401 26705132.0
2017-09-14 155.682201 26748954.0
2017-09-15 156.025201 27248670.0
2017-09-18 156.315001 27430024.0
2017-09-19 156.588401 27424424.0
2017-09-20 156.799201 28087816.0
2017-09-21 156.952201 28340360.0
2017-09-22 157.034601 28769280.0
2017-09-25 157.064801 29254384.0
Please, refer to the docs of mentioned API calls to get more info about their usage. Good luck!
I am using the code below to pass a large list of tickers to the Yahoo datareader, and I am trying to get back a dataframe like the one shown below. If the list is large, I often get a RemoteError back, but on different tickers each time. I am not sure how to handle the RemoteError; I am happy to drop the ticker and continue with the next ticker in the list, but I would first like to try again to get the adj close data for that ticker. I thought using a for loop and adding a time delay would help with the Yahoo requests, but I am still getting a RemoteError. Any ideas?
IBM MSFT ORCL TSLA YELP
Date
2014-01-02 184.52 36.88 37.61 150.10 67.92
2014-01-03 185.62 36.64 37.51 149.56 67.66
2014-01-06 184.99 35.86 37.36 147.00 71.72
2014-01-07 188.68 36.14 37.74 149.36 72.66
2014-01-08 186.95 35.49 37.61 151.28 78.42
import pandas_datareader.data as web
import datetime as dt
import pandas as pd
import time
from pandas_datareader._utils import RemoteDataError
# Category (or categories) whose tickers should be downloaded -- put the
# group here.
Which_group = ['Accident & Health Insurance']

# Load the ticker list, index it by category and select the chosen group.
df = pd.read_csv('/home/ross/Downloads/UdemyPairs/stocks1.csv')
df.set_index('categoryName', inplace=True)
df1 = df.loc[Which_group]
tickers = df1['Ticker'].tolist()
print(tickers)
# tickers = ['SPY', 'AAPL', 'MSFT']  # alternatively, list tickers by hand

# Download window: from 1 January 2013 until now.
start = dt.datetime(2013, 1, 1)
end = dt.datetime.today()
# Function starts here
def get_previous_close(strt, end, tick_list, this_price, retries=3, delay=1.0):
    """Download one price column for every ticker and combine the results.

    Args:
        strt: Start of the date range, passed through to ``web.DataReader``.
        end: End of the date range, passed through to ``web.DataReader``.
        tick_list: Iterable of ticker symbols to download.
        this_price: Name of the column to keep from each download, e.g.
            'Open', 'High', 'Low', 'Close', 'Volume' or 'Adj Close'.
        retries: Number of download attempts per ticker before it is
            dropped. Values below 1 are treated as 1.
        delay: Seconds to wait after a failed attempt before retrying.

    Returns:
        A DataFrame with one column per successfully downloaded ticker,
        named after the ticker. Tickers that still fail after the last
        attempt are reported on stdout and left out.
    """
    # make an empty dataframe to which one column per ticker is added
    adj_close = pd.DataFrame([])
    attempts = max(1, retries)
    for tick in tick_list:
        for attempt in range(1, attempts + 1):
            try:
                total = web.DataReader(tick, 'yahoo', strt, end)
            except RemoteDataError:
                if attempt < attempts:
                    # Wait and retry; the error may not recur on the next attempt.
                    time.sleep(delay)
                else:
                    # Give up on this ticker but keep going with the rest.
                    print('Skipping {}: no data after {} attempts'.format(
                        tick, attempts))
            else:
                adj_close[tick] = total[this_price]
                break
    return adj_close
# Call the function: fetch the 'Adj Close' column for every ticker in
# `tickers` between `start` and `end`, and print the combined DataFrame.
print(get_previous_close(start, end, tickers, 'Adj Close'))
Maybe you can look at this question. It proposes a solution that might work for you.
Pandas Dataframe - RemoteDataError - Python
Why do the following commands give me the output below? Why is the Date shown as "1396224000000000000"?
import pandas as pd
# Open the store read-only inside a context manager so it is closed even if
# one of the keys is missing or a read fails.
with pd.HDFStore('./vstoxx_data_31032014.h5', 'r') as h5:
    futures_data = h5['futures_data']  # VSTOXX futures data
    options_data = h5['options_data']  # VSTOXX call option data
The contents of futures_data is
Date EXP_YEAR EXP_MONTH PRICE MATURITY TTM
496 1396224000000000000 2014 4 17.85 13977792000000000000 0.094
(followed by more similar rows)
# Convert the date column to pandas timestamps. The raw values look like
# integer nanoseconds since the Unix epoch, which is the unit pd.to_datetime
# assumes for integers by default.
# NOTE(review): the printed table labels this column "Date" while the code
# reads "DATE" -- confirm the actual column name.
futures_data['DATE'] = pd.to_datetime(futures_data.DATE)
I have a set of calculated OHLCVA daily securities data in a pandas dataframe like this:
>>> type(data_dy)
<class 'pandas.core.frame.DataFrame'>
>>> data_dy
Open High Low Close Volume Adj Close
Date
2012-12-28 140.64 141.42 139.87 140.03 148806700 134.63
2012-12-31 139.66 142.56 139.54 142.41 243935200 136.92
2013-01-02 145.11 146.15 144.73 146.06 192059000 140.43
2013-01-03 145.99 146.37 145.34 145.73 144761800 140.11
2013-01-04 145.97 146.61 145.67 146.37 116817700 140.72
[5 rows x 6 columns]
I'm using the following dictionary and the pandas resample function to convert the dataframe to monthly data:
>>> ohlc_dict = {'Open':'first','High':'max','Low':'min','Close': 'last','Volume': 'sum','Adj Close': 'last'}
>>> data_dy.resample('M', how=ohlc_dict, closed='right', label='right')
Volume Adj Close High Low Close Open
Date
2012-12-31 392741900 136.92 142.56 139.54 142.41 140.64
2013-01-31 453638500 140.72 146.61 144.73 146.37 145.11
[2 rows x 6 columns]
This does the calculations correctly, but I'd like to use the Yahoo! date convention for monthly data of using the first trading day of the period rather than the last calendar day of the period that pandas uses.
So I'd like the answer set to be:
Volume Adj Close High Low Close Open
Date
2012-12-28 392741900 136.92 142.56 139.54 142.41 140.64
2013-01-02 453638500 140.72 146.61 144.73 146.37 145.11
I could do this by converting the daily data to a Python list, processing the data and returning it to a dataframe, but how can this be done with pandas?
Instead of M you can pass MS as the resample rule:
# Toy daily series: 72 consecutive days starting 2011-01-01, values 0..71.
daily_index = pd.date_range('1/1/2011', periods=72, freq='D')
df = pd.DataFrame(range(72), index=daily_index)

# Resample with the 'MS' rule instead of 'M'.
# df.resample('MS', how='mean')  # pandas < 0.18
df.resample('MS').mean()  # pandas >= 0.18
Updated to use the first business day of the month respecting US Federal Holidays:
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessMonthBegin

# Toy daily series: 200 consecutive days starting 2012-12-01, values 0..199.
sample_days = pd.date_range('12/1/2012', periods=200, freq='D')
df = pd.DataFrame(range(200), index=sample_days)

# Month-begin offset built on the US federal holiday calendar, used as the
# resample rule.
us_holidays = USFederalHolidayCalendar()
bmth_us = CustomBusinessMonthBegin(calendar=us_holidays)
df.resample(bmth_us).mean()
If you want custom month starts based on the earliest day found in each month of the data, try this. (It isn't pretty, but it should work.)
# Monthly period label for every row of the daily index.
month_index = df.index.to_period('M')
# Add the period as a second index level, move the original dates into a
# column ('level_0' is the name reset_index gives an unnamed index level),
# then take the earliest date within each month.
min_day_in_month_index = pd.to_datetime(
    df.set_index(month_index, append=True)
      .reset_index(level=0)
      .groupby(level=0)['level_0']
      .min()
)
# NOTE(review): `calendar` is given a series of dates here rather than a
# holiday calendar object -- verify that this produces the intended month
# starts on the installed pandas version.
custom_month_starts = CustomBusinessMonthBegin(calendar=min_day_in_month_index)
Pass custom_month_starts as the first parameter of resample.
Thank you J Bradley, your solution worked perfectly. I did have to upgrade my version of pandas from their official website though as the version installed via pip did not have CustomBusinessMonthBegin in pandas.tseries.offsets. My final code was:
#----- imports -----
import pandas as pd
from pandas.tseries.offsets import CustomBusinessMonthBegin
# NOTE(review): pandas.io.data was later moved out of pandas into the
# separate pandas_datareader package -- confirm this import against the
# installed pandas version.
import pandas.io.data as web
#----- get sample data -----
# Daily SPY quotes from Yahoo for 2012-12-01 through 2013-12-31.
df = web.get_data_yahoo('SPY', '2012-12-01', '2013-12-31')
#----- build custom calendar -----
# Monthly period label for every row of the daily index.
month_index =df.index.to_period('M')
# NOTE(review): this takes the per-month minimum of the 'Open' *price*
# column and feeds it to pd.to_datetime; if the intent is the first trading
# date of each month, the date column should presumably be used -- confirm.
min_day_in_month_index = pd.to_datetime(df.set_index(month_index, append=True).reset_index(level=0).groupby(level=0)['Open'].min())
# NOTE(review): `calendar` receives the series built above rather than a
# holiday calendar object -- verify this is an accepted argument.
custom_month_starts = CustomBusinessMonthBegin(calendar = min_day_in_month_index)
#----- convert daily data to monthly data -----
# Per-column aggregation: first open, highest high, lowest low, last close,
# summed volume, last adjusted close.
ohlc_dict = {'Open':'first','High':'max','Low':'min','Close': 'last','Volume': 'sum','Adj Close': 'last'}
# `how=` is the older resample signature (pandas < 0.18).
mthly_ohlcva = df.resample(custom_month_starts, how=ohlc_dict)
This yielded the following:
>>> mthly_ohlcva
Volume Adj Close High Low Close Open
Date
2012-12-03 2889875900 136.92 145.58 139.54 142.41 142.80
2013-01-01 2587140200 143.92 150.94 144.73 149.70 145.11
2013-02-01 2581459300 145.76 153.28 148.73 151.61 150.65
2013-03-01 2330972300 151.30 156.85 150.41 156.67 151.09
2013-04-01 2907035000 154.20 159.72 153.55 159.68 156.59
2013-05-01 2781596000 157.84 169.07 158.10 163.45 159.33
2013-06-03 3533321800 155.74 165.99 155.73 160.42 163.83
2013-07-01 2330904500 163.78 169.86 160.22 168.71 161.26
2013-08-01 2283131700 158.87 170.97 163.05 163.65 169.99
2013-09-02 2226749600 163.90 173.60 163.70 168.01 165.23
2013-10-01 2901739000 171.49 177.51 164.53 175.79 168.14
2013-11-01 1930952900 176.57 181.75 174.76 181.00 176.02
2013-12-02 2232775900 181.15 184.69 177.32 184.69 181.09
I've seen that in the latest version of pandas you can use the time offset alias 'BMS', which stands for "business month start frequency", or 'BM', which stands for "business month end frequency".
The code in the first case would look like
# Resample the daily frame with the 'BMS' (business month start) alias and
# apply the per-column aggregations defined in ohlc_dict.
data_dy.resample('BMS', closed='right', label='right').apply(ohlc_dict)
or, in the second case,
# Resample the daily frame with the 'BM' (business month end) alias and
# apply the per-column aggregations defined in ohlc_dict.
# NOTE(review): newer pandas releases appear to rename this alias to 'BME'
# -- confirm against the installed version.
data_dy.resample('BM', closed='right', label='right').apply(ohlc_dict)