Daily Data Scraping - python

I am trying to scrape the stock price of the same company on a daily basis for the next 30 days using Python. I used indexing of list and .append(), the initial value gets replaced as soon as the updated price is added. How can I make a list of the price of the same stock for 30 days?
*#Catalyst Pharmaceuticals
#New York Stack Exchange
import requests
import pytz
from bs4 import BeautifulSoup
import datetime
import csv
r=requests.get('https://robinhood.com/collections/technology')
html=r.content
soup=BeautifulSoup(html,'html.parser')
csv_file=open('Catalyst Pharmaceuticals Monthly.csv','a')
csv_writer=csv.writer(csv_file)
price_list = []
dttm = []
def websc():
global price_list
global dttm
global a_price
#i=10
for p in soup.find_all('a',{'class':'rh-hyperlink'})[2]:
a_price = p.text
dd=datetime.datetime.now(pytz.timezone("GMT"))
dd=dd.strftime("%Y-%m-%d %H:%M:%S")
price_list.append(a_price)
dttm.append(dd)
zipped = zip(price_list,dttm)
d = list(zipped)
print(d)
csv_writer.writerows(d)
csv_file.close()
websc()*

You need to open the file in append mode rather than write mode if you don't want to overwrite the file

Can't you just loop through some tickers, push everything into a dataframe, and then export that to a CSV?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.optimize as sco
import datetime as dt
import math
from datetime import datetime, timedelta
from pandas_datareader import data as wb
from sklearn.cluster import KMeans
np.random.seed(777)
start = '2020-01-01'
end = '2020-08-27'
#N = 165
#start = datetime.now() - timedelta(days=N)
#end = dt.datetime.today()
tickers = ['AAPL','MSFT','GOOG','SBUX','MCD','NKE']
thelen = len(tickers)
price_data = []
for ticker in tickers:
try:
prices = wb.DataReader(ticker, start = start, end = end, data_source='yahoo')[['Adj Close']]
price_data.append(prices.assign(ticker=ticker)[['ticker', 'Adj Close']])
except:
print(ticker)
df = pd.concat(price_data)
df.dtypes
df.head()
df.shape
# finally....
df.to_csv('file_name.csv')
Try that and post back if you need something else, related to this.

Related

Why is get_data_yahoo giving me a ValueError for my dates?

I am following a youtube tutorial for a Monte Carlo simulation. I copied this code verbatim:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from pandas_datareader import data as pdr
#import data
def get_data(stocks, start, end):
StockData = pdr.get_data_yahoo(start, end)
StockData = StockData['Close']
returns = StockData.pct_change()
meanReturns = returns.mean()
covMatrix = returns.cov()
return meanReturns, covMatrix
stockList = ['AAPL', 'LOW', 'NFLX', 'VALE', 'WBD']
endDate = dt.datetime.now()
startDate = endDate - dt.timedelta(days=300)
meanReturns, covMatrix = get_data(stockList, startDate, endDate)
print(meanReturns)
and I get this error:
ValueError: start must be an earlier date than end
I then tried flipping the dates in the function call but got this error:
TypeError: object of type 'datetime.datetime' has no len()
Is it maybe the version of python that I'm using? I'm confused because it seems like any date values that I put into this function give me an error.

Extracting financial data on python - google colab

I'm running this code to get the data below, but this is taking a lot of time to load. Is there a more optimized way to run it better? P.S. I'm working on google colab.
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
import pandas_datareader.data as web
import requests
import json
import pandas as pd
from datetime import datetime, timedelta
import yfinance as yf
import pandas as pd
stocks = ['AYX', 'TEAM', 'DDOG', 'MDB', 'TDC', 'CFLT'\]
df = pd.DataFrame()
start_date = '2021-10-1'
current_date = datetime.now().strftime("%Y-%m-%d")
date_range = pd.date_range(start=start_date, end=current_date, freq='M')
dates = [date.strftime("%Y-%m-%d") for date in date_range\]
for stock in stocks:
for date in dates:
# Loop through each date
info = yf.Ticker(stock).info
info1 = yf.Ticker(stock).fast_info
NetDebt = info['totalDebt'] - info['totalCash']
marketcap = info1['market_cap']
asofDate = date
df = df.append({
'Date': asofDate,
'Stock': stock,
'NetDebt': NetDebt,
'marketcap': marketcap,
'EV': marketcap + NetDebt
}, ignore_index=True)
print(df)

Faster yf.Ticker() calls for fundamentals, i.e. something like yf.pdr_override() but for fundamental data

I have been using yfinance to loop through a list of around 1800 stocks, to get several years daily price data for each. To speed up this process I have been using yf.pdr_override() and each call takes about 2-3 seconds, and has done for the many months I have been running this program.
I have now added calls for some fundamental data as well, and the speed of my programme has slowed considerably, to about 20 seconds per stock.
Is this because import yfinance as yf with yf.Ticker() is much slower without yf.pdr_override()?
And is there a version of yf.pdr_override() for getting yfinance fundamental data? I am only looking for a few pieces of info if there is a way to just get those?
from asyncio.windows_events import NULL
import datetime as dt
from datetime import datetime
import time
from numpy import False_, NaN, fabs
import pandas as pd
from pandas_datareader import data as pdr
import yfinance as yf
from tkinter import EXCEPTION, Tk
from tkinter.filedialog import askopenfilename
import os
from pandas import ExcelWriter
from pathlib import Path
yf.pdr_override()
now = dt.datetime.now()
from datetime import datetime, timedelta
start = datetime(2018, 12, 31)
# EXCEL IMPORT STUFF REMOVED FOR SIMPLICITY
for i in stocklist.index:
varName_cleaned=str(stocklist["Name cleaned"][i]).replace("'","")
varSymbol=str(stocklist["Symbol"][i])
varIndustry=str(stocklist["Industry"][i])
varSector=str(stocklist["Sector"][i])
if varIndustry == "—" or varIndustry == "":
varIndustry = "Unspecified"
if varSector == "—" or varSector == "":
varSector = "Unspecified"
varFILTER=str(stocklist["Good stock"][i])
goodStock = False
if (varFILTER != "'nan'" and varFILTER != "''" and varFILTER != "'FALSE'"):
goodStock = True
try:
thisTicker = yf.Ticker(varSymbol)
thisTickerInfo = thisTicker.info
f_QR = thisTickerInfo["quickRatio"]
f_CR = thisTickerInfo["currentRatio"]
f_Debt2Equity = thisTickerInfo["debtToEquity"]
f_OperatingMargin = thisTickerInfo["operatingMargins"]
f_ProfitMargin = thisTickerInfo["profitMargins"]
df = pdr.get_data_yahoo(varSymbol, start, now)

How to resample OHLC data with multiple stocks in index?

I haven't been able to find anything too similar to this I have OHLC data pulled from y-finance for multiple stocks. This results in a multi-index of columns of OHLC data and stock names
Python Script
'''
import requests
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import datetime, timedelta
N_DAYS_AGO = 15
now = datetime.now()
today = datetime(now.year,now.month,now.day, now.hour)
n_days_ago = today - timedelta(days=N_DAYS_AGO)
df = yf.download(['SPY','TLT'], start=n_days_ago, end=now, interval = "60m") #no error with 1 stock
ohlc_dict = {
'Adj Close':'last',
'Open':'first',
'High':'max',
'Low':'min',
'Close':'last',
'Volume':'sum'
}
df_sample = df.resample('W-FRI', closed='left').agg(ohlc_dict)
df_sample #error with 2 stocks
'''
The code above works without a single stock but fails when there are multiple stocks/ multi index columns.
I've tried stacking and unstacking but haven't found a good way to resample this data. What's the simplest path forward here?

How to corect the code it prints all symbols in a way that , in one dataframe ,and without any NaN or zeros?

Would you please correct the code, it prints all the symbols as well as without any NaN or zeros like?
import datetime
import pandas as pd
import numpy as np
import yfinance as yf
start = datetime.datetime(2015, 10, 1)
end = datetime.datetime.now()
symbols = ['BTC-USD', 'ETH-USD', '^GSPC']
df = pd.DataFrame()
for i in symbols:
data = yf.download(i, start=None, end=None, period="max", interval="1mo")
df[i] = data['Adj Close'].pct_change()
df = df.fillna(method='ffill')
The current output is:

Categories