Extracting financial data with Python on Google Colab

I'm running the code below to get this data, but it's taking a long time to load. Is there a more optimized way to run it? P.S. I'm working in Google Colab.
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
import pandas_datareader.data as web
import requests
import json
from datetime import datetime, timedelta

stocks = ['AYX', 'TEAM', 'DDOG', 'MDB', 'TDC', 'CFLT']
df = pd.DataFrame()
start_date = '2021-10-1'
current_date = datetime.now().strftime("%Y-%m-%d")
date_range = pd.date_range(start=start_date, end=current_date, freq='M')
dates = [date.strftime("%Y-%m-%d") for date in date_range]

for stock in stocks:
    for date in dates:
        # Loop through each date
        info = yf.Ticker(stock).info
        info1 = yf.Ticker(stock).fast_info
        NetDebt = info['totalDebt'] - info['totalCash']
        marketcap = info1['market_cap']
        asofDate = date
        df = df.append({
            'Date': asofDate,
            'Stock': stock,
            'NetDebt': NetDebt,
            'marketcap': marketcap,
            'EV': marketcap + NetDebt
        }, ignore_index=True)

print(df)
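The main cost here is that yf.Ticker(stock).info fires a fresh web request for every ticker on every date, even though .info and .fast_info only ever return the current snapshot, and df.append copies the whole DataFrame on each iteration. A minimal sketch of the same output (same fields, same assumptions) that fetches each ticker once and builds the DataFrame in one go:

import pandas as pd
import yfinance as yf
from datetime import datetime

stocks = ['AYX', 'TEAM', 'DDOG', 'MDB', 'TDC', 'CFLT']
dates = pd.date_range(start='2021-10-1',
                      end=datetime.now().strftime("%Y-%m-%d"),
                      freq='M').strftime("%Y-%m-%d")

rows = []
for stock in stocks:
    ticker = yf.Ticker(stock)      # one Ticker object per symbol
    info = ticker.info             # one slow web request per symbol, not per date
    fast = ticker.fast_info
    net_debt = info['totalDebt'] - info['totalCash']
    market_cap = fast['market_cap']
    for date in dates:             # reuse the fetched values for every month-end date
        rows.append({
            'Date': date,
            'Stock': stock,
            'NetDebt': net_debt,
            'marketcap': market_cap,
            'EV': market_cap + net_debt,
        })

df = pd.DataFrame(rows)            # build the DataFrame once at the end
print(df)

Note that, just like the original loop, this repeats the current values across all dates, because .info does not provide historical fundamentals.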

Related

Faster yf.Ticker() calls for fundamentals, i.e. something like yf.pdr_override() but for fundamental data

I have been using yfinance to loop through a list of around 1,800 stocks to get several years of daily price data for each. To speed this up I have been using yf.pdr_override(), and each call has taken about 2-3 seconds for the many months I have been running this program.
I have now added calls for some fundamental data as well, and the program has slowed considerably, to about 20 seconds per stock.
Is this because yf.Ticker() is much slower without yf.pdr_override()?
And is there an equivalent of yf.pdr_override() for fetching yfinance fundamental data? I only need a few pieces of info, if there is a way to get just those.
from asyncio.windows_events import NULL
import datetime as dt
from datetime import datetime, timedelta
import time
from numpy import False_, NaN, fabs
import pandas as pd
from pandas_datareader import data as pdr
import yfinance as yf
from tkinter import EXCEPTION, Tk
from tkinter.filedialog import askopenfilename
import os
from pandas import ExcelWriter
from pathlib import Path

yf.pdr_override()
now = dt.datetime.now()
start = datetime(2018, 12, 31)

# EXCEL IMPORT STUFF REMOVED FOR SIMPLICITY

for i in stocklist.index:
    varName_cleaned = str(stocklist["Name cleaned"][i]).replace("'", "")
    varSymbol = str(stocklist["Symbol"][i])
    varIndustry = str(stocklist["Industry"][i])
    varSector = str(stocklist["Sector"][i])
    if varIndustry == "—" or varIndustry == "":
        varIndustry = "Unspecified"
    if varSector == "—" or varSector == "":
        varSector = "Unspecified"
    varFILTER = str(stocklist["Good stock"][i])
    goodStock = False
    if (varFILTER != "'nan'" and varFILTER != "''" and varFILTER != "'FALSE'"):
        goodStock = True
    try:
        thisTicker = yf.Ticker(varSymbol)
        thisTickerInfo = thisTicker.info
        f_QR = thisTickerInfo["quickRatio"]
        f_CR = thisTickerInfo["currentRatio"]
        f_Debt2Equity = thisTickerInfo["debtToEquity"]
        f_OperatingMargin = thisTickerInfo["operatingMargins"]
        f_ProfitMargin = thisTickerInfo["profitMargins"]
        df = pdr.get_data_yahoo(varSymbol, start, now)
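As far as I know there is no pdr_override() equivalent for fundamentals: .info pulls a large set of fields per ticker, which is why it is so much slower than a batched price download. One workaround is to run the .info requests concurrently, since they are network-bound. A rough sketch, assuming the same five fields and a stocklist with a "Symbol" column (fetch_fundamentals and FIELDS are names I made up, not part of yfinance):

from concurrent.futures import ThreadPoolExecutor

import yfinance as yf

FIELDS = ["quickRatio", "currentRatio", "debtToEquity",
          "operatingMargins", "profitMargins"]

def fetch_fundamentals(symbol):
    """Fetch only the handful of .info fields we care about for one symbol."""
    try:
        info = yf.Ticker(symbol).info           # still one web request per symbol
        return symbol, {f: info.get(f) for f in FIELDS}
    except Exception:
        return symbol, None                     # skip symbols that fail

symbols = stocklist["Symbol"].astype(str).tolist()

# Run the slow .info requests in parallel threads; 8-16 workers is usually
# plenty, since the work is waiting on the network rather than the CPU.
with ThreadPoolExecutor(max_workers=8) as pool:
    fundamentals = dict(pool.map(fetch_fundamentals, symbols))

The price history can still be fetched in one batch with yf.download(symbols, start=start, end=now), so only the fundamentals part needs the thread pool.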

How to resample OHLC data with multiple stocks in index?

I haven't been able to find anything too similar to this. I have OHLC data pulled from yfinance for multiple stocks, which results in a MultiIndex of columns combining the OHLC fields and the stock names.
Python Script
import requests
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import datetime, timedelta

N_DAYS_AGO = 15
now = datetime.now()
today = datetime(now.year, now.month, now.day, now.hour)
n_days_ago = today - timedelta(days=N_DAYS_AGO)

df = yf.download(['SPY', 'TLT'], start=n_days_ago, end=now, interval="60m")  # no error with 1 stock
ohlc_dict = {
    'Adj Close': 'last',
    'Open': 'first',
    'High': 'max',
    'Low': 'min',
    'Close': 'last',
    'Volume': 'sum'
}
df_sample = df.resample('W-FRI', closed='left').agg(ohlc_dict)
df_sample  # error with 2 stocks
The code above works with a single stock but fails when there are multiple stocks / MultiIndex columns.
I've tried stacking and unstacking but haven't found a good way to resample this data. What's the simplest path forward here?
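One approach that should work (a sketch, assuming yf.download's default column layout where level 0 is the price field and level 1 is the ticker) is to slice out each ticker, resample it exactly like the single-stock case, and concatenate the results back into a MultiIndex:

import pandas as pd

tickers = ['SPY', 'TLT']

resampled = pd.concat(
    {
        ticker: df.xs(ticker, axis=1, level=1)        # plain OHLC columns for one ticker
                  .resample('W-FRI', closed='left')
                  .agg(ohlc_dict)
        for ticker in tickers
    },
    axis=1,                                           # tickers become column level 0
)

print(resampled.head())

If you downloaded with group_by='ticker', the levels are swapped, so the xs call would use level=0 instead.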

Daily Data Scraping

I am trying to scrape the stock price of the same company on a daily basis for the next 30 days using Python. I have tried list indexing and .append(), but the initial value gets replaced as soon as the updated price is added. How can I build up a list of the same stock's price over 30 days?
# Catalyst Pharmaceuticals
# New York Stock Exchange
import requests
import pytz
from bs4 import BeautifulSoup
import datetime
import csv

r = requests.get('https://robinhood.com/collections/technology')
html = r.content
soup = BeautifulSoup(html, 'html.parser')
csv_file = open('Catalyst Pharmaceuticals Monthly.csv', 'a')
csv_writer = csv.writer(csv_file)
price_list = []
dttm = []

def websc():
    global price_list
    global dttm
    global a_price
    # i=10
    for p in soup.find_all('a', {'class': 'rh-hyperlink'})[2]:
        a_price = p.text
        dd = datetime.datetime.now(pytz.timezone("GMT"))
        dd = dd.strftime("%Y-%m-%d %H:%M:%S")
        price_list.append(a_price)
        dttm.append(dd)
        zipped = zip(price_list, dttm)
        d = list(zipped)
        print(d)
        csv_writer.writerows(d)
    csv_file.close()

websc()
You need to open the file in append mode rather than write mode if you don't want to overwrite it.
Can't you just loop through some tickers, push everything into a DataFrame, and then export that to a CSV?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.optimize as sco
import datetime as dt
import math
from datetime import datetime, timedelta
from pandas_datareader import data as wb
from sklearn.cluster import KMeans

np.random.seed(777)
start = '2020-01-01'
end = '2020-08-27'
# N = 165
# start = datetime.now() - timedelta(days=N)
# end = dt.datetime.today()
tickers = ['AAPL', 'MSFT', 'GOOG', 'SBUX', 'MCD', 'NKE']
thelen = len(tickers)
price_data = []
for ticker in tickers:
    try:
        prices = wb.DataReader(ticker, start=start, end=end, data_source='yahoo')[['Adj Close']]
        price_data.append(prices.assign(ticker=ticker)[['ticker', 'Adj Close']])
    except:
        print(ticker)

df = pd.concat(price_data)
df.dtypes
df.head()
df.shape
# finally....
df.to_csv('file_name.csv')
Try that and post back if you need anything else related to this.
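If you do want to stick with the original scraping approach, the pattern for building a 30-day history is: fetch today's price once, append a single row to the CSV (opened in append mode, as noted above), and schedule the script to run once per day via cron or Task Scheduler. A minimal sketch of that pattern; get_price() reuses the selector from the question, which may break whenever Robinhood changes its markup:

import csv
import datetime

import pytz
import requests
from bs4 import BeautifulSoup

def get_price():
    # Same URL and selector as in the question; fragile by nature.
    r = requests.get('https://robinhood.com/collections/technology')
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup.find_all('a', {'class': 'rh-hyperlink'})[2].text

def append_daily_row(path='Catalyst Pharmaceuticals Monthly.csv'):
    price = get_price()
    timestamp = datetime.datetime.now(pytz.timezone("GMT")).strftime("%Y-%m-%d %H:%M:%S")
    # 'a' mode keeps the previous days' rows; newline='' avoids blank lines on Windows
    with open(path, 'a', newline='') as f:
        csv.writer(f).writerow([price, timestamp])

if __name__ == '__main__':
    append_daily_row()  # run once per day for 30 days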

How to see stock's symbol when using pandas

import pandas as pd
import pandas_datareader.data as web
from datetime import datetime

start_date = '2019-11-26'
end_date = str(datetime.now().strftime('%Y-%m-%d'))
tickers = ['IBM', 'AAPL', 'GOOG']
df = pd.concat([web.DataReader(ticker, 'yahoo', start_date, end_date) for ticker in tickers]).reset_index()
with pd.option_context('display.max_columns', 999):
    print(df)
When I run my code, I can see only the "Date High Low Open Close Volume Adj Close" columns.
What I want to see is each stock's name before the Date!
Please help me out...
DataReader always returns data without the stock's name, so you have to add the name yourself before you concatenate:
tickers = ['IBM', 'AAPL', 'GOOG']
data = []
for ticker in tickers:
    df = web.DataReader(ticker, 'yahoo', start_date, end_date)
    df['Name'] = ticker
    data.append(df)
df = pd.concat(data).reset_index()
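An alternative sketch that avoids the extra column assignment: pass keys= to pd.concat so the ticker becomes the outer index level, name the levels, and reset the index; the symbol then shows up as its own column before Date. This assumes the same 'yahoo' DataReader source as above.

import pandas as pd
import pandas_datareader.data as web

start_date = '2019-11-26'
end_date = pd.Timestamp.now().strftime('%Y-%m-%d')
tickers = ['IBM', 'AAPL', 'GOOG']

frames = [web.DataReader(t, 'yahoo', start_date, end_date) for t in tickers]

# keys= labels each frame with its ticker; names= names the resulting index levels
df = pd.concat(frames, keys=tickers, names=['Name', 'Date']).reset_index()
print(df.head())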

Sort by date with Excel file and Pandas

I am trying to sort my Excel file by the date column. When the code runs it converts the cells from text strings to datetimes and it sorts, but only within the same month. That is, when I have dates from both October and September, the rows stay grouped by month instead of being ordered oldest to newest.
I have been all over Google and YouTube.
import pandas as pd
import datetime
from datetime import timedelta
x = datetime.datetime.now()
excel_workbook = 'data.xlsx'
sheet1 = pd.read_excel(excel_workbook, sheet_name='RAW DATA')
sheet1['Call_DateTime'] = pd.to_datetime(sheet1['Call_DateTime'])
sheet1.sort_values(sheet1['Call_DateTime'], axis=1, ascending=True, inplace=True)
sheet1['SegmentDuration'] = pd.to_timedelta(sheet1['SegmentDuration'], unit='s')
sheet1['SegmentDuration'] = timedelta(hours=0.222)
sheet1.style.apply('h:mm:ss', column=['SegmentDuration'])
sheet1.to_excel("S4x Output"+x.strftime("%m-%d")+".xlsx", index = False)
print("All Set!!")
I would like it to sort oldest to newest.
Updated code below, and this works:
import pandas as pd
import datetime
from datetime import timedelta
x = datetime.datetime.now()
excel_workbook = 'data.xlsx'
sheet1 = pd.read_excel(excel_workbook, sheet_name='RAW DATA')
sheet1['Call_DateTime'] = pd.to_datetime(sheet1['Call_DateTime'])
sheet1.sort_values(['Call_DateTime'], axis=0, ascending=True, inplace=True)
sheet1['SegmentDuration'] = pd.to_timedelta(sheet1['SegmentDuration'], unit='s')
sheet1['SegmentDuration'] = timedelta(hours=0.222)
sheet1.style.apply('h:mm:ss', column=['SegmentDuration'])
sheet1.to_excel("S4x Output"+x.strftime("%m-%d")+".xlsx", index = False)
print("All Set!!")
