I wrote a script to scrape Yahoo Finance stock data using the Yahoo_Fin package
The aim of the script is to grab company financials to be able to perform some calculations. The input to the script is a txt file with a list of company ticker symbols. The output is also supposed to be a txt with only the companies that match a certain number of established criteria.
The script does occasionally work with a small txt file (20 tickers or less) however it does sometimes give me the following error (without me changing any code)
"None of ['Breakdown'] are in the columns" with Breakdown being the index column I set for the df.
I have run the script dozens of times and sometimes it works, sometimes it doesn't. Ran it in Atom and Jupyter Notebook and still have no clue what is causing the problem. I have also updated pandas and all necessary packages.
This is the code:
import pandas as pd
import statistics as stat
from yahoo_fin.stock_info import *
stock_list = [line.rstrip('\n') for line in open("test.txt", "r")]
#print(stock_list)
## The balance sheet df ##
balance_sheet = {ticker: get_balance_sheet(ticker)
for ticker in stock_list}
## The income statement df ##
income_statement = {ticker: get_income_statement(ticker)
for ticker in stock_list}
bs_data=[]
for i in range(0,len(stock_list)):
one_ticker = pd.DataFrame(balance_sheet[stock_list[i]])
one_ticker = one_ticker.set_index('Breakdown')
bs_data.append(one_ticker)
#print(bs_data)
income_data=[]
#one_ticker =[]
for i in range(0,len(stock_list)):
one_ticker = pd.DataFrame(income_statement[stock_list[i]])
one_ticker = one_ticker.set_index('Breakdown')
income_data.append(one_ticker)
#print(income_data)
## These are the balance sheet variables ##
for loop_counter in range(0,len(stock_list)):
# Total Assets
total_assets = (bs_data[loop_counter].loc['Total Assets'].astype(int))
avg_total_assets = stat.mean(total_assets)
#print(avg_total_assets)
# Total Current Liabilities
total_current_liabilities = (bs_data[loop_counter].loc['Total Current Liabilities'].astype(int))
avg_total_current_liabilities = stat.mean(total_current_liabilities)
#print(avg_total_current_liabilities)
#Total Liabilities
total_liabilities = (bs_data[loop_counter].loc['Total Liabilities'].astype(int))
avg_total_liabilities = stat.mean(total_liabilities)
#print(avg_total_liabilities)
## These are the income statement variables ##
# Total Revenue
total_revenue = (income_data[loop_counter].loc['Total Revenue']).astype(int)
avg_total_revenue = stat.mean(total_revenue)
#print(avg_total_revenue)
# Operating Income
operating_income = (income_data[loop_counter].loc['Operating Income or Loss']).astype(int)
avg_operating_income = stat.mean(operating_income)
#print(avg_operating_income)
# Total Operating Expenses
total_operating_expenses = (income_data[loop_counter].loc['Total Operating Expenses'].astype(int))
avg_total_operating_expenses = stat.mean(total_operating_expenses)
#print(avg_total_operating_expenses)
# EBIT
ebit = (avg_total_revenue-avg_total_operating_expenses)
#print(ebit)
## Calculations ##
opm = (avg_operating_income) / (avg_total_revenue)
#print(opm)
roce = (ebit) / ((avg_total_assets) - (avg_total_current_liabilities))
#print(roce)
leverage = (avg_total_liabilities) / (avg_total_assets)
#print(leverage)
#print("Leverage: " + str(round(leverage,2)))
#print("OPM: " + str(round(opm*100,2)) + "%")
#print("ROCE: " + str(round(roce*100,2)) + "%")
## Save to file ##
#print(leverage)
#print(opm)
#print(roce)
if leverage < 1.00 and roce >= 0.2 and opm >= 0.2:
#print("We have a match!")
outfile = open("results.txt", "a")
outfile.write(stock_list[loop_counter])
outfile.write("\n")
outfile.close()
Any clues to what might be the problem??
Update #2 Code:
import pandas as pd
import statistics as stat
from yahooquery import *
# Ticker input here
stock_list = [line.rstrip('\n') for line in open("test.txt", "r")]
#for stock in stock_list:
tickers = Ticker(stock_list)
# Get balance sheet
for stock in stock_list:
#print(stock)
bs = tickers.balance_sheet()
bs = pd.DataFrame(bs)
bs = bs.set_index('endDate')
#print(bs)
## Balance sheet variables to extract ##
# Total Assets
total_assets = bs['totalAssets']
avg_total_assets = stat.mean(total_assets)
# Total Current Liabilities
total_current_liabilities = bs['totalCurrentLiabilities']
avg_total_current_liabilities = stat.mean(total_current_liabilities)
# Total Liabilities
total_liabilities = bs['totalLiab']
avg_total_liabilities = stat.mean(total_liabilities)
## Get income statement ##
inst = tickers.income_statement()
inst = pd.DataFrame(inst)
inst = inst.set_index('endDate')
## Income statement variables to extract ##
# Total Revenue#
total_revenue = inst['totalRevenue']
avg_total_revenue = stat.mean(total_revenue)
# Operating Income
operating_income = inst['operatingIncome']
avg_operating_income = stat.mean(operating_income)
# Total Operating Expenses
total_operating_expenses = inst['totalOperatingExpenses']
avg_total_operating_expenses = stat.mean(total_operating_expenses)
# EBIT
ebit = (avg_total_revenue-avg_total_operating_expenses)
## Parameters ##
opm = (avg_operating_income) / (avg_total_revenue)
roce = (ebit) / ((avg_total_assets) - (avg_total_current_liabilities))
leverage = (avg_total_liabilities) / (avg_total_assets)
## Save to file ##
#print("Hello!")
if leverage < 1.00 and roce >= 0.2 and opm >= 0.2:
#print("Hello")
outfile = open("yahoo_query_results.txt", "w+")
outfile.write(stock)
outfile.write("\n")
outfile.close()
Related
I've written a python program that takes some inputs and turns them into a matplotlib graph. Specifically, it displays wealth distributions by percentile for a country of the user's choosing. However, these inputs are currently given by changing variables in the program.
I want to put this code on a website, allowing users to choose any country and see the wealth distribution for that country, as well as how they compare. Essentially, I am trying to recreate this: https://wid.world/income-comparator/
The code in python is all done but I am struggling to incorporate it into an HTML file. I was trying to use pyscript but it currently loads forever and displays nothing. Would rather not rewrite it in javascript (mainly because I don't know js). My thoughts are that it has something to do with the code importing csv files from my device?
import csv
from typing import List
import matplotlib.pyplot as plt
import collections
import math
from forex_python.converter import CurrencyRates
# ---------------- #
# whether or not the graph includes the top 1 percent in the graph (makes the rest of the graph visible!)
one_percent = False # True or False
# pick which country(ies) you want to view
country = 'China' # String
# what currency should the graph use
currency_used = 'Canada' # String
# if you want to compare an income
compare_income = True # True or False
# what income do you want to compare
income = 100000 # Int
# ---------------- #
codes = {}
# get dictionary of monetary country codes
monetary_codes = {}
with open('codes-all.csv') as csv_file:
list = csv.reader(csv_file, delimiter=',')
for row in list:
if row[5] == "":
monetary_codes[row[0]] = (row[2], row[1])
# get dictionary of country names and codes for WID
with open('WID_countries.csv') as csv_file:
WID_codes = csv.reader(csv_file, delimiter=',')
next(WID_codes)
for row in WID_codes:
if len(row[0]) == 2:
if row[2] != "":
monetary_code = monetary_codes[row[1].upper()][0]
currency_name = monetary_codes[row[1].upper()][1]
codes[row[1].upper()] = (row[0], monetary_code, currency_name)
elif row[2] == "":
codes[row[1].upper()] = (row[0], 'USD', 'United States Dollar')
elif row[0][0] == 'U' and row[0][1] == 'S':
codes[row[1].upper()] = (row[0], 'USD', 'United States Dollar')
# converts user input to upper case
country = country.upper()
currency_used = currency_used.upper()
# gets conversion rate
c = CurrencyRates()
conversion_rate = c.get_rate(codes[country][1], codes[currency_used][1])
# convert money into correct currency
def convert_money(conversion_rate, value):
return float(value) * conversion_rate
# get and clean data
def get_data(country):
aptinc = {}
# cleaning the data
with open(f'country_data/WID_data_{codes[country][0]}.csv') as csv_file:
data = csv.reader(csv_file, delimiter=';')
for row in data:
# I only care about the year 2021 and the variable 'aptinc'
if 'aptinc992' in row[1] and row[3] == '2021':
# translates percentile string into a numerical value
index = 0
for i in row[2]:
# index 0 is always 'p', so we get rid of that
if index == 0:
row[2] = row[2][1:]
# each string has a p in the middle of the numbers we care about. I also only
# care about the rows which measure a single percentile
# (upper bound - lower bound <= 1)
elif i == 'p':
lb = float(row[2][:index - 1])
ub = float(row[2][index:])
# if the top one percent is being filtered out adds another requirement
if not one_percent:
if ub - lb <= 1 and ub <= 99:
row[2] = ub
else:
row[2] = 0
else:
if ub - lb <= 1:
row[2] = ub
else: row[2] = 0
index += 1
# adds wanted, cleaned data to a dictionary. Also converts all values to one currency
if row[2] != 0:
aptinc[row[2]] = convert_money(conversion_rate, row[4])
return aptinc
# find the closest percentile to an income
def closest_percentile(income, data):
closest = math.inf
percentile = float()
for i in data:
difference = income - data[i]
if abs(difference) < closest:
closest = difference
percentile = i
return percentile
# ---------------- #
unsorted_data = {}
percentiles = []
average_income = []
# gets data for the country
data = get_data(country)
for i in data:
unsorted_data[i] = data[i]
# sorts the data
sorted = collections.OrderedDict(sorted(unsorted_data.items()))
for i in sorted:
percentiles.append(i)
average_income.append(data[i])
# makes countries pretty for printing
country = country.lower()
country = country.capitalize()
# calculates where the income places against incomes from country(ies)
blurb = ""
if compare_income:
percentile = closest_percentile(income, sorted)
blurb = f"You are richer than {round(percentile)} percent of {country}'s population"
# plot this data!
plt.plot(percentiles,average_income)
plt.title(f'{country} Average Annual Income by Percentile')
plt.xlabel(f'Percentile\n{blurb}')
plt.ylabel(f'Average Annual Income of {country}({codes[currency_used][1]})')
plt.axvline(x = 99, color = 'r', label = '99th percentile', linestyle=':')
if compare_income:
plt.axvline(x = percentile, color = 'g', label = f'{income} {codes[currency_used][2]}')
plt.legend(bbox_to_anchor = (0, 1), loc = 'upper left')
plt.show()
I am trying to implement a "user-friendly" portfolio optimization program in Python.
Since I am still a beginner I did not quite manage to realize it.
The only thing the program should use as input are the stock codes.
I tried to create a mwe below:
import numpy as np
import yfinance as yf
import pandas as pd
def daily_returns(price):
price = price.to_numpy()
shift_1 = price[1:]
shift_2 = price[:-1]
return (shift_1 - shift_2)/shift_1
def annual_returns(price):
price = price.to_numpy()
start = price[0]
end = price[len(price)-1]
return (end-start)/start
def adjusting(price):
adj = len(price)
diff = adj - adjvalue
if diff != 0:
price_new = price[:-diff]
else: price_new = price
return price_new
#Minimal Reproducible Example
#getting user input
names = input('Stock codes:')
names = names.split()
a = len(names)
msft = yf.Ticker(names[0])
aapl = yf.Ticker(names[1])
#import data
hist_msft = msft.history(interval='1d',start='2020-01-01',end='2020-12-31')
hist_msft = pd.DataFrame(hist_msft,columns=['Close'])
#hist_msft = hist_msft.to_numpy()
hist_aapl = aapl.history(interval='1d',start='2020-01-01',end='2020-12-31')
hist_aapl = pd.DataFrame(hist_aapl,columns=['Close'])
#hist_aapl = hist_aapl.to_numpy()
#daily returns
aapl_daily_returns = daily_returns(hist_aapl)
aapl_daily_returns = np.ravel(aapl_daily_returns)
msft_daily_returns = daily_returns(hist_msft)
msft_daily_returns = np.ravel(msft_daily_returns)
#adjusting for different trading periods
adjvalue = min(len(aapl_daily_returns),len(msft_daily_returns))
aapl_adj = adjusting(aapl_daily_returns)
msft_adj = adjusting(msft_daily_returns)
#annual returns
aapl_ann_returns = annual_returns(hist_aapl)
msft_ann_returns = annual_returns(hist_msft)
#inputs for optimization
cov_mat = np.cov([aapl_adj,msft_adj])*252
ann_returns = np.concatenate((aapl_ann_returns,msft_ann_returns))
Now I just want the code to work with a various, unknown number of inputs. I tried reading a lot about global variables or tried to figure it out with dictionaries but couldn't really achieve any progress.
I think using the for loop can solve your problem!
...
names = input('Stock codes:')
names = names.split()
for name in names:
#analyze here
#I don't know anything about stocks so I wont write anything here
...
I am trying to make this code look more attractive for potential employers that view it on my GitHub account. The code essentially loops through a CSV file and searches each symbol with the yfinance wrapper for the Yahoo-Finance API. It makes a few checks about the stock and decides whether it is a suitable investment. There are many try except clauses since API can return empty fields in the pandas dataframe. Currently I think it can be improved since it has multiple nested if statements with many try except statements. All feedback is greatly appreciated.
import yfinance as yf
import pandas as pd
import openpyxl
import csv
import math
import traceback
# Not a penny stock
# Earnings increase of at least 33% over 10 years using 3 year averages - 10% over 4 years since the API only contains the most recent 4 years
# Current price no more than 1.5x book value per share
# P/E ratio <= 15
# Long term debt no more than 110% current assets
# Current assets 1.5x current liabilities
symbol_array = []
failed_search = []
with open('companylist.csv') as file:
reader = csv.reader(file)
ticker_data = iter(reader) # skip the first value since it is the header
next(ticker_data)
for row in ticker_data:
ticker = row[0]
print('Searching: ', ticker)
try:
try:
company = yf.Ticker(ticker)
company_info = company.info
except:
print('Not a company')
continue # skip the ticker since it is not a company or the API doesn't have any information about the security
company_balance_sheet = company.balance_sheet
company_earnings = company.earnings
if company_balance_sheet.empty or company_earnings.empty:
continue # if balance sheets or earnings reports are not available, skip the search
column_date = company.balance_sheet.columns[0] # latest date on balance sheet to take data from
current_assets = company.balance_sheet.at['Total Current Assets', column_date]
try: # previous close price can be under 'previousClose' or 'regularMarketPrice' in company_info
current_price = company_info['previousClose']
except:
current_price = company_info['regularMarketPrice']
if current_price >= 10: # check if stock is penny stock
try:
long_term_debt = company.balance_sheet.at['Long Term Debt', column_date]
if math.isnan(long_term_debt):
long_term_debt = 0
except:
long_term_debt=0
if long_term_debt < (current_assets * 1.1):
current_liabilities = company.balance_sheet.at['Total Current Liabilities', column_date]
if current_liabilities < (1.5 * current_assets):
try:
pe_ratio = company_info['trailingPE'] # check if P/E ratio is available, assign pe_ratio 0 if it is not
except:
pe_ratio = 0
if pe_ratio <= 15:
try:
book_value = company_info['bookValue']
if type(book_value) != float: # book_value can be "None" in the company_info object
book_value = 0
except:
book_value = 0
if current_price < (book_value*1.5):
earnings_first = company.earnings.iat[0, 1]
earnings_last = company.earnings.iat[len(company.earnings)-1, 1]
if earnings_last >= earnings_first*1.1:
symbol_array.append(company_info['symbol'])
else:
print('Step 6 fail. Earnings growth too low')
else:
print('Step 5 fail. Current price too high')
else:
print('Step 4 fail. P/E ratio too high')
else:
print('Step 3 fail. Current liabilities too high')
else:
print('Step 2 fail. Long term debt too high')
else:
print('Step 1 fail. Penny stock')
except Exception as e:
print(traceback.format_exc()) # code to point out any errors in the main try statement
failed_search.append(ticker)
print(ticker, ' failed to search.')
print(e)
print('Failed searches:')
for failure in failed_search:
print(failure)
print('Potential Investments:')
for symbol in symbol_array:
print(symbol)
I pulled a Python script off of Github which is intended to analyze & rank stocks. I finally got it running but unfortunately the EV/EBITDA and Shareholder Yield are populating their default values, 1000 & 0 respectively.
I've spent the last few days attempting to troubleshoot, learning a lot in the process, but unfortunately had no luck.. I think it's attempting to extract data from a nonexistent line on the 'Scraper' portion or referencing an incorrect HTML. I'll paste the two code snips I think the error may lie within though the rest of the files are linked above.
Main File
from sys import stdout
from Stock import Stock
import Pickler
import Scraper
import Rankings
import Fixer
import Writer
# HTML error code handler - importing data is a chore, and getting a connection
# error halfway through is horribly demotivating. Use a pickler to serialize
# imported data into a hot-startable database.
pklFileName = 'tmpstocks.pkl'
pickler = Pickler.Pickler()
# Check if a pickled file exists. Load it if the user requests. If no file
# loaded, stocks is an empty list.
stocks = pickler.loadPickledFile(pklFileName)
# Scrape data from FINVIZ. Certain presets have been established (see direct
# link for more details)
url = 'http://finviz.com/screener.ashx?v=152&f=cap_smallover&' + \
'ft=4&c=0,1,2,6,7,10,11,13,14,45,65'
html = Scraper.importHtml(url)
# Parse the HTML for the number of pages from which we'll pull data
nPages = -1
for line in html:
if line[0:40] == '<option selected="selected" value=1>Page':
# Find indices
b1 = line.index('/') + 1
b2 = b1 + line[b1:].index('<')
# Number of pages containing stock data
nPages = int(line[b1:b2])
break
# Parse data from table on the first page of stocks and store in the database,
# but only if no data was pickled
if pickler.source == Pickler.PickleSource.NOPICKLE:
Scraper.importFinvizPage(html, stocks)
# The first page of stocks (20 stocks) has been imported. Now import the
# rest of them
source = Pickler.PickleSource.FINVIZ
iS = pickler.getIndex(source, 1, nPages + 1)
for i in range(iS, nPages + 1):
try:
# Print dynamic progress message
print('Importing FINVIZ metrics from page ' + str(i) + ' of ' + \
str(nPages) + '...', file=stdout, flush=True)
# Scrape data as before
url = 'http://finviz.com/screener.ashx?v=152&f=cap_smallover&ft=4&r=' + \
str(i*20+1) + '&c=0,1,2,6,7,10,11,13,14,45,65'
html = Scraper.importHtml(url)
# Import stock metrics from page into a buffer
bufferList = []
Scraper.importFinvizPage(html, bufferList)
# If no errors encountered, extend buffer to stocks list
stocks.extend(bufferList)
except:
# Error encountered. Pickle stocks for later loading
pickler.setError(source, i, stocks)
break
# FINVIZ stock metrics successfully imported
print('\n')
# Store number of stocks in list
nStocks = len(stocks)
# Handle pickle file
source = Pickler.PickleSource.YHOOEV
iS = pickler.getIndex(source, 0, nStocks)
# Grab EV/EBITDA metrics from Yahoo! Finance
for i in range(iS, nStocks):
try:
# Print dynamic progress message
print('Importing Key Statistics for ' + stocks[i].tick +
' (' + str(i) + '/' + str(nStocks - 1) + ') from Yahoo! Finance...', \
file=stdout, flush=True)
# Scrape data from Yahoo! Finance
url = 'http://finance.yahoo.com/q/ks?s=' + stocks[i].tick + '+Key+Statistics'
html = Scraper.importHtml(url)
# Parse data
for line in html:
# Check no value
if 'There is no Key Statistics' in line or \
'Get Quotes Results for' in line or \
'Changed Ticker Symbol' in line or \
'</html>' in line:
# Non-financial file (e.g. mutual fund) or
# Ticker not located or
# End of html page
stocks[i].evebitda = 1000
break
elif 'Enterprise Value/EBITDA' in line:
# Line contains EV/EBITDA data
evebitda = Scraper.readYahooEVEBITDA(line)
stocks[i].evebitda = evebitda
break
except:
# Error encountered. Pickle stocks for later loading
pickler.setError(source, i, stocks)
break
# Yahoo! Finance EV/EBITDA successfully imported
print('\n')
# Handle pickle file
source = Pickler.PickleSource.YHOOBBY
iS = pickler.getIndex(source, 0, nStocks)
# Grab BBY metrics from Yahoo! Finance
for i in range(iS, nStocks):
try:
# Print dynamic progress message
print('Importing Cash Flow for ' + stocks[i].tick +
' (' + str(i) + '/' + str(nStocks - 1) + ') from Yahoo! Finance...', \
file=stdout, flush=True)
# Scrape data from Yahoo! Finance
url = 'http://finance.yahoo.com/q/cf?s=' + stocks[i].tick + '&ql=1'
html = Scraper.importHtml(url)
# Parse data
totalBuysAndSells = 0
for line in html:
# Check no value
if 'There is no Cash Flow' in line or \
'Get Quotes Results for' in line or \
'Changed Ticker Symbol' in line or \
'</html>' in line:
# Non-financial file (e.g. mutual fund) or
# Ticker not located or
# End of html page
break
elif 'Sale Purchase of Stock' in line:
# Line contains Sale/Purchase of Stock information
totalBuysAndSells = Scraper.readYahooBBY(line)
break
# Calculate BBY as a percentage of current market cap
bby = round(-totalBuysAndSells / stocks[i].mktcap * 100, 2)
stocks[i].bby = bby
except:
# Error encountered. Pickle stocks for later loading
pickler.setError(source, i, stocks)
break
# Yahoo! Finance BBY successfully imported
if not pickler.hasErrorOccurred:
# All data imported
print('\n')
print('Fixing screener errors...')
# A number of stocks may have broken metrics. Fix these (i.e. assign out-of-
# bounds values) before sorting
stocks = Fixer.fixBrokenMetrics(stocks)
print('Ranking stocks...')
# Calculate shareholder Yield
for i in range(nStocks):
stocks[i].shy = stocks[i].div + stocks[i].bby
# Time to rank! Lowest value gets 100
rankPE = 100 * (1 - Rankings.rankByValue([o.pe for o in stocks]) / nStocks)
rankPS = 100 * (1 - Rankings.rankByValue([o.ps for o in stocks]) / nStocks)
rankPB = 100 * (1 - Rankings.rankByValue([o.pb for o in stocks]) / nStocks)
rankPFCF = 100 * (1 - Rankings.rankByValue([o.pfcf for o in stocks]) / nStocks)
rankEVEBITDA = 100 * (1 - Rankings.rankByValue([o.evebitda for o in stocks]) / nStocks)
# Shareholder yield ranked with highest getting 100
rankSHY = 100 * (Rankings.rankByValue([o.shy for o in stocks]) / nStocks)
# Rank total stock valuation
rankStock = rankPE + rankPS + rankPB + rankPFCF + rankEVEBITDA + rankSHY
# Rank 'em
rankOverall = Rankings.rankByValue(rankStock)
# Calculate Value Composite - higher the better
valueComposite = 100 * rankOverall / len(rankStock)
# Reverse indices - lower index -> better score
rankOverall = [len(rankStock) - 1 - x for x in rankOverall]
# Assign to stocks
for i in range(nStocks):
stocks[i].rank = rankOverall[i]
stocks[i].vc = round(valueComposite[i], 2)
print('Sorting stocks...')
# Sort all stocks by normalized rank
stocks = [x for (y, x) in sorted(zip(rankOverall, stocks))]
# Sort top decile by momentum factor. O'Shaughnessey historically uses 25
# stocks to hold. The top decile is printed, and the user may select the top 25
# (or any n) from the .csv file.
dec = int(nStocks / 10)
topDecile = []
# Store temporary momentums from top decile for sorting reasons
moms = [o.mom for o in stocks[:dec]]
# Sort top decile by momentum
for i in range(dec):
# Get index of top momentum performer in top decile
topMomInd = moms.index(max(moms))
# Sort
topDecile.append(stocks[topMomInd])
# Remove top momentum performer from further consideration
moms[topMomInd] = -100
print('Saving stocks...')
# Save momentum-weighted top decile
topCsvPath = 'top.csv'
Writer.writeCSV(topCsvPath, topDecile)
# Save results to .csv
allCsvPath = 'stocks.csv'
Writer.writeCSV(allCsvPath, stocks)
print('\n')
print('Complete.')
print('Top decile (sorted by momentum) saved to: ' + topCsvPath)
print('All stocks (sorted by trending value) saved to: ' + allCsvPath)
Scraper
import re
from urllib.request import urlopen
from Stock import Stock
def importHtml(url):
"Scrapes the HTML file from the given URL and returns line break delimited \
strings"
response = urlopen(url, data = None)
html = response.read().decode('utf-8').split('\n')
return html
def importFinvizPage(html, stocks):
"Imports data from a FINVIZ HTML page and stores in the list of Stock \
objects"
isFound = False
for line in html:
if line[0:15] == '<td height="10"':
isFound = True
# Import data line into stock database
_readFinvizLine(line, stocks)
if isFound and len(line) < 10:
break
return
def _readFinvizLine(line, stocks):
"Imports stock metrics from the data line and stores it in the list of \
Stock objects"
# Parse html
(stkraw, dl) = _parseHtml(line)
# Create new stock object
stock = Stock()
# Get ticker symbol
stock.tick = stkraw[dl[1] + 1: dl[2]]
# Get company name
stock.name = stkraw[dl[2] + 1 : dl[3]]
# Get market cap multiplier (either MM or BB)
if stkraw[dl[4] - 1] == 'B':
capmult = 1000000000
else:
capmult = 1000000
# Get market cap
stock.mktcap = capmult * _toFloat(stkraw[dl[3] + 1 : dl[4] - 1])
# Get P/E ratio
stock.pe = _toFloat(stkraw[dl[4] + 1 : dl[5]])
# Get P/S ratio
stock.ps = _toFloat(stkraw[dl[5] + 1 : dl[6]])
# Get P/B ratio
stock.pb = _toFloat(stkraw[dl[6] + 1 : dl[7]])
# Get P/FCF ratio
stock.pfcf = _toFloat(stkraw[dl[7] + 1 : dl[8]])
# Get Dividend Yield
stock.div = _toFloat(stkraw[dl[8] + 1 : dl[9] - 1])
# Get 6-mo Relative Price Strength
stock.mom = _toFloat(stkraw[dl[9] + 1 : dl[10] - 1])
# Get Current Stock Price
stock.price = _toFloat(stkraw[dl[11] + 1 : dl[12]])
# Append stock to list of stocks
stocks.append(stock)
return
def _toFloat(line):
"Converts a string to a float. Returns NaN if the line can't be converted"
try:
num = float(line)
except:
num = float('NaN')
return num
def readYahooEVEBITDA(line):
"Returns EV/EBITDA data from Yahoo! Finance HTML line"
# Parse html
(stkraw, dl) = _parseHtml(line)
for i in range(0, len(dl)):
if (stkraw[dl[i] + 1 : dl[i] + 24] == 'Enterprise Value/EBITDA'):
evebitda = stkraw[dl[i + 1] + 1 : dl[i + 2]]
break
return _toFloat(evebitda)
def readYahooBBY(line):
"Returns total buys and sells from Yahoo! Finance HTML line. Result will \
still need to be divided by market cap"
# Line also contains Borrowings details - Remove it all
if 'Net Borrowings' in line:
# Remove extra data
line = line[:line.find('Net Borrowings')]
# Trim prior data
line = line[line.find('Sale Purchase of Stock'):]
# Determine if buys or sells, replace open parantheses:
# (#,###) -> -#,###
line = re.sub(r'[(]', '-', line)
# Eliminate commas and close parantheses: -#,### -> -####
line = re.sub(r'[,|)]', '', line)
# Remove HTML data and markup, replacing with commas
line = re.sub(r'[<.*?>|]', ',', line)
line = re.sub(' ', ',', line)
# Locate the beginnings of each quarterly Sale Purchase points
starts = [m.start() for m in re.finditer(',\d+,|,.\d+', line)]
# Locate the ends of each quarterly Sale Purchase points
ends = [m.start() for m in re.finditer('\d,', line)]
# Sum all buys and sells across year
tot = 0
for i in range(0, len(starts)):
# x1000 because all numbers are in thousands
tot = tot + float(line[starts[i] + 1 : ends[i] + 1]) * 1000
return tot
def _parseHtml(line):
"Parses the HTML line by </td> breaks and returns the delimited string"
# Replace </td> breaks with placeholder, '`'
ph = '`'
rem = re.sub('</td>', ph, line)
# The ticker symbol initial delimiter is different
# Remove all other remaining HTML data
stkraw = re.sub('<.*?>', '', rem)
# Replace unbalanced HTML
stkraw = re.sub('">', '`', stkraw)
# Find the placeholders
dl = [m.start() for m in re.finditer(ph, stkraw)]
return (stkraw, dl)
If anyone has any input or perhaps a better method such as beautifulsoup, I'd really appreciate it! I'm very open to any tutorials that would help as well. My intent is to both better my programming ability and have an effective stock screener.
I was having the same issue scraping the Yahoo data in Python, and in Matlab as well. As a workaround, I wrote a macro in VBA to grab all of the EV/EBITDA data from Yahoo by visiting each stock's Key Statistics page. However, it takes about a day to run on all 3,000+ stocks with market caps over $200M, which is not really practical.
I've tried finding the EV/EBITDA on various stock screeners online, but they either don't report it or only let you download a couple hundred stocks' data without paying. Busy Stock's screener seems the best in this regard, but their EV/EBITDA figures don't line up to Yahoo's, which worries me that they are using different methodology.
One solution and my recommendation to you is to use the Trending Value algorithm in Quantopian, which is free. You can find the code here: https://www.quantopian.com/posts/oshaugnessy-what-works-on-wall-street
Quantopian will let you backtest the algorithm to 2002, and live test it as well.
I am trying to write a script to generate data. I am using random package for this. I execute the script and everything works fine. But when I check through the results, I found out that the script fails to generate the last 100+ rows for some reason.
Can someone suggest me why this could be happening?
from __future__ import print_function
from faker import Faker;
import random;
## Vaue declaration
population = 3;
product = 3;
years = 3;
months = 13;
days = 30;
tax= 3.5;
## Define Column Header
Column_Names = "Population_ID",";","Product_Name",";","Product_ID",";","Year",";",
"Month",";","Day","Quantity_sold",";","Sales_Price",";","Discount",
";","Actual_Sales_Price",tax;
## Function to generate sales related information
def sales_data():
for x in range(0,1):
quantity_sold = random.randint(5,20);
discount = random.choice(range(5,11));
sales_price = random.uniform(20,30);
return quantity_sold,round(sales_price,2),discount,round((sales_price)-(sales_price*discount)+(sales_price*tax));
## Format the month to quarter and return the value
def quarter(month):
if month >= 1 and month <= 3:
return "Q1";
elif month > 3 and month <= 6:
return "Q2";
elif month > 6 and month <= 9:
return "Q3";
else:
return "Q4";
## Generate product_id
def product_name():
str2 = "PROD";
sample2 = random.sample([1,2,3,4,5,6,7,8,9],5);
string_list = [];
for x in sample2:
string_list.append(str(x));
return (str2+''.join(string_list));
### Main starts here ###
result_log = open("C:/Users/Sangamesh.sangamad/Dropbox/Thesis/Data Preparation/GenData.csv",'w')
print (Column_Names, result_log);
### Loop and Generate Data ###
for pop in range(0,population):
pop = random.randint(55000,85000);
for prod_id in range(0,product):
product_name2 = product_name();
for year in range(1,years):
for month in range(1,months):
for day in range(1,31):
a = sales_data();
rows = str(pop)+";"+product_name2+";"+str(prod_id)+";"+str(year)+";"+str(month)+";"+quarter(month)+";"+str(day)+";"+str(a[0])+";"+str(a[1])+";"+str(a[2])+";"+str(tax)+";"+str(a[3]);
print(rows,file=result_log);
#print (rows);
tax = tax+1;
You need to close a file to have the buffers flushed:
result_log.close()
Better still, use the file object as a context manager and have the with statement close it for you when the block exits:
filename = "C:/Users/Sangamesh.sangamad/Dropbox/Thesis/Data Preparation/GenData.csv"
with result_log = open(filename, 'w'):
# code writing to result_log
Rather than manually writing strings with delimiters in between, you should really use the csv module:
import csv
# ..
column_names = (
"Population_ID", "Product_Name", "Product_ID", "Year",
"Month", "Day", "Quantity_sold", "Sales_Price", "Discount",
"Actual_Sales_Price", tax)
# ..
with result_log = open(filename, 'wb'):
writer = csv.writer(result_log, delimiter=';')
writer.writerow(column_names)
# looping
row = [pop, product_name2, prod_id, year, month, quarter(month), day,
a[0], a[1], a[2], tax, a[3]]
writer.writerow(row)