Improving a stock market algorithm - python

I am trying to make this code more attractive to potential employers who view it on my GitHub account. The code loops through a CSV file and looks up each symbol with the yfinance wrapper for the Yahoo Finance API. It makes a few checks about each stock and decides whether it is a suitable investment. There are many try/except clauses, since the API can return empty fields in the pandas DataFrame. I think the code can be improved, since it has multiple nested if statements and many try/except blocks. All feedback is greatly appreciated.
import yfinance as yf
import pandas as pd
import openpyxl
import csv
import math
import traceback

# Not a penny stock
# Earnings increase of at least 33% over 10 years using 3 year averages - 10% over 4 years since the API only contains the most recent 4 years
# Current price no more than 1.5x book value per share
# P/E ratio <= 15
# Long term debt no more than 110% current assets
# Current assets 1.5x current liabilities

symbol_array = []
failed_search = []

with open('companylist.csv') as file:
    reader = csv.reader(file)
    ticker_data = iter(reader)  # skip the first value since it is the header
    next(ticker_data)
    for row in ticker_data:
        ticker = row[0]
        print('Searching: ', ticker)
        try:
            try:
                company = yf.Ticker(ticker)
                company_info = company.info
            except:
                print('Not a company')
                continue  # skip the ticker since it is not a company or the API doesn't have any information about the security
            company_balance_sheet = company.balance_sheet
            company_earnings = company.earnings
            if company_balance_sheet.empty or company_earnings.empty:
                continue  # if balance sheets or earnings reports are not available, skip the search
            column_date = company.balance_sheet.columns[0]  # latest date on balance sheet to take data from
            current_assets = company.balance_sheet.at['Total Current Assets', column_date]
            try:  # previous close price can be under 'previousClose' or 'regularMarketPrice' in company_info
                current_price = company_info['previousClose']
            except:
                current_price = company_info['regularMarketPrice']
            if current_price >= 10:  # check if stock is penny stock
                try:
                    long_term_debt = company.balance_sheet.at['Long Term Debt', column_date]
                    if math.isnan(long_term_debt):
                        long_term_debt = 0
                except:
                    long_term_debt = 0
                if long_term_debt < (current_assets * 1.1):
                    current_liabilities = company.balance_sheet.at['Total Current Liabilities', column_date]
                    if current_liabilities < (1.5 * current_assets):
                        try:
                            pe_ratio = company_info['trailingPE']  # check if P/E ratio is available, assign pe_ratio 0 if it is not
                        except:
                            pe_ratio = 0
                        if pe_ratio <= 15:
                            try:
                                book_value = company_info['bookValue']
                                if type(book_value) != float:  # book_value can be "None" in the company_info object
                                    book_value = 0
                            except:
                                book_value = 0
                            if current_price < (book_value * 1.5):
                                earnings_first = company.earnings.iat[0, 1]
                                earnings_last = company.earnings.iat[len(company.earnings) - 1, 1]
                                if earnings_last >= earnings_first * 1.1:
                                    symbol_array.append(company_info['symbol'])
                                else:
                                    print('Step 6 fail. Earnings growth too low')
                            else:
                                print('Step 5 fail. Current price too high')
                        else:
                            print('Step 4 fail. P/E ratio too high')
                    else:
                        print('Step 3 fail. Current liabilities too high')
                else:
                    print('Step 2 fail. Long term debt too high')
            else:
                print('Step 1 fail. Penny stock')
        except Exception as e:
            print(traceback.format_exc())  # code to point out any errors in the main try statement
            failed_search.append(ticker)
            print(ticker, ' failed to search.')
            print(e)

print('Failed searches:')
for failure in failed_search:
    print(failure)
print('Potential Investments:')
for symbol in symbol_array:
    print(symbol)
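For reference, nesting like this can usually be flattened by inverting each check into an early exit. A minimal sketch of that shape, assuming the same criteria and field names as the code above; the helper name passes_screen and the dict.get fallbacks are illustrative, not part of the original code:

import math

def passes_screen(company_info, balance_sheet, earnings):
    """Early-exit version of the screen; True only if every criterion holds."""
    column_date = balance_sheet.columns[0]
    current_assets = balance_sheet.at['Total Current Assets', column_date]

    # The price can live under either key, so fall back with dict.get()
    current_price = company_info.get('previousClose') or company_info.get('regularMarketPrice')
    if current_price is None or current_price < 10:
        return False  # step 1: penny stock or no price data

    long_term_debt = 0
    if 'Long Term Debt' in balance_sheet.index:
        long_term_debt = balance_sheet.at['Long Term Debt', column_date]
        if math.isnan(long_term_debt):
            long_term_debt = 0
    if long_term_debt >= current_assets * 1.1:
        return False  # step 2: long term debt too high

    if balance_sheet.at['Total Current Liabilities', column_date] >= 1.5 * current_assets:
        return False  # step 3: current liabilities too high

    if company_info.get('trailingPE', 0) > 15:
        return False  # step 4: P/E ratio too high

    book_value = company_info.get('bookValue') or 0  # bookValue can be None
    if current_price >= book_value * 1.5:
        return False  # step 5: current price too high

    if earnings.iat[len(earnings) - 1, 1] < earnings.iat[0, 1] * 1.1:
        return False  # step 6: earnings growth too low

    return True

Each failed criterion could print its step message at the call site, or the function could return the failing step number instead of a bool.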

Related

List index out of range error when using Pandas and Yahoo_fin

This is a modified version of a program from a tutorial that extracts data for all of the stocks in the S&P 500 and picks the stocks that match the criteria you specify.
The issue is that when I run the program, "list index out of range [stock symbol]" pops up and those stocks are skipped and aren't added to the final CSV file.
Example:
list index out of range for ABMD
list index out of range for ABT
list index out of range for ADBE
list index out of range for ADI
I'm not really sure what the issue is; I would greatly appreciate it if someone would explain it to me! Also, I am not applying any of the filtering criteria yet and am just trying to get all of the stock data into the CSV file. Make sure to create a folder named stock_data if you try the program. Thanks!
My code:
import pandas_datareader as web
import pandas as pd
from yahoo_fin import stock_info as si
import datetime as dt

dow_list = si.tickers_dow()
sp_list = si.tickers_sp500()
tickers = sp_list
'''tickers = list(set(tickers))
tickers.sort()'''
start = dt.datetime.now() - dt.timedelta(days=365)
end = dt.datetime.now()

sp500_df = web.DataReader('^GSPC', 'yahoo', start, end)
sp500_df['Pct Change'] = sp500_df['Adj Close'].pct_change()
sp500_return = (sp500_df['Pct Change'] + 1).cumprod()[-1]

return_list = []
final_df = pd.DataFrame(columns=['Ticker', 'Latest_Price', 'Score', 'PE_Ratio', 'PEG_Ratio', 'SMA_150', 'SMA_200', '52_Week_Low', '52_Week_High'])

counter = 0
for ticker in tickers:
    df = web.DataReader(ticker, 'yahoo', start, end)
    df.to_csv(f'stock_data/{ticker}.csv')
    df['Pct Change'] = df['Adj Close'].pct_change()
    stock_return = (df['Pct Change'] + 1).cumprod()[-1]
    returns_compared = round((stock_return / sp500_return), 2)
    return_list.append(returns_compared)
    counter += 1
    if counter == 100:
        break

best_performers = pd.DataFrame(list(zip(tickers, return_list)), columns=['Ticker', 'Returns Compared'])
best_performers['Score'] = best_performers['Returns Compared'].rank(pct=True) * 100
best_performers = best_performers[best_performers['Score'] >= best_performers['Score'].quantile(0)]  # picks stocks in top 25 percentile

for ticker in best_performers['Ticker']:
    try:
        df = pd.read_csv(f'stock_data/{ticker}.csv', index_col=0)
        moving_averages = [150, 200]
        for ma in moving_averages:
            df['SMA_' + str(ma)] = round(df['Adj Close'].rolling(window=ma).mean(), 2)
        latest_price = df['Adj Close'][-1]
        pe_ratio = float(si.get_quote_table(ticker)['PE Ratio (TTM)'])
        peg_ratio = float(si.get_stats_valuation(ticker)[1][4])
        moving_average_150 = df['SMA_150'][-1]
        moving_average_200 = df['SMA_200'][-1]
        low_52week = round(min(df['Low'][-(52*5):]), 2)
        high_52week = round(min(df['High'][-(52 * 5):]), 2)
        score = round(best_performers[best_performers['Ticker'] == ticker]['Score'].tolist()[0])

        condition_1 = latest_price > moving_average_150 > moving_average_200
        condition_2 = latest_price >= (1.3 * low_52week)
        condition_3 = latest_price >= (0.75 * high_52week)
        condition_4 = pe_ratio < 25
        condition_5 = peg_ratio < 2

        final_df = final_df.append({'Ticker': ticker,
                                    'Latest_Price': latest_price,
                                    'Score': score,
                                    'PE_Ratio': pe_ratio,
                                    'PEG_Ratio': peg_ratio,
                                    'SMA_150': moving_average_150,
                                    'SMA_200': moving_average_200,
                                    '52_Week_Low': low_52week,
                                    '52_Week_High': high_52week}, ignore_index=True)
    except Exception as e:
        print(f"{e} for {ticker}")

final_df.sort_values(by='Score', ascending=False)
pd.set_option('display.max_columns', 10)
print(final_df)
final_df.to_csv('final.csv')
I have done the troubleshooting on your behalf. In short, you never check the contents of the individual indicator data you fetch: the values come back in dictionary form and as indexed, named Series (sometimes empty DataFrames), and you index into them positionally. I believe that is the root cause of the error. Specifically:
Select the last row and retrieve its value explicitly (e.g. with [-1:].values[0]); iloc is not used in your code.
A 52*5 lookback asks for 260 rows, but one year of daily data has only about 253.
In addition, when the extra indicators are requested for a ticker whose price data was acquired, there are cases where they can be obtained and cases where they cannot (the cause is unknown). Therefore, it may be necessary to obtain pe_ratio and peg_ratio up front and change how they are processed afterwards.
for ticker in best_performers['Ticker']:
    #print(ticker)
    try:
        df = pd.read_csv(f'stock_data/{ticker}.csv')  #, index_col=0
        moving_averages = [150, 200]
        for ma in moving_averages:
            df['SMA_' + str(ma)] = round(df['Adj Close'].rolling(window=ma).mean(), 2)
        latest_price = df['Adj Close'][-1:].values[0]
        pe_ratio = float(si.get_quote_table(ticker)['PE Ratio (TTM)'])
        moving_average_150 = df['SMA_150'][-1:].values[0]
        moving_average_200 = df['SMA_200'][-1:].values[0]
        low_52week = round(min(df['Low'][-(52*1):]), 2)
        high_52week = round(min(df['High'][-(52*1):]), 2)
        #print(low_52week, high_52week)
        score = round(best_performers[best_performers['Ticker'] == ticker]['Score'].tolist()[0])
        #print(score)
        #print(ticker, latest_price, score, pe_ratio, moving_average_200, low_52week, high_52week)
        final_df = final_df.append({'Ticker': ticker,
                                    'Latest_Price': latest_price,
                                    'Score': score,
                                    'PE_Ratio': pe_ratio,
                                    'SMA_150': moving_average_150,
                                    'SMA_200': moving_average_200,
                                    '52_Week_Low': low_52week,
                                    '52_Week_High': high_52week}, ignore_index=True)
        #print(final_df)
    except Exception as e:
        print(f"{e} for {ticker}")

final_df
Ticker Latest_Price Score PE_Ratio SMA_150 SMA_200 52_Week_Low 52_Week_High
0 A 123.839996 40 31.42 147.26 150.31 123.06 126.75
1 AAP 218.250000 70 22.23 220.66 216.64 190.79 202.04
2 AAPL 165.070007 80 29.42 161.85 158.24 150.10 154.12
3 ABC 161.899994 90 21.91 132.94 129.33 132.00 137.79
4 ADBE 425.470001 10 42.46 552.19 571.99 407.94 422.38
Note
Some stocks are missing because the additional indicators could not be obtained.
(Tested on the first 10 tickers: tickers = sp_list[:10].)
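Following the last point above, here is a small sketch of fetching the flaky indicators up front with safe defaults; the helper name safe_ratios and the NaN fallbacks are my own, not from the answer:

from yahoo_fin import stock_info as si

def safe_ratios(ticker):
    """Fetch PE and PEG before the main processing; NaN marks a failed lookup."""
    try:
        pe_ratio = float(si.get_quote_table(ticker)['PE Ratio (TTM)'])
    except Exception:
        pe_ratio = float('nan')  # quote table missing or field absent
    try:
        peg_ratio = float(si.get_stats_valuation(ticker)[1][4])
    except Exception:
        peg_ratio = float('nan')  # valuation table missing or reshaped
    return pe_ratio, peg_ratio

Tickers with NaN ratios can then be skipped or filled explicitly, instead of the whole row being dropped inside the big try/except.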

How to loop over the tickers and save the results in a dictionary

I’m not so sure how to do the last part, which is the dictionary part and the ticker part. Also, on
file = open("/home/ubuntu/environment/hw5/" + tickers + ".txt")
this line keeps showing
TypeError: must be str, not list
Any suggestions on how to fix those or make the code work?
Here’s my code
import json

def meanReversionStrategy(prices):
    total_profit = 0
    first_buy = None
    buy = 0
    for i in range(len(prices)):
        if i >= 5:
            current_price = prices[i]
            moving_average = (prices[i-1] + prices[i-2] + prices[i-3] + prices[i-4] + prices[i-5]) / 5
            if current_price < moving_average * 0.95 and buy == 0:
                buy = current_price
                print("buy at: ", round(current_price, 2))
                if first_buy is None:
                    first_buy = buy
            elif current_price > moving_average * 1.05 and buy != 0:
                print("sell at: ", round(current_price, 2))
                print("trade profit: ", round(current_price - buy, 2))
                total_profit = current_price - buy
                buy = 0
    final_profit_percentage = (total_profit / first_buy) * 100
    print("First buy: ", round(first_buy, 2))
    print("Total profit: ", round(total_profit, 2))
    print("Percentage return: ", round(final_profit_percentage, 2), "%")

def simpleMovingAverageStrategy(prices):
    i = 0
    buy = 0
    total_profit = 0
    first_buy = 0
    for p in prices:
        if i >= 5:
            moving_average = (prices[i-1] + prices[i-2] + prices[i-3] + prices[i-4] +
                              prices[i-5]) / 5
            # simple moving average logic, not mean
            if p > moving_average and buy == 0:  # buy
                print("buying at: ", p)
                buy = p
                if first_buy == 0:
                    first_buy = p
            elif p < moving_average and buy != 0:  # sell
                print("selling at: ", p)
                print("trade profit: ", p - buy)
                total_profit += p - buy
                buy = 0
        i += 1
    final_percentage = (total_profit / first_buy) * 100
    print("first buy: ", first_buy)
    print("total profit: ", total_profit)
    print("final percentage: ", final_percentage, "%")
    return total_profit, final_percentage

tickers = ["AAPL1", "ADBE", "BA", "CMCSA", "CSCO", "CVS", "GOOG", "TLSYY", "TM"]
file = open("/home/ubuntu/environment/hw5/" + tickers + ".txt")
lines = file.readlines()
# print(lines)
prices = []
for line in lines:
    prices.append(float(line))
profit, returns = simpleMovingAverageStrategy(prices)

results = {}
results["AAPL1_profit"] = profit
results["AAPL1_returns"] = returns
json.dump(results, open("/home/ubuntu/environment/hw5/results.json", "w"))
Coding Requirements
-Create a function called meanReversionStrategy which takes a list called "prices" as an argument. The function runs a mean reversion strategy, and outputs to the console the buys and sells of the strategy (like you did in HW4). The function returns the profit and final returns percentage.
-Create a function called simpleMovingAverageStrategy which takes a list called "prices" as an argument. The function runs a Simple Moving Average strategy, and outputs to the console the buys and sells of the strategy. The function returns the profit and final returns percentage.
-Create a function called saveResults which takes a dictionary as an argument. Save the dictionary to a json file called "results.json".
-Loop through the list of tickers (for ticker in tickers:). For each ticker:
  -load prices from a file <ticker>.txt, and store them in the results dictionary with the key "<ticker>_prices"
  -call meanReversionStrategy(prices) and store the profit and returns in the results dictionary with the keys "<ticker>_mr_profit" and "<ticker>_mr_returns"
  -call simpleMovingAverageStrategy(prices) and store the profit and returns in the results dictionary with the keys "<ticker>_sma_profit" and "<ticker>_sma_returns"
-Call saveResults(results) to save the results dictionary to a file called results.json.
Welcome!
As this is a homework question, I will not solve it for you, but here is my advice:
tickers is a list, and as your homework description states, you must 'loop through the list of tickers' and open each of them. So how do you loop over tickers?
And reflect on the error: what do you think the result of "/home/ubuntu/environment/hw5/" + tickers + ".txt" should be?

Key Error: "None of --- are in the columns"

I wrote a script to scrape Yahoo Finance stock data using the yahoo_fin package.
The aim of the script is to grab company financials so I can perform some calculations. The input to the script is a txt file with a list of company ticker symbols. The output is also supposed to be a txt file containing only the companies that match a certain number of established criteria.
The script does occasionally work with a small txt file (20 tickers or fewer), but it sometimes gives me the following error (without me changing any code):
"None of ['Breakdown'] are in the columns", with Breakdown being the index column I set for the df.
I have run the script dozens of times; sometimes it works, sometimes it doesn't. I ran it in Atom and in a Jupyter Notebook and still have no clue what is causing the problem. I have also updated pandas and all the necessary packages.
This is the code:
import pandas as pd
import statistics as stat
from yahoo_fin.stock_info import *

stock_list = [line.rstrip('\n') for line in open("test.txt", "r")]
#print(stock_list)

## The balance sheet df ##
balance_sheet = {ticker: get_balance_sheet(ticker)
                 for ticker in stock_list}

## The income statement df ##
income_statement = {ticker: get_income_statement(ticker)
                    for ticker in stock_list}

bs_data = []
for i in range(0, len(stock_list)):
    one_ticker = pd.DataFrame(balance_sheet[stock_list[i]])
    one_ticker = one_ticker.set_index('Breakdown')
    bs_data.append(one_ticker)
#print(bs_data)

income_data = []
#one_ticker = []
for i in range(0, len(stock_list)):
    one_ticker = pd.DataFrame(income_statement[stock_list[i]])
    one_ticker = one_ticker.set_index('Breakdown')
    income_data.append(one_ticker)
#print(income_data)

## These are the balance sheet variables ##
for loop_counter in range(0, len(stock_list)):
    # Total Assets
    total_assets = (bs_data[loop_counter].loc['Total Assets'].astype(int))
    avg_total_assets = stat.mean(total_assets)
    #print(avg_total_assets)

    # Total Current Liabilities
    total_current_liabilities = (bs_data[loop_counter].loc['Total Current Liabilities'].astype(int))
    avg_total_current_liabilities = stat.mean(total_current_liabilities)
    #print(avg_total_current_liabilities)

    # Total Liabilities
    total_liabilities = (bs_data[loop_counter].loc['Total Liabilities'].astype(int))
    avg_total_liabilities = stat.mean(total_liabilities)
    #print(avg_total_liabilities)

    ## These are the income statement variables ##
    # Total Revenue
    total_revenue = (income_data[loop_counter].loc['Total Revenue']).astype(int)
    avg_total_revenue = stat.mean(total_revenue)
    #print(avg_total_revenue)

    # Operating Income
    operating_income = (income_data[loop_counter].loc['Operating Income or Loss']).astype(int)
    avg_operating_income = stat.mean(operating_income)
    #print(avg_operating_income)

    # Total Operating Expenses
    total_operating_expenses = (income_data[loop_counter].loc['Total Operating Expenses'].astype(int))
    avg_total_operating_expenses = stat.mean(total_operating_expenses)
    #print(avg_total_operating_expenses)

    # EBIT
    ebit = (avg_total_revenue - avg_total_operating_expenses)
    #print(ebit)

    ## Calculations ##
    opm = (avg_operating_income) / (avg_total_revenue)
    #print(opm)
    roce = (ebit) / ((avg_total_assets) - (avg_total_current_liabilities))
    #print(roce)
    leverage = (avg_total_liabilities) / (avg_total_assets)
    #print(leverage)
    #print("Leverage: " + str(round(leverage,2)))
    #print("OPM: " + str(round(opm*100,2)) + "%")
    #print("ROCE: " + str(round(roce*100,2)) + "%")

    ## Save to file ##
    #print(leverage)
    #print(opm)
    #print(roce)
    if leverage < 1.00 and roce >= 0.2 and opm >= 0.2:
        #print("We have a match!")
        outfile = open("results.txt", "a")
        outfile.write(stock_list[loop_counter])
        outfile.write("\n")
        outfile.close()
Any clues to what might be the problem??
Update #2 Code:
import pandas as pd
import statistics as stat
from yahooquery import *

# Ticker input here
stock_list = [line.rstrip('\n') for line in open("test.txt", "r")]
#for stock in stock_list:
tickers = Ticker(stock_list)

# Get balance sheet
for stock in stock_list:
    #print(stock)
    bs = tickers.balance_sheet()
    bs = pd.DataFrame(bs)
    bs = bs.set_index('endDate')
    #print(bs)

    ## Balance sheet variables to extract ##
    # Total Assets
    total_assets = bs['totalAssets']
    avg_total_assets = stat.mean(total_assets)

    # Total Current Liabilities
    total_current_liabilities = bs['totalCurrentLiabilities']
    avg_total_current_liabilities = stat.mean(total_current_liabilities)

    # Total Liabilities
    total_liabilities = bs['totalLiab']
    avg_total_liabilities = stat.mean(total_liabilities)

    ## Get income statement ##
    inst = tickers.income_statement()
    inst = pd.DataFrame(inst)
    inst = inst.set_index('endDate')

    ## Income statement variables to extract ##
    # Total Revenue
    total_revenue = inst['totalRevenue']
    avg_total_revenue = stat.mean(total_revenue)

    # Operating Income
    operating_income = inst['operatingIncome']
    avg_operating_income = stat.mean(operating_income)

    # Total Operating Expenses
    total_operating_expenses = inst['totalOperatingExpenses']
    avg_total_operating_expenses = stat.mean(total_operating_expenses)

    # EBIT
    ebit = (avg_total_revenue - avg_total_operating_expenses)

    ## Parameters ##
    opm = (avg_operating_income) / (avg_total_revenue)
    roce = (ebit) / ((avg_total_assets) - (avg_total_current_liabilities))
    leverage = (avg_total_liabilities) / (avg_total_assets)

    ## Save to file ##
    #print("Hello!")
    if leverage < 1.00 and roce >= 0.2 and opm >= 0.2:
        #print("Hello")
        outfile = open("yahoo_query_results.txt", "w+")
        outfile.write(stock)
        outfile.write("\n")
        outfile.close()
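For what it's worth, the intermittent "None of ['Breakdown'] are in the columns" usually means the fetched frame came back empty or already indexed, so set_index('Breakdown') has nothing to bind to. A defensive wrapper (a sketch; the helper name is mine, not from the post) makes that case explicit instead of raising mid-loop:

import pandas as pd

def indexed_or_none(raw, index_col='Breakdown'):
    """Return raw as a DataFrame indexed by index_col, or None if unusable."""
    df = pd.DataFrame(raw)
    if df.empty:
        return None  # the API returned nothing for this ticker
    if index_col in df.columns:
        return df.set_index(index_col)
    if df.index.name == index_col:
        return df  # already indexed; nothing to do
    return None  # unexpected shape - better to skip this ticker than raise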

Issues scraping EV/EBITDA, Sale Purchase of Stock & Net Borrowings from Yahoo Finance

I pulled a Python script off of GitHub which is intended to analyze and rank stocks. I finally got it running, but unfortunately the EV/EBITDA and Shareholder Yield are populating with their default values, 1000 and 0 respectively.
I've spent the last few days attempting to troubleshoot, learning a lot in the process, but unfortunately I've had no luck. I think it's attempting to extract data from a nonexistent line in the 'Scraper' portion, or referencing incorrect HTML. I'll paste the two code snippets I think the error may lie within, though the rest of the files are linked above.
Main File
from sys import stdout
from Stock import Stock
import Pickler
import Scraper
import Rankings
import Fixer
import Writer

# HTML error code handler - importing data is a chore, and getting a connection
# error halfway through is horribly demotivating. Use a pickler to serialize
# imported data into a hot-startable database.
pklFileName = 'tmpstocks.pkl'
pickler = Pickler.Pickler()

# Check if a pickled file exists. Load it if the user requests. If no file
# loaded, stocks is an empty list.
stocks = pickler.loadPickledFile(pklFileName)

# Scrape data from FINVIZ. Certain presets have been established (see direct
# link for more details)
url = 'http://finviz.com/screener.ashx?v=152&f=cap_smallover&' + \
      'ft=4&c=0,1,2,6,7,10,11,13,14,45,65'
html = Scraper.importHtml(url)

# Parse the HTML for the number of pages from which we'll pull data
nPages = -1
for line in html:
    if line[0:40] == '<option selected="selected" value=1>Page':
        # Find indices
        b1 = line.index('/') + 1
        b2 = b1 + line[b1:].index('<')
        # Number of pages containing stock data
        nPages = int(line[b1:b2])
        break

# Parse data from table on the first page of stocks and store in the database,
# but only if no data was pickled
if pickler.source == Pickler.PickleSource.NOPICKLE:
    Scraper.importFinvizPage(html, stocks)

# The first page of stocks (20 stocks) has been imported. Now import the
# rest of them
source = Pickler.PickleSource.FINVIZ
iS = pickler.getIndex(source, 1, nPages + 1)

for i in range(iS, nPages + 1):
    try:
        # Print dynamic progress message
        print('Importing FINVIZ metrics from page ' + str(i) + ' of ' +
              str(nPages) + '...', file=stdout, flush=True)

        # Scrape data as before
        url = 'http://finviz.com/screener.ashx?v=152&f=cap_smallover&ft=4&r=' + \
              str(i*20+1) + '&c=0,1,2,6,7,10,11,13,14,45,65'
        html = Scraper.importHtml(url)

        # Import stock metrics from page into a buffer
        bufferList = []
        Scraper.importFinvizPage(html, bufferList)

        # If no errors encountered, extend buffer to stocks list
        stocks.extend(bufferList)
    except:
        # Error encountered. Pickle stocks for later loading
        pickler.setError(source, i, stocks)
        break

# FINVIZ stock metrics successfully imported
print('\n')

# Store number of stocks in list
nStocks = len(stocks)

# Handle pickle file
source = Pickler.PickleSource.YHOOEV
iS = pickler.getIndex(source, 0, nStocks)

# Grab EV/EBITDA metrics from Yahoo! Finance
for i in range(iS, nStocks):
    try:
        # Print dynamic progress message
        print('Importing Key Statistics for ' + stocks[i].tick +
              ' (' + str(i) + '/' + str(nStocks - 1) + ') from Yahoo! Finance...',
              file=stdout, flush=True)

        # Scrape data from Yahoo! Finance
        url = 'http://finance.yahoo.com/q/ks?s=' + stocks[i].tick + '+Key+Statistics'
        html = Scraper.importHtml(url)

        # Parse data
        for line in html:
            # Check no value
            if 'There is no Key Statistics' in line or \
                    'Get Quotes Results for' in line or \
                    'Changed Ticker Symbol' in line or \
                    '</html>' in line:
                # Non-financial file (e.g. mutual fund) or
                # Ticker not located or
                # End of html page
                stocks[i].evebitda = 1000
                break
            elif 'Enterprise Value/EBITDA' in line:
                # Line contains EV/EBITDA data
                evebitda = Scraper.readYahooEVEBITDA(line)
                stocks[i].evebitda = evebitda
                break
    except:
        # Error encountered. Pickle stocks for later loading
        pickler.setError(source, i, stocks)
        break

# Yahoo! Finance EV/EBITDA successfully imported
print('\n')

# Handle pickle file
source = Pickler.PickleSource.YHOOBBY
iS = pickler.getIndex(source, 0, nStocks)

# Grab BBY metrics from Yahoo! Finance
for i in range(iS, nStocks):
    try:
        # Print dynamic progress message
        print('Importing Cash Flow for ' + stocks[i].tick +
              ' (' + str(i) + '/' + str(nStocks - 1) + ') from Yahoo! Finance...',
              file=stdout, flush=True)

        # Scrape data from Yahoo! Finance
        url = 'http://finance.yahoo.com/q/cf?s=' + stocks[i].tick + '&ql=1'
        html = Scraper.importHtml(url)

        # Parse data
        totalBuysAndSells = 0
        for line in html:
            # Check no value
            if 'There is no Cash Flow' in line or \
                    'Get Quotes Results for' in line or \
                    'Changed Ticker Symbol' in line or \
                    '</html>' in line:
                # Non-financial file (e.g. mutual fund) or
                # Ticker not located or
                # End of html page
                break
            elif 'Sale Purchase of Stock' in line:
                # Line contains Sale/Purchase of Stock information
                totalBuysAndSells = Scraper.readYahooBBY(line)
                break

        # Calculate BBY as a percentage of current market cap
        bby = round(-totalBuysAndSells / stocks[i].mktcap * 100, 2)
        stocks[i].bby = bby
    except:
        # Error encountered. Pickle stocks for later loading
        pickler.setError(source, i, stocks)
        break

# Yahoo! Finance BBY successfully imported
if not pickler.hasErrorOccurred:
    # All data imported
    print('\n')
    print('Fixing screener errors...')

    # A number of stocks may have broken metrics. Fix these (i.e. assign out-of-
    # bounds values) before sorting
    stocks = Fixer.fixBrokenMetrics(stocks)

    print('Ranking stocks...')

    # Calculate shareholder Yield
    for i in range(nStocks):
        stocks[i].shy = stocks[i].div + stocks[i].bby

    # Time to rank! Lowest value gets 100
    rankPE = 100 * (1 - Rankings.rankByValue([o.pe for o in stocks]) / nStocks)
    rankPS = 100 * (1 - Rankings.rankByValue([o.ps for o in stocks]) / nStocks)
    rankPB = 100 * (1 - Rankings.rankByValue([o.pb for o in stocks]) / nStocks)
    rankPFCF = 100 * (1 - Rankings.rankByValue([o.pfcf for o in stocks]) / nStocks)
    rankEVEBITDA = 100 * (1 - Rankings.rankByValue([o.evebitda for o in stocks]) / nStocks)

    # Shareholder yield ranked with highest getting 100
    rankSHY = 100 * (Rankings.rankByValue([o.shy for o in stocks]) / nStocks)

    # Rank total stock valuation
    rankStock = rankPE + rankPS + rankPB + rankPFCF + rankEVEBITDA + rankSHY

    # Rank 'em
    rankOverall = Rankings.rankByValue(rankStock)

    # Calculate Value Composite - higher the better
    valueComposite = 100 * rankOverall / len(rankStock)

    # Reverse indices - lower index -> better score
    rankOverall = [len(rankStock) - 1 - x for x in rankOverall]

    # Assign to stocks
    for i in range(nStocks):
        stocks[i].rank = rankOverall[i]
        stocks[i].vc = round(valueComposite[i], 2)

    print('Sorting stocks...')

    # Sort all stocks by normalized rank
    stocks = [x for (y, x) in sorted(zip(rankOverall, stocks))]

    # Sort top decile by momentum factor. O'Shaughnessey historically uses 25
    # stocks to hold. The top decile is printed, and the user may select the top 25
    # (or any n) from the .csv file.
    dec = int(nStocks / 10)
    topDecile = []

    # Store temporary momentums from top decile for sorting reasons
    moms = [o.mom for o in stocks[:dec]]

    # Sort top decile by momentum
    for i in range(dec):
        # Get index of top momentum performer in top decile
        topMomInd = moms.index(max(moms))
        # Sort
        topDecile.append(stocks[topMomInd])
        # Remove top momentum performer from further consideration
        moms[topMomInd] = -100

    print('Saving stocks...')

    # Save momentum-weighted top decile
    topCsvPath = 'top.csv'
    Writer.writeCSV(topCsvPath, topDecile)

    # Save results to .csv
    allCsvPath = 'stocks.csv'
    Writer.writeCSV(allCsvPath, stocks)

    print('\n')
    print('Complete.')
    print('Top decile (sorted by momentum) saved to: ' + topCsvPath)
    print('All stocks (sorted by trending value) saved to: ' + allCsvPath)
Scraper
import re
from urllib.request import urlopen
from Stock import Stock

def importHtml(url):
    "Scrapes the HTML file from the given URL and returns line break delimited strings"
    response = urlopen(url, data=None)
    html = response.read().decode('utf-8').split('\n')
    return html

def importFinvizPage(html, stocks):
    "Imports data from a FINVIZ HTML page and stores in the list of Stock objects"
    isFound = False
    for line in html:
        if line[0:15] == '<td height="10"':
            isFound = True
            # Import data line into stock database
            _readFinvizLine(line, stocks)
        if isFound and len(line) < 10:
            break
    return

def _readFinvizLine(line, stocks):
    "Imports stock metrics from the data line and stores it in the list of Stock objects"
    # Parse html
    (stkraw, dl) = _parseHtml(line)

    # Create new stock object
    stock = Stock()

    # Get ticker symbol
    stock.tick = stkraw[dl[1] + 1 : dl[2]]
    # Get company name
    stock.name = stkraw[dl[2] + 1 : dl[3]]

    # Get market cap multiplier (either MM or BB)
    if stkraw[dl[4] - 1] == 'B':
        capmult = 1000000000
    else:
        capmult = 1000000

    # Get market cap
    stock.mktcap = capmult * _toFloat(stkraw[dl[3] + 1 : dl[4] - 1])
    # Get P/E ratio
    stock.pe = _toFloat(stkraw[dl[4] + 1 : dl[5]])
    # Get P/S ratio
    stock.ps = _toFloat(stkraw[dl[5] + 1 : dl[6]])
    # Get P/B ratio
    stock.pb = _toFloat(stkraw[dl[6] + 1 : dl[7]])
    # Get P/FCF ratio
    stock.pfcf = _toFloat(stkraw[dl[7] + 1 : dl[8]])
    # Get Dividend Yield
    stock.div = _toFloat(stkraw[dl[8] + 1 : dl[9] - 1])
    # Get 6-mo Relative Price Strength
    stock.mom = _toFloat(stkraw[dl[9] + 1 : dl[10] - 1])
    # Get Current Stock Price
    stock.price = _toFloat(stkraw[dl[11] + 1 : dl[12]])

    # Append stock to list of stocks
    stocks.append(stock)
    return

def _toFloat(line):
    "Converts a string to a float. Returns NaN if the line can't be converted"
    try:
        num = float(line)
    except:
        num = float('NaN')
    return num

def readYahooEVEBITDA(line):
    "Returns EV/EBITDA data from Yahoo! Finance HTML line"
    # Parse html
    (stkraw, dl) = _parseHtml(line)

    for i in range(0, len(dl)):
        if (stkraw[dl[i] + 1 : dl[i] + 24] == 'Enterprise Value/EBITDA'):
            evebitda = stkraw[dl[i + 1] + 1 : dl[i + 2]]
            break

    return _toFloat(evebitda)

def readYahooBBY(line):
    "Returns total buys and sells from Yahoo! Finance HTML line. Result will still need to be divided by market cap"
    # Line also contains Borrowings details - Remove it all
    if 'Net Borrowings' in line:
        # Remove extra data
        line = line[:line.find('Net Borrowings')]

    # Trim prior data
    line = line[line.find('Sale Purchase of Stock'):]

    # Determine if buys or sells, replace open parentheses:
    # (#,###) -> -#,###
    line = re.sub(r'[(]', '-', line)
    # Eliminate commas and close parentheses: -#,### -> -####
    line = re.sub(r'[,|)]', '', line)

    # Remove HTML data and markup, replacing with commas
    line = re.sub(r'[<.*?>|]', ',', line)
    line = re.sub(' ', ',', line)

    # Locate the beginnings of each quarterly Sale Purchase point
    starts = [m.start() for m in re.finditer(',\d+,|,.\d+', line)]
    # Locate the ends of each quarterly Sale Purchase point
    ends = [m.start() for m in re.finditer('\d,', line)]

    # Sum all buys and sells across the year
    tot = 0
    for i in range(0, len(starts)):
        # x1000 because all numbers are in thousands
        tot = tot + float(line[starts[i] + 1 : ends[i] + 1]) * 1000

    return tot

def _parseHtml(line):
    "Parses the HTML line by </td> breaks and returns the delimited string"
    # Replace </td> breaks with placeholder, '`'
    ph = '`'
    rem = re.sub('</td>', ph, line)

    # The ticker symbol initial delimiter is different
    # Remove all other remaining HTML data
    stkraw = re.sub('<.*?>', '', rem)

    # Replace unbalanced HTML
    stkraw = re.sub('">', '`', stkraw)

    # Find the placeholders
    dl = [m.start() for m in re.finditer(ph, stkraw)]

    return (stkraw, dl)
If anyone has any input, or perhaps a better method such as BeautifulSoup, I'd really appreciate it! I'm very open to any tutorials that would help as well. My intent is both to improve my programming ability and to have an effective stock screener.
I was having the same issue scraping the Yahoo data in Python, and in Matlab as well. As a workaround, I wrote a macro in VBA to grab all of the EV/EBITDA data from Yahoo by visiting each stock's Key Statistics page. However, it takes about a day to run on all 3,000+ stocks with market caps over $200M, which is not really practical.
I've tried finding EV/EBITDA on various stock screeners online, but they either don't report it or only let you download a couple hundred stocks' data without paying. Busy Stock's screener seems the best in this regard, but their EV/EBITDA figures don't line up with Yahoo's, which worries me that they are using a different methodology.
One solution, and my recommendation to you, is to use the Trending Value algorithm in Quantopian, which is free. You can find the code here: https://www.quantopian.com/posts/oshaugnessy-what-works-on-wall-street
Quantopian will let you backtest the algorithm to 2002, and live test it as well.
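As for the BeautifulSoup idea the asker raises: a generic sketch of label/value extraction from a table row, in place of the character-offset bookkeeping in _parseHtml above. The HTML string here is a stand-in, not Yahoo's actual markup (which has long since changed):

from bs4 import BeautifulSoup

html = '<tr><td>Enterprise Value/EBITDA</td><td>12.34</td></tr>'
soup = BeautifulSoup(html, 'html.parser')
cells = [td.get_text(strip=True) for td in soup.find_all('td')]
if cells and cells[0] == 'Enterprise Value/EBITDA':
    evebitda = float(cells[1])  # parser handles the markup; no regex offsets
    print(evebitda)

The parser absorbs attribute and whitespace variations that break fixed-offset string slicing.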

Data generation incomplete: Python random

I am trying to write a script to generate data, using the random package. I execute the script and everything appears to work fine, but when I check the results, the script has failed to generate the last 100+ rows for some reason.
Can someone suggest why this could be happening?
from __future__ import print_function
from faker import Faker;
import random;

## Value declaration
population = 3;
product = 3;
years = 3;
months = 13;
days = 30;
tax = 3.5;

## Define Column Header
Column_Names = ("Population_ID", ";", "Product_Name", ";", "Product_ID", ";", "Year", ";",
                "Month", ";", "Day", "Quantity_sold", ";", "Sales_Price", ";", "Discount",
                ";", "Actual_Sales_Price", tax);

## Function to generate sales related information
def sales_data():
    for x in range(0, 1):
        quantity_sold = random.randint(5, 20);
        discount = random.choice(range(5, 11));
        sales_price = random.uniform(20, 30);
        return quantity_sold, round(sales_price, 2), discount, round((sales_price)-(sales_price*discount)+(sales_price*tax));

## Format the month to quarter and return the value
def quarter(month):
    if month >= 1 and month <= 3:
        return "Q1";
    elif month > 3 and month <= 6:
        return "Q2";
    elif month > 6 and month <= 9:
        return "Q3";
    else:
        return "Q4";

## Generate product_id
def product_name():
    str2 = "PROD";
    sample2 = random.sample([1,2,3,4,5,6,7,8,9], 5);
    string_list = [];
    for x in sample2:
        string_list.append(str(x));
    return (str2 + ''.join(string_list));

### Main starts here ###
result_log = open("C:/Users/Sangamesh.sangamad/Dropbox/Thesis/Data Preparation/GenData.csv", 'w')
print(Column_Names, result_log);

### Loop and Generate Data ###
for pop in range(0, population):
    pop = random.randint(55000, 85000);
    for prod_id in range(0, product):
        product_name2 = product_name();
        for year in range(1, years):
            for month in range(1, months):
                for day in range(1, 31):
                    a = sales_data();
                    rows = str(pop)+";"+product_name2+";"+str(prod_id)+";"+str(year)+";"+str(month)+";"+quarter(month)+";"+str(day)+";"+str(a[0])+";"+str(a[1])+";"+str(a[2])+";"+str(tax)+";"+str(a[3]);
                    print(rows, file=result_log);
                    #print (rows);
    tax = tax + 1;
You need to close a file to have the buffers flushed:
result_log.close()
Better still, use the file object as a context manager and have the with statement close it for you when the block exits:
filename = "C:/Users/Sangamesh.sangamad/Dropbox/Thesis/Data Preparation/GenData.csv"
with open(filename, 'w') as result_log:
    # code writing to result_log
Rather than manually writing strings with delimiters in between, you should really use the csv module:
import csv
# ..
column_names = (
    "Population_ID", "Product_Name", "Product_ID", "Year",
    "Month", "Day", "Quantity_sold", "Sales_Price", "Discount",
    "Actual_Sales_Price", tax)
# ..
with open(filename, 'wb') as result_log:
    writer = csv.writer(result_log, delimiter=';')
    writer.writerow(column_names)
    # looping
    row = [pop, product_name2, prod_id, year, month, quarter(month), day,
           a[0], a[1], a[2], tax, a[3]]
    writer.writerow(row)
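Putting both suggestions together, a minimal runnable sketch of the pattern; the file name and row values are placeholders, and the loop stands in for the nested generation loops above:

import csv

filename = "GenData.csv"  # placeholder path
column_names = ["Population_ID", "Product_Name", "Quantity_sold"]

# newline='' is the Python 3 csv convention; in Python 2 open in 'wb' instead.
with open(filename, "w", newline="") as result_log:
    writer = csv.writer(result_log, delimiter=";")
    writer.writerow(column_names)
    for pop in range(3):  # stand-in for the nested generation loops
        writer.writerow([pop, "PROD12345", 10])
# Leaving the with-block closes the file, so every buffered row reaches disk.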
