Creating Large DataFrame from smaller DataFrames - python

I am having an issue with the structure of data as I get it off the PGA website. I have trouble putting the data into a dataframe and merging the data so that I can use the dataframe for analysis later. The dimensions of the scraped data are never right. I get a separate error each time I run the code that I cant seem to reconcile.
I have tried merging and concatenating dataframes but nothing seems to work. ANy help is appreciated
I would really like for my dataframe to contain the individual statistics from the separate sites but on the same row as the other data formatted by the year and PLAYER NAME.
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import socket
import urllib.error
import pandas as pd
import urllib
import sqlalchemy
import numpy as np
import functools
base = 'http://www.pgatour.com/'
inn = 'stats/stat'
end = '.html'
years = ['2017','2016']
alpha = []
#all pages with links to tables
urls = ['http://www.pgatour.com/stats.html','http://www.pgatour.com/stats/categories.ROTT_INQ.html','http://www.pgatour.com/stats/categories.RAPP_INQ.html','http://www.pgatour.com/stats/categories.RARG_INQ.html','http://www.pgatour.com/stats/categories.RPUT_INQ.html','http://www.pgatour.com/stats/categories.RSCR_INQ.html','http://www.pgatour.com/stats/categories.RSTR_INQ.html','http://www.pgatour.com/stats/categories.RMNY_INQ.html','http://www.pgatour.com/stats/categories.RPTS_INQ.html']
for i in urls:
data = urlopen(i)
soup = BeautifulSoup(data, "html.parser")
for link in soup.find_all('a'):
if link.has_attr('href'):
alpha.append(base + link['href'][17:]) #may need adjusting
#data links
beta = []
for i in alpha:
if inn in i:
beta.append(i)
gamma = []
for i in beta:
if i not in gamma:
gamma.append(i)
jan = []
for i in gamma:
try:
data = urlopen(i)
soup = BeautifulSoup(data, "html.parser")
for table in soup.find_all('section',{'class':'module-statistics-off-the-tee-details'}):
for j in table.find_all('h3'):
y=j.get_text().replace(" ","").replace("-","").replace(":","").replace(">","").replace("<","").replace(">","").replace(")","").replace("(","").replace("=","").replace("+","")
jan.append([i,str(y+'.csv')])
print([i,str(y+'.csv')])
except Exception as e:
print(e)
pass
#my problem starts here
#using urls list so that I can find error faster
urls = [['http://www.pgatour.com/stats/stat.02356.html','d']
,['http://www.pgatour.com/stats/stat.02568.html','f']
,['http://www.pgatour.com/stats/stat.111.html','r']]
list = []
master = pd.DataFrame()
#jan = [['http://www.pgatour.com/stats/stat.02356.html', 'Last15EventsScoring.csv']]
#make a list with url and title name and cleaned csv name
#write to csv
row_sp = []
rows_sp =[]
title1 = []
title = []
for i in urls:
try:
for y in years:
data = urlopen(i[0][:-4] +y+ end)
soup = BeautifulSoup(data, "html.parser")
data1 = urlopen(i[0])
soup1 = BeautifulSoup(data1, "html.parser")
for table in soup1.find_all('table',{'id':'statsTable'}):
title.append('year')
for k in table.find_all('tr'):
for n in k.find_all('th'):
title1.append(n.get_text())
for l in title1:
if l not in title:
title.append(l)
rows_sp.append(title)
for table in soup.find_all('table',{'id':'statsTable'}):
for h in table.find_all('tr'):
row_sp = [y]
for j in h.find_all('td'):
row_sp.append(j.get_text().replace(" ","").replace("\n","").replace("\xa0"," "))
rows_sp.append(row_sp)
df=pd.DataFrame(rows_sp)
df.columns = title
df.drop(df.index[1],inplace = True)
print(df)
list.append(df)
except Exception as e:
print(e)
pass
df_merge = functools.reduce(lambda left,right: pd.merge(left,right,on=['year','PLAYER NAME'], how='outer'), list)

Related

Trying to get data from a table using beautifulsoup in python

Trying to get the "all splits" line of numbers from https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame (html code is in the picture) my code returns the 'all splits' text instead of the numbers I'm looking for. How do I go about changing the lookups in the GetStats function area to get the numbers instead of the first column descriptors.
import requests
from bs4 import BeautifulSoup
import re
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import csv
urls = []
data = []
for year in range(2003, 2005):
for page in range(1, 9):
url = f'http://www.espn.com/nba/hollinger/statistics/_/page/{page}/year/{year}/qualified/false'
if url is not None:
urls.append(url)
def GetData(url):
names_list = [] # names of players
pers = [] # player efficency ratings
playeridlist = [] # list of player ids to be used in making new stats searchable url
statsurls = [] # list of urls generated to get player stats
# makes a pattern for the function to look for
pattern = re.compile('playerId=(\d+)')
# setsup soup function
req = requests.get(url)
soup = BeautifulSoup(req.text, 'lxml')
# finds players names and adds to list
names = soup.find(lambda tag: tag.name == 'a' and 'playerId' in tag['href'])
bodytext = names.text
names_list.append(bodytext)
# finds plays player efficency rating and adds to list
pertag = soup.find('td', class_='sortcell')
per = pertag.text
pers.append(per)
# finds player id
names = soup.find('a', href=pattern)
player_id = names['href'].split('playerId=')[1]
playeridlist.append(player_id)
# uses player id to make a list of new urls for that player and get stats
for player_id in playeridlist:
statsurl = f"https://insider.espn.com/nba/player/splits/_/id/{player_id}/type/nba/year/{year}/category/perGame"
if statsurl is not None:
statsurls.append(statsurl)
# parses stats to get stats
def GetStats(statsurl): # GO BACK AND MAKE A THREAD EXECUTER STATEMENT WITHIN GETDATA FUNCTION BELOW THIS!!!
statsreq = requests.get(statsurl)
statssoup = BeautifulSoup(statsreq.text, 'lxml')
focusing_search = statssoup.find('tr', class_='Table__TR Table__TR--sm Table__even', attrs={'data-idx': '1'})
playerstathtml = focusing_search.find('td', class_='Table__TD')
stat_values = [playerstats.text for playerstats in playerstathtml]
print(stat_values)
GetStats("https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame")
#name_and_stats_list = dict(map(lambda i, j: (i, j), names_list, pers))
print(f"{bodytext}: {per}")
print(player_id)
GetData('http://www.espn.com/nba/hollinger/statistics/_/page/1/year/2003/qualified/false')
To get the all_splits stats from:
https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame
This is what I did:
I grabbed the table body using soup.select
Then I grabbed the headings and relevant stats by iterating through the columns/rows.
The list comprehension provides the text in list format, which is easy to convert to a dataframe.
Code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame'
soup = BeautifulSoup(requests.get(url).content, "html.parser")
t = soup.select('main#fittPageContainer div.Table__Scroller > table > tbody')
headings = [h.text for h in t[0].find_next('tr').find_all('td')]
all_splits = [h.text for h in t[0].find_all('tr')[1].find_all('td')]
df = pd.DataFrame([all_splits], columns=headings)
print(df)
Output:

BeautifulSoup: Scraping CSV list of URLs

I have been trying to download data from different urls and then save it to a csv file.
The idea is extract the highlighted data from: https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow
So far I built the following piece of code:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
url_is = 'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow'
read_data = ur.urlopen(url_is).read()
soup_is=BeautifulSoup(read_data, 'lxml')
row = soup_is.select_one('tr.mainRow>td.rowTitle:contains("Cash Dividends Paid - Total")')
data=[cell.text for cell in row.parent.select('td') if cell.text!='']
df=pd.DataFrame(data)
print(df.T)
I get as an output:
All good so far.
Now my idea is to extract specific classes from multiple URLs, keep the same headers from the website and export it to a .csv.
The tags and classes stay the same
Sample URLs:
https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow
https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow
Code (I wanted to try with 2 columns: 2015 and 2016)
As desidered ouput I would like something like:
I wrote the following code, but is giving me issues, any help or advice is welcome:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
import numpy as np
import requests
links = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow', 'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']
container = pd.DataFrame(columns=['Name', 'Name2'])
pos=0
for l in links:
read_data = ur.urlopen(l).read()
soup_is=BeautifulSoup(read_data, 'lxml')
row = soup_is.select_one('tr.mainRow>td.rowTitle:contains("Cash Dividends Paid - Total")')
results=[cell.text for cell in row.parent.select('td') if cell.text!='']
records = []
for result in results:
records = []
Name = result.find('span', attrs={'itemprop':'2015'}).text if result.find('span', attrs={'itemprop':'2015'}) is not None else ''
Name2 = result.find('span', attrs={'itemprop':'2016'}).text if result.find('span', attrs={'itemprop':'2016'}) is not None else ''
records.append(Name)
records.append(Name2)
container.loc[pos] = records
pos+=1
import requests
import pandas as pd
urls = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow',
'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']
def main(urls):
with requests.Session() as req:
goal = []
for url in urls:
r = req.get(url)
df = pd.read_html(
r.content, match="Cash Dividends Paid - Total")[0].iloc[[0], 0:3]
goal.append(df)
new = pd.concat(goal)
print(new)
main(urls)

Saving multiple data frames from loop

I have been searching for a solution to my problem, but all answers I find uses print() at the end of the answer, and NOT saving the data frames as I would like to.
Below I have a (almost) functioning code that prints 3 seperate tables. How do I save these three tables in 3 seperate data frames with the names matches_october, matches_november and matches_december?
The last line in my code is not working as I want it to work. I hope it is clear what I would like the code to do (Saving a data frame at the end of each of the 3 rounds in the loop)
import pandas as pd
import requests
from bs4 import BeautifulSoup
base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october','november','december']
end = '.html'
for i in valid_pages:
url = '{}{}{}'.format(base_url, i, end)
res = requests.get(url)
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))
print(df)
matches + valid_pages = df[0]
You can case it, but that's not very robust (and it's rather ugly).
if i == 'october':
matches_october = pd.read_html(str(table))
if i == 'november':
# so on and so forth
A more elegant solution is to use a dictionary. Before the loop, declare matches = {}. Then, in each iteration:
matches[i] = pd.read_html(str(table))
Then you can access the October matches DataFrame via matches['october'].
You can't compose variable names using +, try using a dict instead:
import pandas as pd
import requests
from bs4 import BeautifulSoup
matches = {} # create an empty dict
base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october','november','december']
end = '.html'
for i in valid_pages:
url = '{}{}{}'.format(base_url, i, end)
res = requests.get(url)
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))
print(df)
matches[i] = df[0] # store it in the dict
Thanks guys. That worked! :)
import pandas as pd
import requests
from bs4 import BeautifulSoup
matches = {} # create an empty dict
base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october','november','december']
end = '.html'
for i in valid_pages:
url = '{}{}{}'.format(base_url, i, end)
res = requests.get(url)
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))
matches[i] = df[0] # store it in the dict
matches_october = matches['october']

Web-Scraping Python, Indexing Issue for DataFrame

I'm working on a web-scraper for Spotify Charts to extract the top 200 daily songs each day. I have done everything to extract the data I'm interested in including rank, artist, track title, and stream numbers. What I'm stuck on is putting everything into a DataFrame to export as a CSV to excel. Right now when I print my DataFrame, it is treating each cycle as 1 row with 4 columns as opposed to 200 rows with 4 columns.
I'm not sure what the issue is as I've tried just about everything and looked into it as much as I could. I know something is wrong with the indexing because each "what should be a row" has the same first "0" index, when they should go sequential to 199. Also, the column names for my DataFrame keep repeating after each "what should be a row", so I know there is definitely an issue there.
import requests
from bs4 import BeautifulSoup
from datetime import date, timedelta
from time import time
from time import sleep
from random import randint
import pandas as pd
import numpy as np
base_url = 'https://spotifycharts.com/regional/global/daily/'
r = requests.get(base_url)
soup = BeautifulSoup(r.text, 'html.parser')
chart = soup.find('table', {'class': 'chart-table'})
tbody = chart.find('tbody')
for tr in tbody.find_all('tr'):
rank_text = []
rank_text_elem = tr.find('td', {'class': 'chart-table-
position'})
for item in rank_text_elem:
rank_text = []
rank_text.append(item)
artist_text = []
artist_text_elem = tr.find('td', {'class': 'chart-table-
track'}).find_all('span')
for item in artist_text_elem:
artist_text = []
artist_text.append(item.text.replace('by ','').strip())
title_text = []
title_text_elem = tr.find('td', {'class': 'chart-table-
track'}).find_all('strong')
for item in title_text_elem:
title_text = []
title_text.append(item.text)
streams_text = []
streams_text_elem = tr.find('td', {'class': 'chart-table-streams'})
for item in streams_text_elem:
streams_text = []
streams_text.append(item)
# creating dataframe to store 4 variables
list_of_data = list(zip(rank_text, artist_text, title_text,
streams_text))
df = pd.DataFrame(list_of_data, columns =
['Rank','Artist','Title','Streams'])
print(df)
Basically, I'm trying to create a dataframe to hold 4 variables in each row for 200 rows for each date of spotify global charts. Please ignore some of the modules and libraries I've included at the top, they are used for iterating through each page of the historical data based on dynamic urls which I have already figured out. Any help is greatly appreciated! Thank you!
Before for loop I create list all_rows.
Inside for loop I add list with single row of data to all_rows.
After for loop I use all_rows to create DataFrame
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = 'https://spotifycharts.com/regional/global/daily/'
r = requests.get(base_url)
soup = BeautifulSoup(r.text, 'html.parser')
chart = soup.find('table', {'class': 'chart-table'})
tbody = chart.find('tbody')
all_rows = []
for tr in tbody.find_all('tr'):
rank_text = tr.find('td', {'class': 'chart-table-position'}).text
artist_text = tr.find('td', {'class': 'chart-table-track'}).find('span').text
artist_text = artist_text.replace('by ','').strip()
title_text = tr.find('td', {'class': 'chart-table-track'}).find('strong').text
streams_text = tr.find('td', {'class': 'chart-table-streams'}).text
all_rows.append( [rank_text, artist_text, title_text, streams_text] )
# after `for` loop
df = pd.DataFrame(all_rows, columns=['Rank','Artist','Title','Streams'])
print(df.head())
You could use pandas and requests
import pandas as pd
import requests
headers = {'User-Agent': 'Mozilla/5.0'}
url ='https://spotifycharts.com/regional/global/daily/'
r = requests.get(url, headers = headers).content
table = pd.read_html(r)[0] #transfer html to pandas
table.dropna(axis = 1, how = 'all', inplace = True) #drop nan column
table[['Title','Artist']] = table['Unnamed: 3'].str.split(' by ',expand=True) #split title artist strings into two columns
del table['Unnamed: 3'] #remove combined column
table = table[['Track', 'Artist','Title', 'Unnamed: 4']] #re-order cols
table.columns= ['Rank', 'Artist','Title', 'Streams'] #rename cols
print(table)

Python downloaded data in table form

This is the code that is downloading data from table and output that on cmd. I want to know if the same data can be downloaded in the same structure of table like in rows and columns?
This is what i have tried.
code:
import urllib
import re
from urlparse import urlparse
from bs4 import BeautifulSoup as bs
urls = ["http://physics.iitd.ac.in/content/list-faculty-members", "http://www.iitkgp.ac.in/commdir3/list.php?division=3&deptcode=ME","http://www.iitkgp.ac.in/commdir3/list.php?division=3&deptcode=CE"]
i = 0
while i< len(urls):
htmlfile = urllib.urlopen(urls[i])
htmltext = htmlfile.read()
soup = bs(htmltext)
tables = soup.find_all('table', attrs = {'border': '0' , 'width' : '100%' , 'cellpadding': '10'})
head = soup.find_all('h2' , attrs = {'class' : 'title style3'})
ree = tables.find_all('tr')
hea = head.find_all('big').find_all('strong')
datasets = []
q = []
s = []
t = hea.get_text()
q.append(t)
for b in ree:
x = [td.get_text() for td in b.find_all('td')]
dataset = [strong.get_text() for strong in b.find('td').find('a').find_all('strong')]
datasets.append(dataset)
q.append(x)
print q
i+=1
I think many people would recommend the use of the pandas library when working with tabular data. For well structured HTML, you can just blindly use pandas read_html.
import pandas as pd
tables = pd.read_html("http://physics.iitd.ac.in/content/list-faculty-members")
dataframe = tables[0]

Categories