Trying to get data from a table using BeautifulSoup in Python

I'm trying to get the "All Splits" line of numbers from https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame (the HTML is shown in the picture). My code returns the 'All Splits' text instead of the numbers I'm looking for. How do I change the lookups in the GetStats function to get the numbers instead of the first-column descriptors?
import requests
from bs4 import BeautifulSoup
import re
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import csv

urls = []
data = []
for year in range(2003, 2005):
    for page in range(1, 9):
        url = f'http://www.espn.com/nba/hollinger/statistics/_/page/{page}/year/{year}/qualified/false'
        if url is not None:
            urls.append(url)

def GetData(url):
    names_list = []    # names of players
    pers = []          # player efficiency ratings
    playeridlist = []  # list of player ids to be used in making new stats-searchable url
    statsurls = []     # list of urls generated to get player stats
    # makes a pattern for the function to look for
    pattern = re.compile(r'playerId=(\d+)')
    # sets up soup
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'lxml')
    # finds player names and adds them to the list
    names = soup.find(lambda tag: tag.name == 'a' and 'playerId' in tag['href'])
    bodytext = names.text
    names_list.append(bodytext)
    # finds the player's efficiency rating and adds it to the list
    pertag = soup.find('td', class_='sortcell')
    per = pertag.text
    pers.append(per)
    # finds player id
    names = soup.find('a', href=pattern)
    player_id = names['href'].split('playerId=')[1]
    playeridlist.append(player_id)
    # uses player id to make a list of new urls for that player and get stats
    for player_id in playeridlist:
        statsurl = f"https://insider.espn.com/nba/player/splits/_/id/{player_id}/type/nba/year/{year}/category/perGame"
        if statsurl is not None:
            statsurls.append(statsurl)

    # parses stats to get stats
    def GetStats(statsurl):  # GO BACK AND MAKE A THREAD EXECUTER STATEMENT WITHIN GETDATA FUNCTION BELOW THIS!!!
        statsreq = requests.get(statsurl)
        statssoup = BeautifulSoup(statsreq.text, 'lxml')
        focusing_search = statssoup.find('tr', class_='Table__TR Table__TR--sm Table__even', attrs={'data-idx': '1'})
        playerstathtml = focusing_search.find('td', class_='Table__TD')
        stat_values = [playerstats.text for playerstats in playerstathtml]
        print(stat_values)

    GetStats("https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame")
    # name_and_stats_list = dict(map(lambda i, j: (i, j), names_list, pers))
    print(f"{bodytext}: {per}")
    print(player_id)

GetData('http://www.espn.com/nba/hollinger/statistics/_/page/1/year/2003/qualified/false')

To get the all_splits stats from:
https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame
This is what I did:
I grabbed the table body using soup.select
Then I grabbed the headings and relevant stats by iterating through the columns/rows.
The list comprehension provides the text in list format, which is easy to convert to a dataframe.
Code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame'
soup = BeautifulSoup(requests.get(url).content, "html.parser")
t = soup.select('main#fittPageContainer div.Table__Scroller > table > tbody')
headings = [h.text for h in t[0].find_next('tr').find_all('td')]
all_splits = [h.text for h in t[0].find_all('tr')[1].find_all('td')]
df = pd.DataFrame([all_splits], columns=headings)
print(df)
Output: a one-row DataFrame containing the All Splits per-game numbers, with one column per table heading.
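If you want to fold this back into your own GetStats function, here is a minimal sketch, assuming the page layout is the same as above and that GetStats takes a player id and a year instead of a full url (both of those are my assumptions, not part of the original answer):
import requests
from bs4 import BeautifulSoup
import pandas as pd

def GetStats(player_id, year):
    # build the splits url from the player id and year (hypothetical signature)
    statsurl = (f"https://insider.espn.com/nba/player/splits/_/id/{player_id}"
                f"/type/nba/year/{year}/category/perGame")
    soup = BeautifulSoup(requests.get(statsurl).content, "html.parser")
    # same selector as in the answer above: the table body inside the scrollable table
    t = soup.select('main#fittPageContainer div.Table__Scroller > table > tbody')
    headings = [h.text for h in t[0].find_next('tr').find_all('td')]
    all_splits = [h.text for h in t[0].find_all('tr')[1].find_all('td')]
    return pd.DataFrame([all_splits], columns=headings)

# same player and year as in the question
print(GetStats(532, 2003))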

Related

web-scrape: get H4 attributes & href

I am trying to web-scrape a website, but I can't get access to the attributes of some fields.
Here is the code I used:
import urllib3
from bs4 import BeautifulSoup
import pandas as pd

scrap_list = pd.DataFrame()
for path in range(10):  # scroll over the categories
    for path in range(10):  # scroll over the pages
        url = 'https://www.samehgroup.com/index.php?route=product/category'+str(page)+'&'+'path='+ str(path)
        req = urllib3.PoolManager()
        res = req.request('GET', URL)
        soup = BeautifulSoup(res.data, 'html.parser')
        soup.findAll('h4', {'class': 'caption'})
        # extract names
        scrap_name = [i.text.strip() for i in soup.findAll('h2', {'class': 'caption'})]
        scrap_list['product_name'] = pd.DataFrame(scrap_name, columns=['Item_name'])
        # extract prices
        scrap_list['product_price'] = [i.text.strip() for i in soup.findAll('div', {'class': 'price'})]
        product_price = pd.DataFrame(scrap_price, columns=['Item_price'])
I want an output that provides me with each product and its price. I still can't get that right.
Any help would be very much appreciated.
I think the problem here was looping through the website pages. I got the code below working by first making a list of urls containing numbered 'paths' corresponding to pages on the website, and then looping through this list while appending a page number to each url.
If you only want the products from a certain page, that page's url can be selected from urlist by index (see the short sketch after the code below).
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

urlist = []  # create list of usable urls to iterate through
for i in range(1, 10):  # 9 pages, equal to pages on website
    urlist.append('https://www.samehgroup.com/index.php?route=product/category&path=' + str(i))

namelist = []
newprice = []

for urlunf in urlist:  # first loop to get 'path'
    for n in range(100):  # second loop to get 'pages'; set at 100 to cover website max page at 93
        try:  # try catches when pages containing products run out
            url = urlunf + '&page=' + str(n)
            page = requests.get(url).text
            soup = BeautifulSoup(page, 'html')
            products = soup.find_all('div', class_='caption')
            for prod in products:  # loops over returned list of products for names and prices
                name = prod.find('h4').text
                newp = prod.find('p', class_='price').find('span', class_='price-new').text
                namelist.append(name)  # append data to lists outside of the loop
                newprice.append(newp)
            time.sleep(2)
        except AttributeError:  # if there are no more products it will move to the next page
            pass

df = pd.DataFrame()  # create df and add scraped data
df['name'] = namelist
df['price'] = newprice
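If you only need one particular category and page, a small usage sketch (reusing the parsing from the loop above, with index 0 and page 1 as arbitrary examples, not values from the original answer):
single_url = urlist[0] + '&page=1'  # first category path, first page
soup = BeautifulSoup(requests.get(single_url).text, 'html')
for prod in soup.find_all('div', class_='caption'):
    # print each product name with its new price
    print(prod.find('h4').text, prod.find('p', class_='price').find('span', class_='price-new').text)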

Web scraping with bs4 python: How to display football matchups

I'm a beginner in Python and am trying to create a program that will scrape the football/soccer schedule from skysports.com and send it via SMS to my phone through Twilio. I've excluded the SMS code because I have that figured out; here's the web scraping code I am getting stuck with so far:
import requests
from bs4 import BeautifulSoup
from collections import defaultdict

URL = "https://www.skysports.com/football-fixtures"
page = requests.get(URL)
results = BeautifulSoup(page.content, "html.parser")

d = defaultdict(list)
comp = results.find('h5', {"class": "fixres__header3"})
team1 = results.find('span', {"class": "matches__item-col matches__participant matches__participant--side1"})
date = results.find('span', {"class": "matches__date"})
team2 = results.find('span', {"class": "matches__item-col matches__participant matches__participant--side2"})

for ind in range(len(d)):
    d['comp'].append(comp[ind].text)
    d['team1'].append(team1[ind].text)
    d['date'].append(date[ind].text)
    d['team2'].append(team2[ind].text)
The code below should do the trick for you:
from bs4 import BeautifulSoup
import requests

a = requests.get('https://www.skysports.com/football-fixtures')
soup = BeautifulSoup(a.text, features="html.parser")

teams = []
for date in soup.find_all(class_="fixres__header2"):  # searching in that date
    for i in soup.find_all(class_="swap-text--bp30")[1:]:  # skips the first one because that's a heading
        teams.append(i.text)

date = soup.find(class_="fixres__header2").text
print(date)

teams = [i.strip('\n') for i in teams]
for x in range(0, len(teams), 2):
    print(teams[x] + " vs " + teams[x + 1])
Let me further explain what I have done:
All the football teams share the class name swap-text--bp30,
so we can use find_all to extract every tag with that class.
Once we have our results we can put them into a list (teams = []) and append them in a for loop (teams.append(i.text)); .text strips out the HTML.
Then we get rid of the "\n" characters in the list by stripping them, and print the strings two at a time.
This should be your final output: the date heading followed by each matchup printed as "Team A vs Team B", one per line.
EDIT: To scrape the title of the leagues we will do pretty much the same:
league = []
for date in soup.find_all(class_="fixres__header2"):  # searching in that date
    for i in soup.find_all(class_="fixres__header3"):
        league.append(i.text)
Strip the list and create another one:
league = [i.strip('\n') for i in league]
final = []
Then add this final bit of code which is essentially just printing the league then the two teams over and over:
for x in range(0, len(teams), 5):
    final.append(teams[x] + " vs " + teams[x + 1])

for i in league:
    print(i)
for i in final:
    print(i)
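Since the end goal in the question is to send this schedule by SMS, one small follow-up sketch (assuming the league and final lists built above) is to join everything into a single string that can be handed to the existing Twilio code:
# one league heading per line, followed by the paired fixtures
message = "\n".join(league + final)
print(message)  # pass this string to your existing SMS-sending code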

Optimize scraping through list of urls and write to csv

With a csv of 20k+ urls I want to scrape each page and look for the html element "super-attribute-select". If found, write the url to column A and the product number (sku) to column B. If not found, write the url to column C and the sku to column D. Finally, save the dataframe to a csv file.
If I run the following code it works, but my program runs out of memory. I'd like to find a way to optimize this: right now ~1,500 urls take 5 hours to process, and the entire csv is 20k.
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas import Series

urlList = pd.read_csv(r"url.csv")
urlList = urlList.url.tolist()

notfound = []
found = []
skulist = []
skumissinglist = []

# Function scrape: pass url, open with soup, and find class
def scrape(url):
    tag = 'select'
    classused = "super-attribute-select"
    d = dict(A=np.array(found), B=np.array(skulist), C=np.array(notfound), D=np.array(skumissinglist))
    try:
        content = urllib.request.urlopen(url)
        soup = BeautifulSoup(content, features="html.parser")
        sku = soup.find("div", {"itemprop": "sku"}).string
        result = soup.find(tag, class_=classused)
        # soup returns None if it can't find anything
        if result == None:
            notfound.append(url)
            skumissinglist.append(sku)
        else:
            found.append(url)
            skulist.append(sku)
    except:
        result = print("Some extraction went wrong")
    df = pd.DataFrame(dict([(k, Series(v)) for k, v in d.items()]))
    df = df.to_csv('Test.csv')

for i in urlList:
    scrape(i)
If I were doing this, I would try a few things:
(1) Update a dictionary instead of appending to a list. I think dictionaries are faster and more memory-efficient than lists.
(2) Rather than export each URL result as a CSV with the same name, either (a) preferred: wait until you are done to export all results as a single CSV, or (b) worse: maybe export them to different filenames by using f-strings instead of overwriting 'Test.csv' every time.
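A minimal sketch along the lines of (1) and (2a), assuming the same urlList, element lookups, and output columns as in the question:
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd

# accumulate everything in one dict of lists and write the CSV once at the end
results = {'found_url': [], 'found_sku': [], 'missing_url': [], 'missing_sku': []}

def scrape(url):
    soup = BeautifulSoup(urllib.request.urlopen(url), features="html.parser")
    sku = soup.find("div", {"itemprop": "sku"}).string
    if soup.find("select", class_="super-attribute-select") is None:
        results['missing_url'].append(url)
        results['missing_sku'].append(sku)
    else:
        results['found_url'].append(url)
        results['found_sku'].append(sku)

urlList = pd.read_csv("url.csv").url.tolist()
for url in urlList:
    try:
        scrape(url)
    except Exception as e:
        print(f"Extraction went wrong for {url}: {e}")

# single write at the end instead of rewriting Test.csv for every url
pd.DataFrame({k: pd.Series(v) for k, v in results.items()}).to_csv('Test.csv', index=False)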
You could use a pool, either with gevent or the built-in one from urllib3 (or requests). Then you could do 10 or 100 at a time, depending on the pool size, and use an async queue to pick up the remaining urls as the pool frees up.
from gevent import monkey, spawn, joinall
monkey.patch_all()
from gevent.pool import Pool as GeventPool
import pandas as pd
from pandas import Series
import numpy as np
import requests
from bs4 import BeautifulSoup

urlList = pd.read_csv(r"url.csv")
urlList = urlList.url.tolist()

pool = GeventPool(10)
notfound = []
found = []
skulist = []
skumissinglist = []
count = len(urlList)

# Function scrape: pass url, open with soup, and find class
def scrape(url):
    tag = 'select'
    classused = "super-attribute-select"
    d = dict(A=np.array(found), B=np.array(skulist), C=np.array(notfound), D=np.array(skumissinglist))
    try:
        content = requests.get(url).text
        soup = BeautifulSoup(content, features="html.parser")
        sku = soup.find("div", {"itemprop": "sku"}).string
        result = soup.find(tag, class_=classused)
        # soup returns None if it can't find anything
        if result is None:
            notfound.append(url)
            skumissinglist.append(sku)
        else:
            found.append(url)
            skulist.append(sku)
    except Exception:
        print("Some extraction went wrong")
    df = pd.DataFrame(dict([(k, Series(v)) for k, v in d.items()]))
    return df.to_csv('Test.csv')

pool.map(scrape, urlList)
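If you would rather stay in the standard library than use gevent, a rough alternative (my substitution, not part of the answer above) is a concurrent.futures thread pool driving a scrape function like the one above but trimmed to only append to the lists (no per-call to_csv), with the CSV written once at the end:
from concurrent.futures import ThreadPoolExecutor

# run up to 10 requests concurrently; scrape() appends to the shared lists as before
with ThreadPoolExecutor(max_workers=10) as executor:
    executor.map(scrape, urlList)

# build the frame and write the CSV once, after all workers have finished
d = dict(A=pd.Series(found), B=pd.Series(skulist), C=pd.Series(notfound), D=pd.Series(skumissinglist))
pd.DataFrame(d).to_csv('Test.csv', index=False)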

Saving multiple data frames from loop

I have been searching for a solution to my problem, but all the answers I find use print() at the end instead of saving the data frames as I would like to.
Below I have an (almost) functioning piece of code that prints 3 separate tables. How do I save these three tables in 3 separate data frames with the names matches_october, matches_november and matches_december?
The last line in my code is not working as I want it to. I hope it is clear what I would like the code to do (save a data frame at the end of each of the 3 rounds of the loop).
import pandas as pd
import requests
from bs4 import BeautifulSoup

base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october','november','december']
end = '.html'

for i in valid_pages:
    url = '{}{}{}'.format(base_url, i, end)
    res = requests.get(url)
    soup = BeautifulSoup(res.content,'lxml')
    table = soup.find_all('table')[0]
    df = pd.read_html(str(table))
    print(df)
    matches + valid_pages = df[0]
You can special-case each month, but that's not very robust (and it's rather ugly):
if i == 'october':
    matches_october = pd.read_html(str(table))
if i == 'november':
    ...  # so on and so forth
A more elegant solution is to use a dictionary. Before the loop, declare matches = {}. Then, in each iteration:
matches[i] = pd.read_html(str(table))
Then you can access the October matches DataFrame via matches['october'].
You can't compose variable names using +, try using a dict instead:
import pandas as pd
import requests
from bs4 import BeautifulSoup

matches = {}  # create an empty dict
base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october','november','december']
end = '.html'

for i in valid_pages:
    url = '{}{}{}'.format(base_url, i, end)
    res = requests.get(url)
    soup = BeautifulSoup(res.content,'lxml')
    table = soup.find_all('table')[0]
    df = pd.read_html(str(table))
    print(df)
    matches[i] = df[0]  # store it in the dict
Thanks guys. That worked! :)
import pandas as pd
import requests
from bs4 import BeautifulSoup

matches = {}  # create an empty dict
base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october','november','december']
end = '.html'

for i in valid_pages:
    url = '{}{}{}'.format(base_url, i, end)
    res = requests.get(url)
    soup = BeautifulSoup(res.content,'lxml')
    table = soup.find_all('table')[0]
    df = pd.read_html(str(table))
    matches[i] = df[0]  # store it in the dict

matches_october = matches['october']

Creating Large DataFrame from smaller DataFrames

I am having an issue with the structure of the data as I get it off the PGA website. I have trouble putting the data into a dataframe and merging it so that I can use the dataframe for analysis later. The dimensions of the scraped data are never right, and I get a different error each time I run the code that I can't seem to reconcile.
I have tried merging and concatenating dataframes but nothing seems to work. Any help is appreciated.
I would really like my dataframe to contain the individual statistics from the separate pages on the same row as the other data, keyed by year and PLAYER NAME.
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import socket
import urllib.error
import pandas as pd
import urllib
import sqlalchemy
import numpy as np
import functools

base = 'http://www.pgatour.com/'
inn = 'stats/stat'
end = '.html'
years = ['2017','2016']
alpha = []
# all pages with links to tables
urls = ['http://www.pgatour.com/stats.html',
        'http://www.pgatour.com/stats/categories.ROTT_INQ.html',
        'http://www.pgatour.com/stats/categories.RAPP_INQ.html',
        'http://www.pgatour.com/stats/categories.RARG_INQ.html',
        'http://www.pgatour.com/stats/categories.RPUT_INQ.html',
        'http://www.pgatour.com/stats/categories.RSCR_INQ.html',
        'http://www.pgatour.com/stats/categories.RSTR_INQ.html',
        'http://www.pgatour.com/stats/categories.RMNY_INQ.html',
        'http://www.pgatour.com/stats/categories.RPTS_INQ.html']
for i in urls:
    data = urlopen(i)
    soup = BeautifulSoup(data, "html.parser")
    for link in soup.find_all('a'):
        if link.has_attr('href'):
            alpha.append(base + link['href'][17:])  # may need adjusting

# data links
beta = []
for i in alpha:
    if inn in i:
        beta.append(i)

gamma = []
for i in beta:
    if i not in gamma:
        gamma.append(i)

jan = []
for i in gamma:
    try:
        data = urlopen(i)
        soup = BeautifulSoup(data, "html.parser")
        for table in soup.find_all('section', {'class': 'module-statistics-off-the-tee-details'}):
            for j in table.find_all('h3'):
                y = j.get_text().replace(" ","").replace("-","").replace(":","").replace(">","").replace("<","").replace(">","").replace(")","").replace("(","").replace("=","").replace("+","")
                jan.append([i, str(y + '.csv')])
                print([i, str(y + '.csv')])
    except Exception as e:
        print(e)
        pass

# my problem starts here
# using a short urls list so that I can find the error faster
urls = [['http://www.pgatour.com/stats/stat.02356.html', 'd'],
        ['http://www.pgatour.com/stats/stat.02568.html', 'f'],
        ['http://www.pgatour.com/stats/stat.111.html', 'r']]
list = []
master = pd.DataFrame()
# jan = [['http://www.pgatour.com/stats/stat.02356.html', 'Last15EventsScoring.csv']]
# make a list with url and title name and cleaned csv name
# write to csv
row_sp = []
rows_sp = []
title1 = []
title = []
for i in urls:
    try:
        for y in years:
            data = urlopen(i[0][:-4] + y + end)
            soup = BeautifulSoup(data, "html.parser")
            data1 = urlopen(i[0])
            soup1 = BeautifulSoup(data1, "html.parser")
            for table in soup1.find_all('table', {'id': 'statsTable'}):
                title.append('year')
                for k in table.find_all('tr'):
                    for n in k.find_all('th'):
                        title1.append(n.get_text())
                for l in title1:
                    if l not in title:
                        title.append(l)
            rows_sp.append(title)
            for table in soup.find_all('table', {'id': 'statsTable'}):
                for h in table.find_all('tr'):
                    row_sp = [y]
                    for j in h.find_all('td'):
                        row_sp.append(j.get_text().replace(" ","").replace("\n","").replace("\xa0"," "))
                    rows_sp.append(row_sp)
            df = pd.DataFrame(rows_sp)
            df.columns = title
            df.drop(df.index[1], inplace=True)
            print(df)
            list.append(df)
    except Exception as e:
        print(e)
        pass

df_merge = functools.reduce(lambda left, right: pd.merge(left, right, on=['year','PLAYER NAME'], how='outer'), list)
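To make it easier to see what that final reduce/merge line is supposed to do, here is a small self-contained sketch of the same pattern on two toy DataFrames; the stat column names and numbers are placeholders for illustration only, not real PGA data:
import functools
import pandas as pd

# two toy stat tables that both carry the merge keys 'year' and 'PLAYER NAME'
driving = pd.DataFrame({'year': ['2016', '2017'],
                        'PLAYER NAME': ['Player A', 'Player A'],
                        'AVG. DISTANCE': [290.1, 295.3]})
putting = pd.DataFrame({'year': ['2016', '2017'],
                        'PLAYER NAME': ['Player A', 'Player A'],
                        'PUTTS PER ROUND': [29.0, 28.5]})

frames = [driving, putting]
merged = functools.reduce(
    lambda left, right: pd.merge(left, right, on=['year', 'PLAYER NAME'], how='outer'),
    frames)
print(merged)  # one row per (year, PLAYER NAME), stat columns side by side
For the real data this means every DataFrame appended to the list must contain both a 'year' column and a 'PLAYER NAME' column, or the merge will raise a KeyError.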
