In the code sample below, 3 of the 5 elements I am attempting to scrape return values as expected. 2 (goals_scored and assists) return no values. I have verified that the data does exist on the web page and that I am using the correct attribute, but I am not sure why the results are not being returned. Is there something obvious I am overlooking?
import sys
from bs4 import BeautifulSoup as bs
import urllib2
import datetime as dt
import time
import pandas as pd
proxy_support = urllib2.ProxyHandler({})
opener = urllib2.build_opener(proxy_support)
player_name=[]
club =[]
position = []
goals_scored = []
assists = []
for p in range(25):
    player_url = 'http://www.mlssoccer.com/stats/season?page={p}&franchise=select&year=2017&season_type=REG&group=goals'.format(
        p=p)
    page = opener.open(player_url).read()
    player_soup = bs(page, "lxml")
    print >>sys.stderr, '[{time}] Running page {n}...'.format(
        time=dt.datetime.now(), n=p)
    length = len(player_soup.find('tbody').findAll('tr'))
    for row in range(0, length):
        try:
            name = player_soup.find('tbody').findAll('td', attrs={'data-title': 'Player'})[row].find('a').contents[0]
            player_name.append(name)
            team = player_soup.find('tbody').findAll('td', attrs={'data-title': 'Club'})[row].contents[0]
            club.append(team)
            pos = player_soup.find('tbody').findAll('td', attrs={'data-title': 'POS'})[row].contents[0]
            position.append(pos)
            goals = player_soup.find('tbody').findAll('td', attrs={'data-title': 'G', 'class': 'responsive'})[row].contents[0]
            goals_scored.apppend(goals)
            a = player_soup.find('tbody').findAll('td', attrs={'data-title': 'A'})[row].contents[0]
            assists.append(a)
        except:
            pass
player_data = {'player_name': player_name,
               'club': club,
               'position': position,
               'goals_scored': goals_scored,
               'assists': assists,
               }
df = pd.DataFrame.from_dict(player_data,orient='index')
df
The only thing I can find is a slight difference in the HTML for the variables that are not returning data. Do I need to include class="responsive" in my code? If so, are there any examples of how that might look?
Position HTML: <td data-title="POS">F</td>
Goals HTML: <td class="responsive" data-title="G">11</td>
Any insight is appreciated
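For what it's worth, combining the class with the data-title in a single lookup is straightforward; here is a minimal sketch, assuming the goals cell really is <td class="responsive" data-title="G">11</td> as shown above:
from bs4 import BeautifulSoup

html = ('<table><tbody><tr><td data-title="POS">F</td>'
        '<td class="responsive" data-title="G">11</td></tr></tbody></table>')
soup = BeautifulSoup(html, "html.parser")

# attrs can combine data-title and class in one lookup
goals_td = soup.find('td', attrs={'data-title': 'G', 'class': 'responsive'})
print(goals_td.contents[0])  # -> 11

# the equivalent CSS selector
print(soup.select_one("td.responsive[data-title='G']").text)  # -> 11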
You can try it like this to get your desired data. I've only parsed the portion you needed; the rest you can do for the dataframe. FYI, there are two types of classes attached to the different tr tags, odd and even, so don't forget to account for both of them.
from bs4 import BeautifulSoup
import requests
page_url = "https://www.mlssoccer.com/stats/season?page={0}&franchise=select&year=2017&season_type=REG&group=goals"
for url in [page_url.format(p) for p in range(5)]:
    soup = BeautifulSoup(requests.get(url).text, "lxml")
    table = soup.select("table")[0]
    for items in table.select(".odd,.even"):
        player = items.select("td[data-title='Player']")[0].text
        club = items.select("td[data-title='Club']")[0].text
        position = items.select("td[data-title='POS']")[0].text
        goals = items.select("td[data-title='G']")[0].text
        assist = items.select("td[data-title='A']")[0].text
        print(player, club, position, goals, assist)
Partial result looks like:
Nemanja Nikolic CHI F 24 4
Diego Valeri POR M 21 11
Ola Kamara CLB F 18 3
As I've included both classes in my script, you will get all of the data from that site.
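To finish the dataframe step the answer leaves to the reader, one possible sketch (reusing the same selectors and page range as above; the column names are just illustrative) is:
import pandas as pd
import requests
from bs4 import BeautifulSoup

page_url = "https://www.mlssoccer.com/stats/season?page={0}&franchise=select&year=2017&season_type=REG&group=goals"
records = []
for url in [page_url.format(p) for p in range(5)]:
    soup = BeautifulSoup(requests.get(url).text, "lxml")
    table = soup.select("table")[0]
    for items in table.select(".odd,.even"):
        # collect each row as a dict so pandas can build the frame directly
        records.append({
            'player_name': items.select("td[data-title='Player']")[0].text,
            'club': items.select("td[data-title='Club']")[0].text,
            'position': items.select("td[data-title='POS']")[0].text,
            'goals_scored': items.select("td[data-title='G']")[0].text,
            'assists': items.select("td[data-title='A']")[0].text,
        })

df = pd.DataFrame(records)
print(df.head())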
Related
I'm trying to get the "All Splits" line of numbers from https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame (the HTML is shown in the picture), but my code returns the 'All Splits' text instead of the numbers I'm looking for. How do I change the lookups in the GetStats function to get the numbers instead of the first-column descriptors?
import requests
from bs4 import BeautifulSoup
import re
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import csv
urls = []
data = []
for year in range(2003, 2005):
    for page in range(1, 9):
        url = f'http://www.espn.com/nba/hollinger/statistics/_/page/{page}/year/{year}/qualified/false'
        if url is not None:
            urls.append(url)
def GetData(url):
    names_list = []  # names of players
    pers = []  # player efficiency ratings
    playeridlist = []  # list of player ids to be used in making the new stats-search url
    statsurls = []  # list of urls generated to get player stats
    # makes a pattern for the function to look for
    pattern = re.compile(r'playerId=(\d+)')
    # sets up soup
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'lxml')
    # finds player names and adds them to the list
    names = soup.find(lambda tag: tag.name == 'a' and 'playerId' in tag['href'])
    bodytext = names.text
    names_list.append(bodytext)
    # finds the player efficiency rating and adds it to the list
    pertag = soup.find('td', class_='sortcell')
    per = pertag.text
    pers.append(per)
    # finds player id
    names = soup.find('a', href=pattern)
    player_id = names['href'].split('playerId=')[1]
    playeridlist.append(player_id)
    # uses player id to make a list of new urls for that player and get stats
    for player_id in playeridlist:
        statsurl = f"https://insider.espn.com/nba/player/splits/_/id/{player_id}/type/nba/year/{year}/category/perGame"
        if statsurl is not None:
            statsurls.append(statsurl)

    # parses stats to get stats
    def GetStats(statsurl):  # GO BACK AND MAKE A THREAD EXECUTER STATEMENT WITHIN GETDATA FUNCTION BELOW THIS!!!
        statsreq = requests.get(statsurl)
        statssoup = BeautifulSoup(statsreq.text, 'lxml')
        focusing_search = statssoup.find('tr', class_='Table__TR Table__TR--sm Table__even', attrs={'data-idx': '1'})
        playerstathtml = focusing_search.find('td', class_='Table__TD')
        stat_values = [playerstats.text for playerstats in playerstathtml]
        print(stat_values)

    GetStats("https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame")
    #name_and_stats_list = dict(map(lambda i, j: (i, j), names_list, pers))
    print(f"{bodytext}: {per}")
    print(player_id)

GetData('http://www.espn.com/nba/hollinger/statistics/_/page/1/year/2003/qualified/false')
To get the all_splits stats from:
https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame
This is what I did:
I grabbed the table body using soup.select
Then I grabbed the headings and relevant stats by iterating through the columns/rows.
The list comprehension provides the text in list format, which is easy to convert to a dataframe.
Code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame'
soup = BeautifulSoup(requests.get(url).content, "html.parser")
t = soup.select('main#fittPageContainer div.Table__Scroller > table > tbody')
headings = [h.text for h in t[0].find_next('tr').find_all('td')]
all_splits = [h.text for h in t[0].find_all('tr')[1].find_all('td')]
df = pd.DataFrame([all_splits], columns=headings)
print(df)
Output: a one-row dataframe with the "All Splits" stat line under the column headings.
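If you would rather keep the shape of the original GetStats function, a minimal sketch of the same lookup (assuming, as the selector above suggests, that the numbers live in the div.Table__Scroller table and that the second row of its tbody is the "All Splits" line) might be:
import requests
from bs4 import BeautifulSoup

def GetStats(statsurl):
    statssoup = BeautifulSoup(requests.get(statsurl).text, 'lxml')
    # same selector as above: the numeric columns sit in the scrolling stats table,
    # not in the fixed label column that holds the 'All Splits' text
    tbody = statssoup.select('main#fittPageContainer div.Table__Scroller > table > tbody')[0]
    # first row of the tbody holds the headings, second holds the All Splits numbers
    stat_values = [td.text for td in tbody.find_all('tr')[1].find_all('td')]
    print(stat_values)

GetStats("https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame")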
I'm a beginner at Python and am trying to create a program that will scrape the football/soccer schedule from skysports.com and send it to my phone via SMS through Twilio. I've excluded the SMS code because I have that figured out, so here's the web-scraping code I am stuck on so far:
import requests
from bs4 import BeautifulSoup
from collections import defaultdict

URL = "https://www.skysports.com/football-fixtures"
page = requests.get(URL)
results = BeautifulSoup(page.content, "html.parser")

d = defaultdict(list)
comp = results.find('h5', {"class": "fixres__header3"})
team1 = results.find('span', {"class": "matches__item-col matches__participant matches__participant--side1"})
date = results.find('span', {"class": "matches__date"})
team2 = results.find('span', {"class": "matches__item-col matches__participant matches__participant--side2"})

for ind in range(len(d)):
    d['comp'].append(comp[ind].text)
    d['team1'].append(team1[ind].text)
    d['date'].append(date[ind].text)
    d['team2'].append(team2[ind].text)
The code below should do the trick for you:
from bs4 import BeautifulSoup
import requests
a = requests.get('https://www.skysports.com/football-fixtures')
soup = BeautifulSoup(a.text,features="html.parser")
teams = []
for date in soup.find_all(class_="fixres__header2"):  # searching in that date
    for i in soup.find_all(class_="swap-text--bp30")[1:]:  # skips the first one because that's a heading
        teams.append(i.text)
date = soup.find(class_="fixres__header2").text
print(date)
teams = [i.strip('\n') for i in teams]
for x in range(0, len(teams), 2):
    print(teams[x] + " vs " + teams[x+1])
Let me further explain what I have done:
All the football teams have this class name: swap-text--bp30.
So we can use find_all to extract all the elements with that class.
Once we have our results, we can put them into an array ("teams = []") and then append them in a for loop ("teams.append(i.text)"); ".text" strips out the HTML tags.
Then we strip the "\n" from each string in the array and print the strings two by two.
This should be your final output: each fixture printed as "Team 1 vs Team 2", one per line.
EDIT: To scrape the titles of the leagues we do pretty much the same thing:
league = []
for date in soup.find_all(class_="fixres__header2"):  # searching in that date
    for i in soup.find_all(class_="fixres__header3"):
        league.append(i.text)
Strip the array and create another one:
league = [i.strip('\n') for i in league]
final = []
Then add this final bit of code which is essentially just printing the league then the two teams over and over:
for x in range(0, len(teams), 5):
    final.append(teams[x] + " vs " + teams[x+1])
for i in league:
    print(i)
for i in final:
    print(i)
I need to scrape the information under Elenco dei comuni per regione (list of comuni by region) on Wikipedia. I would like to create an array that allows me to associate each comune with its corresponding region, i.e. something like this:
'Abbateggio': 'Pescara' -> Abruzzo
I tried to get information using BeautifulSoup and requests as follows:
from bs4 import BeautifulSoup as bs
import requests
with requests.Session() as s:  # use session object for efficiency of tcp re-use
    s.headers = {'User-Agent': 'Mozilla/5.0'}
    r = s.get('https://it.wikipedia.org/wiki/Comuni_d%27Italia')
    soup = bs(r.text, 'html.parser')
    for ele in soup.find_all('h3')[:6]:
        tx = bs(str(ele), 'html.parser').find('span', attrs={'class': "mw-headline"})
        if tx is not None:
            print(tx['id'])
However, it does not work (it returns an empty list).
The information that I have looked at using Google Chrome's Inspect tool is the following:
<span class="mw-headline" id="Elenco_dei_comuni_per_regione">Elenco dei comuni per regione</span> (table)
Comuni dell'Abruzzo
(this field should change for each region)
then <table class="wikitable sortable query-tablesortes">
Could you please give me advice on how to get such results?
Any help and suggestions will be appreciated.
EDIT:
Example:
I have a word: comunediabbateggio. This word includes Abbateggio. I would like to know which region can be associated with that city, if it exists.
The information from Wikipedia is needed to create a dataset that allows me to check the field and associate a region with each comune/city.
What I should expect is:
WORD REGION/STATE
comunediabbateggio Pescara
I hope this can help you. Sorry if it was not clear.
Another example, which might be easier for English speakers to understand, is the following:
Instead of the Italian link above, you can also consider the following: https://en.wikipedia.org/wiki/List_of_comuni_of_Italy . For each region (Lombardia, Veneto, Sicily, ...) I would need to collect information about the list of communes of its provinces.
If you click on a "List of communes of ..." link, there is a table that lists the comuni, e.g. https://en.wikipedia.org/wiki/List_of_communes_of_the_Province_of_Agrigento.
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
target = "https://en.wikipedia.org/wiki/List_of_comuni_of_Italy"
def main(url):
    with requests.Session() as req:
        r = req.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        provinces = [item.find_next("span").text for item in soup.findAll(
            "span", class_="tocnumber", text=re.compile(r"\d[.]\d"))]
        search = [item.replace(
            " ", "_") if " " in item else item for item in provinces]
        nested = []
        for item in search:
            for a in soup.findAll("span", id=item):
                goes = [b.text.split("of ")[-1]
                        for b in a.find_next("ul").findAll("a")]
                nested.append(goes)
        dictionary = dict(zip(provinces, nested))
        urls = [f'{url[:24]}{b.get("href")}' for item in search for a in soup.findAll(
            "span", id=item) for b in a.find_next("ul").findAll("a")]
        return urls, dictionary
def parser():
    links, dics = main(target)
    com = []
    for link in tqdm(links):
        try:
            df = pd.read_html(link)[0]
            com.append(df[df.columns[1]].to_list()[:-1])
        except ValueError:
            com.append(["N/A"])
    com = iter(com)
    for x in dics:
        b = dics[x]
        dics[x] = dict(zip(b, com))
    print(dics)

parser()
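To get from that nested dictionary to the word-to-region lookup asked for in the question (e.g. comunediabbateggio), one rough sketch, assuming dics keeps the {region: {province: [comune names]}} shape built above, is:
def build_lookup(dics):
    """Flatten {region: {province: [comuni]}} into comune -> (province, region)."""
    lookup = {}
    for region, provinces in dics.items():
        for province, comuni in provinces.items():
            for comune in comuni:
                if isinstance(comune, str) and comune:
                    lookup[comune.lower().replace(" ", "")] = (province, region)
    return lookup

def find_region(word, lookup):
    """Return the (province, region) whose comune name appears inside the word."""
    word = word.lower()
    for comune, where in lookup.items():
        if comune in word:
            return where
    return None

# e.g. find_region("comunediabbateggio", build_lookup(dics))
# would be expected to return something like ("Pescara", "Abruzzo")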
I want to scrape the names of the members from each page, then move on to the next pages and do the same. My code works for only one page. I'm very new to this; any advice would be appreciated. Thank you.
import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.bodia.com/spa-members/page/1")
soup = BeautifulSoup(r.text,"html.parser")
lights = soup.findAll("span",{"class":"light"})
lights_list = []
for l in lights[0:]:
    result = l.text.strip()
    lights_list.append(result)

print(lights_list)
I tried this and it only gives me the members of page 3.
for i in range(1, 4):  # to scrape names of pages 1 to 3
    r = requests.get("https://www.bodia.com/spa-members/page/" + format(i))
soup = BeautifulSoup(r.text, "html.parser")
lights = soup.findAll("span", {"class": "light"})
lights_list = []
for l in lights[0:]:
    result = l.text.strip()
    lights_list.append(result)
print(lights_list)
Then I tried this:
i = 1
while i < 5:
    r = requests.get("https://www.bodia.com/spa-members/page/" + str(i))
    i += 1
    soup = BeautifulSoup(r.text, "html.parser")
    lights = soup.findAll("span", {"class": "light"})
    lights_list = []
    for l in lights[0:]:
        result = l.text.strip()
        lights_list.append(result)
    print(lights_list)
It gives me the names of 4 members, but I don't know which pages they come from:
['Seng Putheary (Nana)']
['Marco Julia']
['Simon']
['Ms Anne Guerineau']
Just two changes are needed to get it to scrape everything:
r = requests.get("https://www.bodia.com/spa-members/page/"+ format(i)) should be changed to r = requests.get("https://www.bodia.com/spa-members/page/{}".format(i)); calling .format() on the URL template with a {} placeholder is the intended way to use it.
You were not looping over all the code, so the result was that it only printed out one set of names and then had no way to return to the start of the loop. Indenting everything under the for loop fixed that.
import requests
from bs4 import BeautifulSoup
for i in range(1, 4):  # to scrape names of pages 1 to 3
    r = requests.get("https://www.bodia.com/spa-members/page/{}".format(i))
    soup = BeautifulSoup(r.text, "html.parser")
    lights = soup.findAll("span", {"class": "light"})
    lights_list = []
    for l in lights[0:]:
        result = l.text.strip()
        lights_list.append(result)
    print(lights_list)
The above code was spitting out a list of names every 3 seconds for the pages it scraped.
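If you would rather collect every member into a single list instead of one list per page, a small variation on the same code (same URL pattern and class name as above) could be:
import requests
from bs4 import BeautifulSoup

all_members = []
for i in range(1, 4):  # pages 1 to 3, same as above
    r = requests.get("https://www.bodia.com/spa-members/page/{}".format(i))
    soup = BeautifulSoup(r.text, "html.parser")
    # accumulate across pages instead of resetting the list on each iteration
    all_members.extend(span.text.strip() for span in soup.findAll("span", {"class": "light"}))

print(all_members)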
Well, I have been looking at this for 6 hours and can't figure it out. I want to use BeautifulSoup to filter data from a webpage, but I can't get .contents or get_text() to work, and I have no clue where I am going wrong or how to apply another filter on the first pass. I can get to the "fieldset" tag but can't narrow down to the tags that hold the data. Sorry if this is a simple issue; I only started Python yesterday and started (trying, at least) web scraping this morning.
Entire Code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from openpyxl import Workbook
import bs4 as bs
import math

book = Workbook()
sheet = book.active
i = 0

#Change this value to your starting tracking number
StartingTrackingNumber = 231029883

#Change this value to increase or decrease the number of tracking numbers you want to search overall
TrackingNumberCount = 4

#Number of Tracking Numbers Searched at One Time
QtySearch = 4

#TrackingNumbers=["Test","Test 2"]

for i in range(0, TrackingNumberCount):
    g = i + StartingTrackingNumber
    sheet.cell(row=i+1, column=1).value = 'RN' + str(g) + 'CA,'

TrackingNumbers = []
for col in sheet['A']:
    TrackingNumbers.append(col.value)

MaxRow = sheet.max_row
MaxIterations = math.ceil(MaxRow / QtySearch)
#print(MaxIterations)
RowCount = 0
LastTrackingThisPass = QtySearch

for RowCount in range(0, MaxIterations):  #range(1,MaxRow):
    FirstTrackingThisPass = (RowCount) * QtySearch
    x = TrackingNumbers[FirstTrackingThisPass:LastTrackingThisPass]
    LastTrackingThisPass += QtySearch
    driver = webdriver.Safari()
    driver.set_page_load_timeout(20)
    driver.get("https://www.canadapost.ca/cpotools/apps/track/personal/findByTrackNumber?execution=e1s1")
    driver.find_element_by_xpath('//*[contains(@id, "trackNumbers")]').send_keys(x)
    driver.find_element_by_xpath('//*[contains(@id, "submit_button")]').send_keys(chr(13))
    driver.set_page_load_timeout(3000)
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.ID, "noResults_modal")))
    SourceCodeTest = driver.page_source
    #print(SourceCodeTest)
    Soup = bs.BeautifulSoup(SourceCodeTest, "lxml")  #"html.parser")
    z = 3
    #for z in range(1, 5):
    #    t = str(z)
    #    NameCheck = "trackingNumber" + t
    ##   FindTrackingNumbers = Soup.find_all("div", {"id": "trackingNumber3"})
    #    FindTrackingNumbers = Soup.find_all("div", {"id": NameCheck})
    #    print(FindTrackingNumbers)
    Info = Soup.find_all("fieldset", {"class": "trackhistoryitem"}, "strong")
    print(Info.get_text())
Desired Output:
RN231029885CA N/A
RN231029884CA N/A
RN231029883CA 2017/04/04
Sample of the HTML trying to be parsed:
<fieldset class="trackhistoryitem">
<p><strong>Tracking No. </strong><br><input type="hidden" name="ID_RN231029885CA" value="false">RN231029885CA
</p>
<p><strong>Date / Time </strong><br>
<!--h:outputText value="N/A" rendered="true"/>
<h:outputText value="N/A - N/A" rendered="false"/>
<h:outputText value="N/A" rendered="false"/-->N/A
</p>
<p><strong>Description </strong><br><span id="tapListResultForm:tapResultsItems:1:trk_rl_div_1">
Using .get_text() I got back this long ugly string:
'\nTracking No. RN231029885CA\n \nDate / Time \nN/A\n \nDescription '
So, with some of Python's string functions:
objects = []
for each in soup.find_all("fieldset"):
    each = each.get_text().split("\n")  # split the ugly string up
    each = [each[1][-13:], each[4]]     # grab the parts you want, rmv extra words
    objects.append(each)
Note: this assumes all tracking numbers are 13 characters long; if not, you'll need to use a regex or some other creative method to extract them.
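For example, a hedged regex alternative, assuming the tracking numbers follow the two-letters, nine-digits, "CA" pattern seen in the sample HTML above:
import re

# matches tracking numbers of the form RN#########CA, as in the sample
TRACKING_RE = re.compile(r"[A-Z]{2}\d{9}CA")

text = '\nTracking No. RN231029885CA\n \nDate / Time \nN/A\n \nDescription '
match = TRACKING_RE.search(text)
tracking_no = match.group(0) if match else None
print(tracking_no)  # -> RN231029885CA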