How to scrape data from a table using CSS Selector and BeautifulSoup? - python

On this page there is a series of tables, and I'm trying to get specific data from an unnamed table and unnamed cells. I used Copy Selector from Chrome's inspect elements to find the CSS selector. When I ask Python to print that specific CSS selector, I get "'NoneType' object is not callable".
Specifically, I am trying to get the number "198" to show up from the table at #general-info > article:nth-child(4) > table:nth-child(2).
The CSS Selector path is:
"html body div#program-details section#general-info article.grid-50 table tbody tr td"
and this comes up using Copy Selector:
#general-info > article:nth-child(4) > table:nth-child(2) > tbody > tr > td:nth-child(2)
Most of the code is accessing the site and bypassing the EULA. Skip to the bottom for the code I'm having problems with.
import mechanize
import requests
import urllib2
import urllib
import csv
from BeautifulSoup import BeautifulSoup
br = mechanize.Browser()
br.set_handle_robots(False)
br.addheaders = [("User-agent","Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.13) Gecko/20101206 Ubuntu/10.10 (maverick) Firefox/3.6.13")]
sign_in = br.open('https://login.ama-assn.org/account/login') #the login url
br.select_form(name = "go") #Alternatively you may use this instead of the above line if your form has name attribute available.
br["username"] = "wasabinoodlz" #the key "username" is the variable that takes the username/email value
br["password"] = "Bongshop10" #the key "password" is the variable that takes the password value
logged_in = br.submit() #submitting the login credentials
logincheck = logged_in.read() #reading the page body that is redirected after successful login
#print (logincheck) #printing the body of the redirected url after login
# EULA agreement stuff
cont = br.open('https://freida.ama-assn.org/Freida/eula.do').read()
cont1 = br.open('https://freida.ama-assn.org/Freida/eulaSubmit.do').read()
# Begin request for page data
req = br.open('https://freida.ama-assn.org/Freida/user/programDetails.do?pgmNumber=1205712369').read()
#Da Soups!
soup = BeautifulSoup(req)
#print soup.prettify() # use this to read html.prettify()
for score in soup.select('#general-info > article:nth-child(4) > table:nth-child(2) > tbody > tr > td:nth-child(2)'):
    print score.string

You need to initialize BeautifulSoup with the html5lib parser.
soup = BeautifulSoup(req, 'html5lib')
BeautifulSoup's select() only implements the nth-of-type pseudo-selector, not nth-child.
data = soup.select(
    '#general-info > '
    'article:nth-of-type(4) > '
    'table:nth-of-type(2) > '
    'tbody > '
    'tr > '
    'td:nth-of-type(2)'
)
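Note that this constructor signature (and .select() itself) comes from bs4, so the import should be from bs4 import BeautifulSoup rather than the old from BeautifulSoup import BeautifulSoup used in the question. soup.select() returns a list of matching tags, so a minimal usage sketch for pulling out the cell's text might look like this (assuming the first match is the cell that holds 198):
if data:
    print data[0].text.strip()  # expected to print something like "198"
else:
    print "selector matched nothing - check the page structure"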

Related

BeautifulSoup: find_all() returns an empty list

After checking the source page of the website I am crawling, I guess the reason I could not fetch the content I wanted is that there's no such div element in the source page. I also tried using a CSS selector (in another question, BeautifulSoup: Why .select method returned an empty list?), but that won't work either. Here's some of my code:
# Scraping top products sales and name from the Recommendation page
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import json
import requests
import numpy as np
import pandas as pd
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
'cookie': '_gcl_au=1.1.961206468.1594951946; _med=refer; _fbp=fb.2.1594951949275.1940955365; SPC_IA=-1; SPC_F=y1evilme0ImdfEmNWEc08bul3d8toc33; REC_T_ID=fab983c8-c7d2-11ea-a977-ccbbfe23657a; SPC_SI=uv1y64sfvhx3w6dir503ixw89ve2ixt4; _gid=GA1.3.413262278.1594951963; SPC_U=286107140; SPC_EC=GwoQmu7TiknULYXKODlEi5vEgjawyqNcpIWQjoxjQEW2yJ3H/jsB1Pw9iCgGRGYFfAkT/Ej00ruDcf7DHjg4eNGWbCG+0uXcKb7bqLDcn+A2hEl1XMtj1FCCIES7k17xoVdYW1tGg0qaXnSz0/Uf3iaEIIk7Q9rqsnT+COWVg8Y=; csrftoken=5MdKKnZH5boQXpaAza1kOVLRFBjx1eij; welcomePkgShown=true; _ga=GA1.1.1693450966.1594951955; _dc_gtm_UA-61904553-8=1; REC_MD_30_2002454304=1595153616; _ga_SW6D8G0HXK=GS1.1.1595152099.14.1.1595153019.0; REC_MD_41_1000044=1595153318_0_50_0_49; SPC_R_T_ID="Am9bCo3cc3Jno2mV5RDkLJIVsbIWEDTC6ezJknXdVVRfxlQRoGDcya57fIQsioFKZWhP8/9PAGhldR0L/efzcrKONe62GAzvsztkZHfAl0I="; SPC_T_IV="IETR5YkWloW3OcKf80c6RQ=="; SPC_R_T_IV="IETR5YkWloW3OcKf80c6RQ=="; SPC_T_ID="Am9bCo3cc3Jno2mV5RDkLJIVsbIWEDTC6ezJknXdVVRfxlQRoGDcya57fIQsioFKZWhP8/9PAGhldR0L/efzcrKONe62GAzvsztkZHfAl0I="'
}
shopee_url = 'https://shopee.co.id/top_products'
navi_info = requests.get('https://shopee.co.id/api/v4/recommend/recommend?bundle=top_sold_product_microsite&limit=20&offset=0')
# extracts all the "index" data from all "sections"
index_arrays = [object_['index'] for object_ in navi_info.json()['data']['sections']]
index_array = index_arrays[0] # only one section with "index" key is present
# extract all catIDs from the "index" payload
catIDs = [object_['key'] for object_ in index_array]
params = {'catID': catIDs}
print(params)
# a = requests.get(link, headers=headers)
response = requests.get('https://shopee.co.id/top_products', params=params)
print(response.text)
soup = bs(response.text, 'html.parser')
products = soup.find_all('div', attrs={'class': '_3S8sOC _2QfAXF'})
print(products) # Why this returns an empty list?
for product in products:
    name = product.select_one('#main > div > div.shopee-page-wrapper > div.page-product > div.container > div.product-briefing.flex.card._2cRTS4 > div.flex.flex-auto.k-mj2F > div > div.qaNIZv > span')
    sales = product.select_one('#main > div > div.shopee-page-wrapper > div.page-product > div.container > div.product-briefing.flex.card._2cRTS4 > div.flex.flex-auto.k-mj2F > div > div.flex._32fuIU > div.flex.SbDIui > div._22sp0A')
    print(name)
    print(sales)
You get no items in the find_all() list because there are no tags with the class '_3S8sOC _2QfAXF' in the page's HTML.
You can check this easily by scraping all the elements with this class:
import requests
from bs4 import BeautifulSoup as bs
response = requests.get('https://shopee.co.id/top_products').content
soup = bs(response,"html.parser")
products = soup.find_all(class_ = "_3S8sOC _2QfAXF")
Note here that I am scraping the content attribute and also using the class_ kwarg in the find_all() method to get all the elements with this specific class.
Unfortunately, there are no such elements, so
print(products)
gives back:
[]
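The likely reason is that those product tiles are rendered client-side by JavaScript, so they never appear in the raw HTML that requests downloads. A minimal sketch to confirm this by rendering the page first with the selenium webdriver the question already imports (the '_3S8sOC _2QfAXF' class name is carried over from the question and may well have changed since):
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import time

driver = webdriver.Chrome()                      # requires a matching chromedriver; webdriver.Firefox() works too
driver.get('https://shopee.co.id/top_products')
time.sleep(5)                                    # crude wait for the JavaScript to render the tiles

soup = bs(driver.page_source, 'html.parser')
products = soup.find_all('div', attrs={'class': '_3S8sOC _2QfAXF'})
print(len(products))                             # non-zero only if the rendered page contains the tiles
driver.quit()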

Not able to scrape all the reviews

I am trying to scrape this website to get the reviews, but I am facing an issue:
the page loads only 50 reviews.
To load more you have to click "Show More Reviews", and I don't know how to get all the data, as there is no page link; "Show More Reviews" doesn't have a URL to explore, and the address remains the same.
url = "https://www.capterra.com/p/134048/HiMama-Preschool-Child-Care-App/#reviews"
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
a = []
url = requests.get(url)
html = url.text
soup = BeautifulSoup(html, "html.parser")
table = soup.findAll("div", {"class":"review-comments"})
#print(table)
for x in table:
    a.append(x.text)
df = pd.DataFrame(a)
df.to_csv("review.csv", sep='\t')
I know this is not pretty code, but I am just trying to get the review text first.
Kindly help, as I am a little new to this.
Looking at the website, the "Show more reviews" button makes an ajax call that returns the additional info; all you have to do is find its URL and send a GET request to it (which I've done with some simple regex):
import requests
import re
from bs4 import BeautifulSoup

headers = {
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) snap Chromium/74.0.3729.169 Chrome/74.0.3729.169 Safari/537.36"
}
url = "https://www.capterra.com/p/134048/HiMama-Preschool-Child-Care-App/#reviews"
Data = []
# Each page is equivalent to 50 comments:
MaximumCommentPages = 3

with requests.Session() as session:
    info = session.get(url)
    # Get product ID, needed for getting more comments
    productID = re.search(r'"product_id":(\w*)', info.text).group(1)
    # Extract info from main data
    soup = BeautifulSoup(info.content, "html.parser")
    table = soup.findAll("div", {"class":"review-comments"})
    for x in table:
        Data.append(x)
    # Number of pages to get:
    # Get additional data:
    params = {
        "page": "",
        "product_id": productID
    }
    while(MaximumCommentPages > 1):  # compare to 1 because one of the pages was the main page data, which we already extracted!
        MaximumCommentPages -= 1
        params["page"] = str(MaximumCommentPages)
        additionalInfo = session.get("https://www.capterra.com/gdm_reviews", params=params)
        print(additionalInfo.url)
        # print(additionalInfo.text)
        # Extract info from the additional pages:
        soup = BeautifulSoup(additionalInfo.content, "html.parser")
        table = soup.findAll("div", {"class":"review-comments"})
        for x in table:
            Data.append(x)

# Extract data the old fashioned way:
counter = 1
with open('review.csv', 'w') as f:
    for one in Data:
        f.write(str(counter))
        f.write(one.text)
        f.write('\n')
        counter += 1
Notice how I'm using a session to preserve cookies for the ajax call.
Edit 1: You can reload the webpage multiple times and call the ajax again to get even more data.
Edit 2: Save data using your own method.
Edit 3: Changed some stuff; it now gets any number of pages for you and saves to a file with good ol' open()
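If you'd rather keep the pandas-to-CSV approach from the question, a minimal sketch using the Data list collected above:
import pandas as pd

# Data holds the <div class="review-comments"> tags collected above
df = pd.DataFrame([one.text.strip() for one in Data], columns=['review'])
df.to_csv('review.csv', sep='\t', index=False)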

Scraping and parsing multi-page (aspx) table

I'm trying to scrape information on greyhound races. For example, I want to scrape http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena. This page shows all results for the dog Hardwick Serena, but it is split over several pages.
Inspecting the page, it shows under the 'next page' button:
<input type="submit" name="ctl00$ctl00$mainContent$cmscontent$DogRaceCard$lvDogRaceCard$ctl00$ctl03$ctl01$ctl12" value=" " title="Next Page" class="rgPageNext">.
I was hoping for an HTML link that I could use for the next iteration of the scrape, but no luck.
Further inspection, by looking at network traffic, shows that the browser sends a horribly long (hashed?) string for __VIEWSTATE, among others. Likely to protect the database?
I'm looking for a way to scrape all pages for one dog, either by iterating over all pages or by increasing the page length to show 100+ lines on page 1. The underlying site is .aspx.
I'm using Python 3.5 and BeautifulSoup.
current code:
import requests
from bs4 import BeautifulSoup
url = 'http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena'
with requests.session() as s:
    s.headers['user-agent'] = 'Mozilla/5.0'
    r = s.get(url)
    soup = BeautifulSoup(r.content, 'html5lib')
    target = 'ctl00$ctl00$mainContent$cmscontent$DogRaceCard$btnFilter_input'
    data = { tag['name']: tag['value']
             for tag in soup.select('input[name^=ctl00]') if tag.get('value')
           }
    state = { tag['name']: tag['value']
              for tag in soup.select('input[name^=__]')
            }
    data.update(state)
    numberpages = int(str(soup.find('div', 'rgWrap rgInfoPart')).split(' ')[-2].split('>')[1].split('<')[0])
    # for page in range(last_page + 1):
    for page in range(numberpages):
        data['__EVENTTARGET'] = target.format(page)
        #data['__VIEWSTATE'] = target.format(page)
        print(10)
        r = s.post(url, data=data)
        soup = BeautifulSoup(r.content, 'html5lib')
        tables = soup.findChildren('table')
        my_table = tables[9]
        rows = my_table.findChildren(['th', 'tr'])
        tabel = [[]]
        for i in range(len(rows)):
            cells = rows[i].findChildren('td')
            tabel.append([])
            for j in range(len(cells)):
                value = cells[j].string
                tabel[i].append(value)
        table = []
        for i in range(len(tabel)):
            if len(tabel[i]) == 16:
                del tabel[i][-2:]
                table.append(tabel[i])
In this case, for each page requested, a POST request is issued with the form-url-encoded parameters __EVENTTARGET and __VIEWSTATE:
__VIEWSTATE can be easily extracted from an input tag.
__EVENTTARGET is different for each page, and the value is passed to a javascript function for each page link, so you can extract it with a regex:
<a href="javascript:__doPostBack('ctl00$ctl00$mainContent$cmscontent$DogRaceCard$lvDogRaceCard$ctl00$ctl03$ctl01$ctl07','')">
<span>2</span>
</a>
The Python script:
from bs4 import BeautifulSoup
import requests
import re

# extract data from page
def extract_data(soup):
    tables = soup.find_all("div", {"class":"race-card"})[0].find_all("tbody")
    item_list = [
        (
            t[0].text.strip(),   #date
            t[1].text.strip(),   #dist
            t[2].text.strip(),   #TP
            t[3].text.strip(),   #StmHCP
            t[4].text.strip(),   #Fin
            t[5].text.strip(),   #By
            t[6].text.strip(),   #WinnerOr2nd
            t[7].text.strip(),   #Venue
            t[8].text.strip(),   #Remarks
            t[9].text.strip(),   #WinTime
            t[10].text.strip(),  #Going
            t[11].text.strip(),  #SP
            t[12].text.strip(),  #Class
            t[13].text.strip()   #CalcTm
        )
        for t in (t.find_all('td') for t in tables[1].find_all('tr'))
        if t
    ]
    print(item_list)
session = requests.Session()
url = 'http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena'
response = session.get(url)
soup = BeautifulSoup(response.content, "html.parser")
# get view state value
view_state = soup.find_all("input", {"id":"__VIEWSTATE"})[0]["value"]
# get all event target values
event_target = soup.find_all("div", {"class":"rgNumPart"})[0]
event_target_list = [
    re.search('__doPostBack\(\'(.*)\',', t["href"]).group(1)
    for t in event_target.find_all('a')
]
# extract data for the 1st page
extract_data(soup)
# extract data for each page except the first
for link in event_target_list[1:]:
    print("get page {0}".format(link))
    post_data = {
        '__EVENTTARGET': link,
        '__VIEWSTATE': view_state
    }
    response = session.post(url, data=post_data)
    soup = BeautifulSoup(response.content, "html.parser")
    extract_data(soup)
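If you want the rows in a file rather than just printed, here is a minimal sketch; it assumes extract_data is changed to return item_list instead of printing it, and the column names are taken from the comments above:
import csv

def save_rows(rows, path='hardwick_serena.csv'):
    # rows: the list of tuples built inside extract_data (assumes extract_data now returns item_list)
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Date', 'Dist', 'TP', 'StmHCP', 'Fin', 'By', 'WinnerOr2nd',
                         'Venue', 'Remarks', 'WinTime', 'Going', 'SP', 'Class', 'CalcTm'])
        writer.writerows(rows)
Call it once per page, or collect every page's rows into one list and call it once at the end.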

Unable to format the HTML output

I have python code that returns an HTML page. Within that page, there is a line "2092 Pittman Road" which is the parcel address. My code is below:
import mechanize
br = mechanize.Browser()
response = br.open("https://www.matsugov.us/myproperty")
for form in br.forms():
    if form.attrs.get('name') == 'frmSearch':
        br.form = form
        break
br.form['ddlType'] = ["taxid"]
br['txtParm'] = "218N02W27C003"
req = br.submit().read()
print req
req gives me the output in HTML format. You can run this code as-is to see the output.
Use this code; it will work for you:
from bs4 import BeautifulSoup
import mechanize

br = mechanize.Browser()
response = br.open("https://www.matsugov.us/myproperty")
for form in br.forms():
    if form.attrs.get('name') == 'frmSearch':
        br.form = form
        break
br.form['ddlType'] = ["taxid"]
br['txtParm'] = "218N02W27C003"
req = br.submit().read()
soup = BeautifulSoup(req, 'html.parser')
table = soup.find('td', {'class': 'Grid_5'})
for row in table:
    print row
Feed your HTML to BeautifulSoup, and then navigate or format it as you wish.
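For example, to pull just the text out of that cell instead of iterating over its children, something along these lines should work (whether the Grid_5 cell is the one holding the parcel address is an assumption based on the answer above):
address_cell = soup.find('td', {'class': 'Grid_5'})
if address_cell is not None:
    print address_cell.get_text(strip=True)   # e.g. "2092 Pittman Road"
else:
    print "address cell not found"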

python mechanize check dates/time for an exam from a website

I am trying to check the availability of dates/times for an exam using Python mechanize, and to send someone an email if a particular date/time becomes available in the result (result page screenshot attached).
import mechanize
from BeautifulSoup import BeautifulSoup
URL = "http://secure.dre.ca.gov/PublicASP/CurrentExams.asp"
br = mechanize.Browser()
response = br.open(URL)
# there are some errors in doctype and hence filtering the page content a bit
response.set_data(response.get_data()[200:])
br.set_response(response)
br.select_form(name="entry_form")
# select Oakland for the 1st set of checkboxes
for i in range(0, len(br.find_control(type="checkbox", name="cb_examSites").items)):
    if i == 2:
        br.find_control(type="checkbox", name="cb_examSites").items[i].selected = True
# select salesperson for the 2nd set of checkboxes
for i in range(0, len(br.find_control(type="checkbox", name="cb_examTypes").items)):
    if i == 1:
        br.find_control(type="checkbox", name="cb_examTypes").items[i].selected = True
reponse = br.submit()
print reponse.read()
I am able to get the response but for some reason the data within my table is missing
here are the buttons from the initial html page
<input type="submit" value="Get Exam List" name="B1">
<input type="button" value="Clear" name="B2" onclick="clear_entries()">
<input type="hidden" name="action" value="GO">
One part of the output (the submit response) where the actual data should be:
<table summary="California Exams Scheduling" class="General_list" width="100%" cellspacing="0"> <EVERYTHING IN BETWEEN IS MISSING HERE>
</table>
All the data within the table is missing. I have provided a screenshot of the table element from chrome browser.
Can someone please tell me what could be wrong ?
Can someone please tell me how to get the date/time out of the response (assuming I have to use BeautifulSoup)? It has to be something along these lines: I am trying to find out if a particular date I have in mind (say March 8th) shows up in the response with a Begin Time of 1:30 pm (screenshot attached).
soup = BeautifulSoup(response.read())
print soup.find(name="table")
Update - looks like my issue might be related to this question, and I am trying my options. I tried the below as per one of the answers, but I cannot see any tr elements in the data (though I can see them in the page source when I check it manually):
soup.findAll('table')[0].findAll('tr')
Update - Modified this to use selenium; will try and do further at some point soon.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import urllib3
myURL = "http://secure.dre.ca.gov/PublicASP/CurrentExams.asp"
browser = webdriver.Firefox() # Get local session of firefox
browser.get(myURL) # Load page
element = browser.find_element_by_id("Checkbox5")
element.click()
element = browser.find_element_by_id("Checkbox13")
element.click()
element = browser.find_element_by_name("B1")
element.click()
Five years later, maybe this can help someone. I took your problem as a training exercise and completed it using the requests package (Python 3.9).
The code below is in two parts:
the request to retrieve the data injected into the table after the POST request.
## the request part
import requests as rq
from bs4 import BeautifulSoup as bs
from bs4.element import NavigableString, Tag   # used by the parsing part below

url = "https://secure.dre.ca.gov/PublicASP/CurrentExams.asp"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0"}
params = {
    "cb_examSites": [
        "'Fresno'",
        "'Los+Angeles'",
        "'SF/Oakland'",
        "'Sacramento'",
        "'San+Diego'"
    ],
    "cb_examTypes": [
        "'Broker'",
        "'Salesperson'"
    ],
    "B1": "Get+Exam+List",
    "action": "GO"
}
s = rq.Session()
r = s.get(url, headers=headers)
s.headers.update({"Cookie": "%s=%s" % (r.cookies.keys()[0], r.cookies.values()[0])})
r2 = s.post(url=url, data=params)
soup = bs(r2.content, "lxml")  # contains the data you want
Parsing the response (there are a lot of ways to do it; mine is maybe a bit clunky):
table = soup.find_all("table", class_="General_list")[0]
titles = [el.text for el in table.find_all("strong")]

def beetweenBr(soupx):
    final_str = []
    for br in soupx.findAll('br'):
        next_s = br.nextSibling
        if not (next_s and isinstance(next_s, NavigableString)):
            continue
        next2_s = next_s.nextSibling
        if next2_s and isinstance(next2_s, Tag) and next2_s.name == 'br':
            text = str(next_s).strip()
            if text:
                final_str.append(next_s.strip())
    return "\n".join(final_str)

d = {}
trs = table.find_all("tr")
for tr in trs:
    tr_text = tr.text
    if tr_text in titles:
        curr_title = tr_text
        splitx = curr_title.split(" - ")
        area, job = splitx[0].split(" ")[0], splitx[1].split(" ")[0]
        if not job in d.keys():
            d[job] = {}
        if not area in d[job].keys():
            d[job][area] = []
        continue
    if (not tr_text in titles) & (tr_text != "DateBegin TimeLocationScheduledCapacity"):
        tds = tr.find_all("td")
        sub = []
        for itd, td in enumerate(tds):
            if itd == 2:
                sub.append(beetweenBr(td))
            else:
                sub.append(td.text)
        d[job][area].append(sub)
"d" contain data u want. I didn't go as far as sending an email yet.
