Issue while extracting table data using beautifulsoup in python - python

The code below gives me the required product data as a table. It works fine for most links, however in some it stops midway giving an error NoneType object has no attribute find_all in table.find_all('tr).
I believe this is because table does not exist in some products, so I tried creating an if condition on the existence of the table, but that also doesn't seem to help. What changes should I make in the code below?
import requests, json, time
from bs4 import BeautifulSoup
import pandas as pd
url = "https://www.1800wheelchair.com/category/toilet-accessories/?p="
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
data = []
for i in range(1,3):
print(i)
res = requests.get(url + str(i), headers=headers)
soup = BeautifulSoup(res.text, "html.parser")
p_links = [i["data-link"] for i in soup.find("ul", {"id":"products-list"}).find_all("li",class_=["openlink","item"])]
for prod_url in p_links:
print(prod_url)
temp = {"Product URL": prod_url}
prod_res = requests.get(prod_url, headers = headers)
prod_soup = BeautifulSoup(prod_res.text, "html.parser")
for p in prod_soup.find("div", class_="basic-information").find_all("p"):
if "item" in p.text.lower(): temp["item number"] = p.find("span").text.strip()
elif "brand" in p.text.lower(): temp["manufacturer"] = p.find("span").text.strip()
elif "sku" in p.text.lower(): temp["sku"] = p.find("span").text.strip()
table = prod_soup.find("table",{"class":"specifications"})
for tr in table.find_all("tr"):
temp[tr.find("td", {"class":"tdLabel"}).text.strip()] = tr.find("td", {"class":"tdValue"}).text.strip()
data.append(temp)
pd.DataFrame(data).to_csv("toilet-acc.csv", index=False)

You can use Try and Except (documentation):
try:
for tr in table.find_all("tr"):
temp[tr.find("td", {"class":"tdLabel"}).text.strip()] = tr.find("td", {"class":"tdValue"}).text.strip()
except:
pass

you can use this:
tables = soup.select('table', attrs={"class":"specifications"})
rows = tables.findChildren(['tr'])

import requests, json, time
from bs4 import BeautifulSoup
import pandas as pd
url = "https://www.1800wheelchair.com/category/toilet-accessories/?p="
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
data = []
for i in range(1,3):
print(i)
res = requests.get(url + str(i), headers=headers)
soup = BeautifulSoup(res.text, "html.parser")
p_links = [i["data-link"] for i in soup.find("ul", {"id":"products-list"}).find_all("li",class_=["openlink","item"])]
for prod_url in p_links:
print(prod_url)
temp = {"Product URL": prod_url}
prod_res = requests.get(prod_url, headers = headers)
prod_soup = BeautifulSoup(prod_res.text, "html.parser")
try:
for p in prod_soup.find("div", class_="basic-information").find_all("p"):
if "item" in p.text.lower(): temp["item number"] = p.find("span").text.strip()
elif "brand" in p.text.lower(): temp["manufacturer"] = p.find("span").text.strip()
elif "sku" in p.text.lower(): temp["sku"] = p.find("span").text.strip()
table = prod_soup.find("table",{"class":"specifications"})
for tr in table.find_all("tr"):
temp[tr.find("td", {"class":"tdLabel"}).text.strip()] = tr.find("td", {"class":"tdValue"}).text.strip()
except:
print("Failed for URL {}".format(prod_url))
data.append(temp)
time.sleep(2)
pd.DataFrame(data).to_csv("toilet-acc.csv", index=False)
Put a try/except not only to extract product specification but also to extract item/brand/sku. But in the except put a print statement to know which all urls failed so that you can try them again

Related

Problems with getting data from a page using python, beautiful soup

I am trying to explore the web scraping in python.Currently working with beautiful soup.I was trying to get names of the festivals from this site : https://www.skiddle.com/festivals .Everything was going pretty fine, except 1 page, this one: https://www.skiddle.com/festivals/front-end-data-test/. It says 'NoneType' object has no attribute 'find' any way i can get data from there?
Here is the code
import requests
from bs4 import BeautifulSoup
import lxml
import json
headers = {
"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 OPR/89.0.4447.64"
}
#collect all fests URLs
fests_urls_list = []
#for i in range(0, 120, 24):
for i in range(0, 24, 24):
url = f"https://www.skiddle.com/festivals/search/?ajaxing=1&sort=0&fest_name=&from_date=15%20Aug%202022&to_date=&maxprice=500&o={i}&bannertitle=August"
req = requests.get(url=url, headers=headers)
json_data = json.loads(req.text)
html_response = json_data["html"]
with open(f"data/index_{i}.html", "w", encoding="utf-8") as file:
file.write(html_response)
with open(f"data/index_{i}.html", "r", encoding="utf-8") as file:
src = file.read()
soup = BeautifulSoup(src, "lxml")
cards = soup.find_all("a", class_="card-details-link")
for item in cards:
fest_url = "https://www.skiddle.com" + item.get("href")
fests_urls_list.append(fest_url)
#collect fest info
for url in fests_urls_list:
req = requests.get(url=url, headers=headers)
try:
soup = BeautifulSoup(req.text, "lxml")
fest_name = soup.find("div", class_="MuiContainer-root MuiContainer-maxWidthFalse css-1krljt2").find("h1").text.strip()
fest_data = soup.find("div", class_="MuiGrid-root MuiGrid-item MuiGrid-grid-xs-11 css-twt0ol").text.strip()
print(fest_data)
except Exception as ex :
print(ex)
print("This was not supposed to happen")

Pandas to_csv only write the data from certain page

I tried to scrape data from tripadvisor, but from several pages that I tried to scrape, when I try to export it to csv it only shows 1 line of data and gives an error message like this
AttributeError: 'NoneType' object has no attribute 'text'
this is my code
import requests
import pandas as pd
from requests import get
from bs4 import BeautifulSoup
URL = 'https://www.tripadvisor.com/Attraction_Review-g469404-d3780963-Reviews-oa'
for offset in range(0, 30, 10):
url = URL + str(offset) + '-Double_Six_Beach-Seminyak_Kuta_District_Bali.html'
headers = {"User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, "html.parser")
container = soup.find_all('div', {'class':'_2rspOqPP'})
for r in container:
reviews = r.find_all('div', {'class': None})
#the container that contains the elements that I want to scrape has no attributes and use DOM element. So I tried to access div with _2rspOqPP class first then access the div with no attributes from there
records = []
for review in reviews:
user = review.find('a', {'class':'_7c6GgQ6n _37QDe3gr WullykOU _3WoyIIcL'}).text
country = review.find('div', {'class' : 'DrjyGw-P _26S7gyB4 NGv7A1lw _2yS548m8 _2cnjB3re _1TAWSgm1 _1Z1zA2gh _2-K8UW3T _1dimhEoy'}).span.text
date = review.find('div', {'class' : '_3JxPDYSx'}).text
content = review.find('div', {'class' : 'DrjyGw-P _26S7gyB4 _2nPM5Opx'}).text
records.append((user, country, date, content))
df = pd.DataFrame(records, columns=['Name', 'Country', 'Date', 'Content'])
df.to_csv('doublesix_.csv', index=False, encoding='utf-8')
Code updated
for r in container:
reviews = r.find_all('div', {'class': None})
records = []
for review in reviews:
try:
user = review.find('a', {'class':'_7c6GgQ6n _37QDe3gr WullykOU _3WoyIIcL'}).text
country = review.find('div', {'class' : 'DrjyGw-P _26S7gyB4 NGv7A1lw _2yS548m8 _2cnjB3re _1TAWSgm1 _1Z1zA2gh _2-K8UW3T _1dimhEoy'}).span.text
date = review.find('div', {'class' : '_3JxPDYSx'}).text
content = review.find('div', {'class' : 'DrjyGw-P _26S7gyB4 _2nPM5Opx'}).text
records.append((user, country, date, content))
except:
pass
print(records)
df = pd.DataFrame(records, columns=['Name', 'Country', 'Date', 'Content'])
df.to_csv('doublesix_.csv', index=False, encoding='utf-8')
You should move the records out of the for loops and unindent the last few lines.
See this:
import pandas as pd
import requests
from bs4 import BeautifulSoup
main_url = 'https://www.tripadvisor.com/Attraction_Review-g469404-d3780963-Reviews-oa'
country_class = "DrjyGw-P _26S7gyB4 NGv7A1lw _2yS548m8 _2cnjB3re _1TAWSgm1 _1Z1zA2gh _2-K8UW3T _1dimhEoy"
records = []
for offset in range(0, 30, 10):
url = main_url + str(offset) + '-Double_Six_Beach-Seminyak_Kuta_District_Bali.html'
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
}
soup = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")
container = soup.find_all('div', {'class': '_2rspOqPP'})
for r in container:
reviews = r.find_all('div', {'class': None})
for review in reviews:
try:
user = review.find('a', {'class': '_7c6GgQ6n _37QDe3gr WullykOU _3WoyIIcL'}).text
country = review.find('div', {'class': country_class}).span.text
date = review.find('div', {'class': '_3JxPDYSx'}).text
content = review.find('div', {'class': 'DrjyGw-P _26S7gyB4 _2nPM5Opx'}).text
records.append((user, country, date, content))
except AttributeError:
pass
df = pd.DataFrame(records, columns=['Name', 'Country', 'Date', 'Content'])
df.to_csv('doublesix_.csv', index=False, encoding='utf-8')
Output from the .csv file:

Unable to scrape the name from the inner page of each result using requests

I've created a script in python making use of post http requests to get the search results from a webpage. To populate the results, it is necessary to click on the fields sequentially shown here. Now a new page will be there and this is how to populate the result.
There are ten results in the first page and the following script can parse the results flawlessly.
What I wish to do now is use the results to reach their inner page in order to parse Sole Proprietorship Name (English) from there.
website address
I've tried so far with:
import re
import requests
from bs4 import BeautifulSoup
url = "https://www.businessregistration.moc.gov.kh/cambodia-master/service/create.html?targetAppCode=cambodia-master&targetRegisterAppCode=cambodia-br-soleproprietorships&service=registerItemSearch"
payload = {
'QueryString': '0',
'SourceAppCode': 'cambodia-br-soleproprietorships',
'OriginalVersionIdentifier': '',
'_CBASYNCUPDATE_': 'true',
'_CBHTMLFRAG_': 'true',
'_CBNAME_': 'buttonPush'
}
with requests.Session() as s:
s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
res = s.get(url)
target_url = res.url.split("&")[0].replace("view.", "update.")
node = re.findall(r"nodeW\d.+?-Advanced",res.text)[0].strip()
payload['_VIKEY_'] = re.findall(r"viewInstanceKey:'(.*?)',", res.text)[0].strip()
payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
payload[node] = 'N'
payload['_CBNODE_'] = re.findall(r"Callback\('(.*?)','buttonPush", res.text)[2]
payload['_CBHTMLFRAGNODEID_'] = re.findall(r"AsyncWrapper(W\d.+?)'",res.text)[0].strip()
res = s.post(target_url,data=payload)
soup = BeautifulSoup(res.content, 'html.parser')
for item in soup.find_all("span", class_="appReceiveFocus")[3:]:
print(item.text)
How can I parse the Name (English) from each of the results inner page using requests?
This is one of the ways you can parse the name from the site's inner page and then email address from the address tab. I added this function .get_email() only because I wanted to let you know as to how you can parse content from different tabs.
import re
import requests
from bs4 import BeautifulSoup
url = "https://www.businessregistration.moc.gov.kh/cambodia-master/service/create.html?targetAppCode=cambodia-master&targetRegisterAppCode=cambodia-br-soleproprietorships&service=registerItemSearch"
result_url = "https://www.businessregistration.moc.gov.kh/cambodia-master/viewInstance/update.html?id={}"
base_url = "https://www.businessregistration.moc.gov.kh/cambodia-br-soleproprietorships/viewInstance/update.html?id={}"
def get_names(s):
s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
res = s.get(url)
target_url = result_url.format(res.url.split("id=")[1])
soup = BeautifulSoup(res.text,"lxml")
payload = {i['name']:i.get('value','') for i in soup.select('input[name]')}
payload['QueryString'] = 'a'
payload['SourceAppCode'] = 'cambodia-br-soleproprietorships'
payload['_CBNAME_'] = 'buttonPush'
payload['_CBHTMLFRAG_'] = 'true'
payload['_VIKEY_'] = re.findall(r"viewInstanceKey:'(.*?)',", res.text)[0].strip()
payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
payload['_CBNODE_'] = re.findall(r"Callback\('(.*?)','buttonPush", res.text)[-1]
payload['_CBHTMLFRAGNODEID_'] = re.findall(r"AsyncWrapper(W\d.+?)'",res.text)[0].strip()
res = s.post(target_url,data=payload)
soup = BeautifulSoup(res.text,"lxml")
payload.pop('_CBHTMLFRAGNODEID_')
payload.pop('_CBHTMLFRAG_')
payload.pop('_CBHTMLFRAGID_')
for item in soup.select("a[class*='ItemBox-resultLeft-viewMenu']"):
payload['_CBNAME_'] = 'invokeMenuCb'
payload['_CBVALUE_'] = ''
payload['_CBNODE_'] = item['id'].replace('node','')
res = s.post(target_url,data=payload)
soup = BeautifulSoup(res.text,'lxml')
address_url = base_url.format(res.url.split("id=")[1])
node_id = re.findall(r"taba(.*)_",soup.select_one("a[aria-label='Addresses']")['id'])[0]
payload['_CBNODE_'] = node_id
payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
payload['_CBNAME_'] = 'tabSelect'
payload['_CBVALUE_'] = '1'
eng_name = soup.select_one(".appCompanyName + .appAttrValue").get_text()
yield from get_email(s,eng_name,address_url,payload)
def get_email(s,eng_name,url,payload):
res = s.post(url,data=payload)
soup = BeautifulSoup(res.text,'lxml')
email = soup.select_one(".EntityEmailAddresses:contains('Email') .appAttrValue").get_text()
yield eng_name,email
if __name__ == '__main__':
with requests.Session() as s:
for item in get_names(s):
print(item)
Output are like:
('AMY GEMS', 'amy.n.company#gmail.com')
('AHARATHAN LIN LIANJIN FOOD FLAVOR', 'skykoko344#gmail.com')
('AMETHYST DIAMOND KTV', 'twobrotherktv#gmail.com')
To get the Name (English) you can simply replace print(item.text) with print(item.text.split('/')[1].split('(')[0].strip()) which prints AMY GEMS

Problem/Error with scraping in a pandas data frame with beautifulsoup

I'm working on this csv (https://www.kaggle.com/jtrofe/beer-recipes) and I want to scrape every URL in data frame, but I can't because I have a problem/error, I'm not able to scrape all URL, if I try with 1 URL, it's ok and go, but with the function there is a problem... can someone help me?
This is my code:
import requests
from bs4 import BeautifulSoup
from time import sleep
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}
base = 'https://www.brewersfriend.com'
links = [f'{base}{r}' for r in df['URL']]
while True:
try:
r = requests.get(links, headers=headers, stream=False, timeout=8).text
break
except:
if r.status_code == 404:
print("Client error")
r.raise_for_status()
sleep(1)
soup = BeautifulSoup(r, 'html5lib')
rating = soup.find('span', {'itemprop': 'ratingValue'})
DEFAULT_VALUE = 'NaN'
if rating is None:
rating = DEFAULT_VALUE
print(rating.text)
I'm already know that in some page there isn't a rating and so I create the DEFAULT_VALURE with Not a Number, but maybe is an error too.
Before this code there is the data frame, but I don't put it too.
I hope someone can help me!
Thanks so much
All kinds of messy things here. I won;t go over all of it, but one thing I see is you are trying to print (rating.text). If youre rating is 'NaN', one error is that you can't do rating.text
This is not how I would write this up, but going off your initial coding:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from time import sleep
df = pd.read_csv('C:/recipeData/recipeData.csv', encoding = 'ISO-8859-1')
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'}
base = 'https://www.brewersfriend.com'
links = [f'{base}{r}' for r in df['URL']]
for link in links:
try:
r = requests.get(link, headers=headers, stream=False, timeout=8)
if r.status_code == 404:
print("Client error")
r.raise_for_status()
continue
else:
r = r.text
except:
continue
soup = BeautifulSoup(r, 'html5lib')
rating = soup.find('span', {'itemprop': 'ratingValue'}).text
DEFAULT_VALUE = 'NaN'
if rating is None:
rating = DEFAULT_VALUE
print('%s: %s' %(link,rating))
Here is a way to do entire process
import requests, re
import pandas as pd
from bs4 import BeautifulSoup as bs
p = re.compile(r'dataviewToken":"(.*?)"')
p1 = re.compile(r'"rowCount":(\d+)')
results = []
i = 0
with requests.Session() as s:
r = s.get('https://www.kaggle.com/jtrofe/beer-recipes')
token = p.findall(r.text)[0]
rows = int(p1.findall(r.text)[0])
data = {"jwe":{"encryptedToken": token},"source":{"type":3,"dataset":{"url":"jtrofe/beer-recipes","tableType":1,"csv":{"fileName":"recipeData.csv","delimiter":",","headerRows":1}}},"select":["BeerID","Name","URL","Style","StyleID","Size(L)","OG","FG","ABV","IBU","Color","BoilSize","BoilTime","BoilGravity","Efficiency","MashThickness","SugarScale","BrewMethod","PitchRate","PrimaryTemp"],"skip":0,"take": rows}
base = 'https://www.brewersfriend.com'
r = s.post('https://www.kaggleusercontent.com/services/datasets/kaggle.dataview.v1.DataViewer/GetDataView', json = data).json()
names, links = zip(*[(row['text'][1], base + row['text'][2]) for row in r['dataView']['rows']])
for link in links:
r = s.get(link, headers = {'User-Agent' : 'Mozilla/5.0'})
if r.status_code == 403:
rating = 'N/A'
else:
soup = bs(r.content, 'lxml')
rating = soup.select_one('[itemprop=ratingValue]')
if rating is None:
rating = 'N/A'
else:
rating = rating.text
row = [names[i], rating]
results.append(row)
i+=1
df = pd.DataFrame(results, columns = ['Name', 'Rating'])
print(df.head())
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig',index = False )

BeautifulSoup loop isn't iterating through other nodes

There are quite similar scenarios regarding this; but I've been comparing with others.
Getting from Clustered Nodes etc. But somehow; I'm unsure why my for loop isn't iterating and grabbing the text from other elements but only from the first element of the node.
from requests import get
from bs4 import BeautifulSoup
url = 'https://shopee.com.my/'
l = []
headers = {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'}
response = get(url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')
def findDiv():
try:
for container in html_soup.find_all('div', {'class': 'section-trending-search-list'}):
topic = container.select_one(
'div._1waRmo')
if topic:
print(1)
d = {
'Titles': topic.text.replace("\n", "")}
print(2)
l.append(d)
return d
except:
d = None
findDiv()
print(l)
from requests import get
from bs4 import BeautifulSoup
url = 'https://shopee.com.my/'
l = []
headers = {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'}
response = get(url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')
def findDiv():
try:
for container in html_soup.find_all('div', {'class': '_25qBG5'}):
topic = container.select_one('div._1waRmo')
if topic:
d = {'Titles': topic.text.replace("\n", "")}
l.append(d)
return d
except:
d = None
findDiv()
print(l)
Output:
[{'Titles': 'school backpack'}, {'Titles': 'oppo case'}, {'Titles': 'baby chair'}, {'Titles': 'car holder'}, {'Titles': 'sling beg'}]
Again I suggest you use selenium. If you run this again you will see that you will get a different set of 5 dictionaries within the list. Every time you are making a request they are giving 5 random trending items. But they do have a 'change' button. If you use selenium, you might be able to just click that and keep scraping all trending items.
Try this:
toplevel is finding the root of the options, then we find all divs under that.
I hope this is what you want.
from requests import get
from bs4 import BeautifulSoup
url = 'https://shopee.com.my/'
l = []
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}
response = get(url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')
def findDiv():
try:
toplevel = html_soup.find('._25qBG5')
for container in toplevel.find_all('div'):
topic = container.select_one('._1waRmo')
if topic:
print(1)
d = {'Titles': topic.text.replace("\n", "")}
print(2)
l.append(d)
return d
except:
d = None
findDiv()
print(l)
This enumerates fine with a local file. When I tried with the url given, the website wasn't returning the html you show.
from requests import get
from bs4 import BeautifulSoup
url = 'path_in_here\\test.html'
l = []
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}
example = open(url,"r")
text = example.read()
#response = get(url, headers=headers)
#html_soup = BeautifulSoup(response.text, 'html.parser')
html_soup = BeautifulSoup(text, 'html.parser')
print (text)
def findDiv():
#try:
print("finding toplevel")
toplevel = html_soup.find("div", { "class": "_25qBG5"} )
print ("found toplevel")
divs = toplevel.findChildren("div", recursive=True)
print("found divs")
for container in divs:
print ("loop")
topic = container.select_one('.1waRmo')
if topic:
print(1)
d = {'Titles': topic.text.replace("\n", "")}
print(2)
l.append(d)
return d
#except:
# d = None
# print ("error")
findDiv()
print(l)

Categories