BeautifulSoup loop isn't iterating through other nodes

BeautifulSoup loop isn't iterating through other nodes - python

There are quite similar scenarios regarding this; but I've been comparing with others.
Getting from Clustered Nodes etc. But somehow; I'm unsure why my for loop isn't iterating and grabbing the text from other elements but only from the first element of the node.
from requests import get
from bs4 import BeautifulSoup
url = 'https://shopee.com.my/'
l = []
headers = {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'}
response = get(url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')
def findDiv():
try:
for container in html_soup.find_all('div', {'class': 'section-trending-search-list'}):
topic = container.select_one(
'div._1waRmo')
if topic:
print(1)
d = {
'Titles': topic.text.replace("\n", "")}
print(2)
l.append(d)
return d
except:
d = None
findDiv()
print(l)

from requests import get
from bs4 import BeautifulSoup
url = 'https://shopee.com.my/'
l = []
headers = {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'}
response = get(url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')
def findDiv():
try:
for container in html_soup.find_all('div', {'class': '_25qBG5'}):
topic = container.select_one('div._1waRmo')
if topic:
d = {'Titles': topic.text.replace("\n", "")}
l.append(d)
return d
except:
d = None
findDiv()
print(l)
Output:
[{'Titles': 'school backpack'}, {'Titles': 'oppo case'}, {'Titles': 'baby chair'}, {'Titles': 'car holder'}, {'Titles': 'sling beg'}]
Again I suggest you use selenium. If you run this again you will see that you will get a different set of 5 dictionaries within the list. Every time you are making a request they are giving 5 random trending items. But they do have a 'change' button. If you use selenium, you might be able to just click that and keep scraping all trending items.

Try this:
toplevel is finding the root of the options, then we find all divs under that.
I hope this is what you want.
from requests import get
from bs4 import BeautifulSoup
url = 'https://shopee.com.my/'
l = []
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}
response = get(url, headers=headers)
html_soup = BeautifulSoup(response.text, 'html.parser')
def findDiv():
try:
toplevel = html_soup.find('._25qBG5')
for container in toplevel.find_all('div'):
topic = container.select_one('._1waRmo')
if topic:
print(1)
d = {'Titles': topic.text.replace("\n", "")}
print(2)
l.append(d)
return d
except:
d = None
findDiv()
print(l)
This enumerates fine with a local file. When I tried with the url given, the website wasn't returning the html you show.
from requests import get
from bs4 import BeautifulSoup
url = 'path_in_here\\test.html'
l = []
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}
example = open(url,"r")
text = example.read()
#response = get(url, headers=headers)
#html_soup = BeautifulSoup(response.text, 'html.parser')
html_soup = BeautifulSoup(text, 'html.parser')
print (text)
def findDiv():
#try:
print("finding toplevel")
toplevel = html_soup.find("div", { "class": "_25qBG5"} )
print ("found toplevel")
divs = toplevel.findChildren("div", recursive=True)
print("found divs")
for container in divs:
print ("loop")
topic = container.select_one('.1waRmo')
if topic:
print(1)
d = {'Titles': topic.text.replace("\n", "")}
print(2)
l.append(d)
return d
#except:
# d = None
# print ("error")
findDiv()
print(l)

Related

Parsing in python for all pages of the site section

I'm making a python parser for the site: https://www.kinopoisk.ru/lists/series-top250/
import requests
from bs4 import BeautifulSoup
import csv
CSV = 'genres.csv'
URL = 'https://www.kinopoisk.ru/lists/series-top250/?page=1&tab=all'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0', 'accept': '*/*'}
def get_html(url, params = None):
r = requests.get(url, headers=HEADERS, params=params)
return r
def get_content(html):
soup = BeautifulSoup(html, 'html.parser')
items = soup.find_all('div', class_='selection-film-item-meta selection-film-item-meta_theme_desktop')
genres = []
for item in items:
additional = item.find_all('span', {'class':'selection-film-item-meta__meta-additional-item'})
genres.append(
{
'genre': additional[1].get_text(strip = True)
}
)
return genres
def save_genres(items, path):
with open(path, 'w', newline='') as file:
writer = csv.writer(file, delimiter=',')
writer.writerow(['genre'])
for item in items:
writer.writerow([item['genre']])
def parser():
html = get_html(URL)
if html.status_code == 200:
genres = []
for page in range(1, 6):
html = get_html(URL, params = {'page': page})
genres.extend(get_content(html.text))
save_genres(genres, CSV)
pass
else:
print('Non_available')
parser()
The section of site has 5 pages of rating:
https://www.kinopoisk.ru/lists/series-top250/?page=1&tab=all
...
https://www.kinopoisk.ru/lists/series-top250/?page=5&tab=all
I made a for_loop for parsing from all pages with changing number of page
for page in range(1, 6):
html = get_html(URL, params = {'page': page})
genres.extend(get_content(html.text))
but parsing occurs only on 1 page. Please tell me, what am I doing wrong?
And when I save the result in CSV, each line can contain more than 1 word (genre designation), I don’t know how to make sure that there is only 1 value on 1 line for aggregated analytics
Thank you!

Remove the parameters from the URL (the part after ? included):
import requests
from bs4 import BeautifulSoup
import csv
CSV = "genres.csv"
URL = "https://www.kinopoisk.ru/lists/series-top250/"
HEADERS = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0",
"accept": "*/*",
}
PARAMS = {"page": 1, "tab": "all"}
def get_html(url, params=None):
r = requests.get(url, headers=HEADERS, params=params)
return r
def get_content(html):
soup = BeautifulSoup(html, "html.parser")
items = soup.find_all(
"div",
class_="selection-film-item-meta selection-film-item-meta_theme_desktop",
)
genres = []
for item in items:
additional = item.find_all(
"span", {"class": "selection-film-item-meta__meta-additional-item"}
)
genres.append({"genre": additional[1].get_text(strip=True)})
return genres
def save_genres(items, path):
with open(path, "w", newline="") as file:
writer = csv.writer(file, delimiter=",")
writer.writerow(["genre"])
for item in items:
writer.writerow([item["genre"]])
def parser():
genres = []
for page in range(1, 6):
print("Parsing page {}...".format(page))
PARAMS["page"] = page
html = get_html(URL, PARAMS)
if html.status_code == 200:
genres.extend(get_content(html.text))
else:
print("Non_available")
save_genres(genres, CSV)
parser()
Creates genres.csv:

Issue while extracting table data using beautifulsoup in python

The code below gives me the required product data as a table. It works fine for most links, however in some it stops midway giving an error NoneType object has no attribute find_all in table.find_all('tr).
I believe this is because table does not exist in some products, so I tried creating an if condition on the existence of the table, but that also doesn't seem to help. What changes should I make in the code below?
import requests, json, time
from bs4 import BeautifulSoup
import pandas as pd
url = "https://www.1800wheelchair.com/category/toilet-accessories/?p="
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
data = []
for i in range(1,3):
print(i)
res = requests.get(url + str(i), headers=headers)
soup = BeautifulSoup(res.text, "html.parser")
p_links = [i["data-link"] for i in soup.find("ul", {"id":"products-list"}).find_all("li",class_=["openlink","item"])]
for prod_url in p_links:
print(prod_url)
temp = {"Product URL": prod_url}
prod_res = requests.get(prod_url, headers = headers)
prod_soup = BeautifulSoup(prod_res.text, "html.parser")
for p in prod_soup.find("div", class_="basic-information").find_all("p"):
if "item" in p.text.lower(): temp["item number"] = p.find("span").text.strip()
elif "brand" in p.text.lower(): temp["manufacturer"] = p.find("span").text.strip()
elif "sku" in p.text.lower(): temp["sku"] = p.find("span").text.strip()
table = prod_soup.find("table",{"class":"specifications"})
for tr in table.find_all("tr"):
temp[tr.find("td", {"class":"tdLabel"}).text.strip()] = tr.find("td", {"class":"tdValue"}).text.strip()
data.append(temp)
pd.DataFrame(data).to_csv("toilet-acc.csv", index=False)

You can use Try and Except (documentation):
try:
for tr in table.find_all("tr"):
temp[tr.find("td", {"class":"tdLabel"}).text.strip()] = tr.find("td", {"class":"tdValue"}).text.strip()
except:
pass

you can use this:
tables = soup.select('table', attrs={"class":"specifications"})
rows = tables.findChildren(['tr'])

import requests, json, time
from bs4 import BeautifulSoup
import pandas as pd
url = "https://www.1800wheelchair.com/category/toilet-accessories/?p="
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
data = []
for i in range(1,3):
print(i)
res = requests.get(url + str(i), headers=headers)
soup = BeautifulSoup(res.text, "html.parser")
p_links = [i["data-link"] for i in soup.find("ul", {"id":"products-list"}).find_all("li",class_=["openlink","item"])]
for prod_url in p_links:
print(prod_url)
temp = {"Product URL": prod_url}
prod_res = requests.get(prod_url, headers = headers)
prod_soup = BeautifulSoup(prod_res.text, "html.parser")
try:
for p in prod_soup.find("div", class_="basic-information").find_all("p"):
if "item" in p.text.lower(): temp["item number"] = p.find("span").text.strip()
elif "brand" in p.text.lower(): temp["manufacturer"] = p.find("span").text.strip()
elif "sku" in p.text.lower(): temp["sku"] = p.find("span").text.strip()
table = prod_soup.find("table",{"class":"specifications"})
for tr in table.find_all("tr"):
temp[tr.find("td", {"class":"tdLabel"}).text.strip()] = tr.find("td", {"class":"tdValue"}).text.strip()
except:
print("Failed for URL {}".format(prod_url))
data.append(temp)
time.sleep(2)
pd.DataFrame(data).to_csv("toilet-acc.csv", index=False)
Put a try/except not only to extract product specification but also to extract item/brand/sku. But in the except put a print statement to know which all urls failed so that you can try them again

Unable to scrape the name from the inner page of each result using requests

I've created a script in python making use of post http requests to get the search results from a webpage. To populate the results, it is necessary to click on the fields sequentially shown here. Now a new page will be there and this is how to populate the result.
There are ten results in the first page and the following script can parse the results flawlessly.
What I wish to do now is use the results to reach their inner page in order to parse Sole Proprietorship Name (English) from there.
website address
I've tried so far with:
import re
import requests
from bs4 import BeautifulSoup
url = "https://www.businessregistration.moc.gov.kh/cambodia-master/service/create.html?targetAppCode=cambodia-master&targetRegisterAppCode=cambodia-br-soleproprietorships&service=registerItemSearch"
payload = {
'QueryString': '0',
'SourceAppCode': 'cambodia-br-soleproprietorships',
'OriginalVersionIdentifier': '',
'_CBASYNCUPDATE_': 'true',
'_CBHTMLFRAG_': 'true',
'_CBNAME_': 'buttonPush'
}
with requests.Session() as s:
s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
res = s.get(url)
target_url = res.url.split("&")[0].replace("view.", "update.")
node = re.findall(r"nodeW\d.+?-Advanced",res.text)[0].strip()
payload['_VIKEY_'] = re.findall(r"viewInstanceKey:'(.*?)',", res.text)[0].strip()
payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
payload[node] = 'N'
payload['_CBNODE_'] = re.findall(r"Callback\('(.*?)','buttonPush", res.text)[2]
payload['_CBHTMLFRAGNODEID_'] = re.findall(r"AsyncWrapper(W\d.+?)'",res.text)[0].strip()
res = s.post(target_url,data=payload)
soup = BeautifulSoup(res.content, 'html.parser')
for item in soup.find_all("span", class_="appReceiveFocus")[3:]:
print(item.text)
How can I parse the Name (English) from each of the results inner page using requests?

This is one of the ways you can parse the name from the site's inner page and then email address from the address tab. I added this function .get_email() only because I wanted to let you know as to how you can parse content from different tabs.
import re
import requests
from bs4 import BeautifulSoup
url = "https://www.businessregistration.moc.gov.kh/cambodia-master/service/create.html?targetAppCode=cambodia-master&targetRegisterAppCode=cambodia-br-soleproprietorships&service=registerItemSearch"
result_url = "https://www.businessregistration.moc.gov.kh/cambodia-master/viewInstance/update.html?id={}"
base_url = "https://www.businessregistration.moc.gov.kh/cambodia-br-soleproprietorships/viewInstance/update.html?id={}"
def get_names(s):
s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
res = s.get(url)
target_url = result_url.format(res.url.split("id=")[1])
soup = BeautifulSoup(res.text,"lxml")
payload = {i['name']:i.get('value','') for i in soup.select('input[name]')}
payload['QueryString'] = 'a'
payload['SourceAppCode'] = 'cambodia-br-soleproprietorships'
payload['_CBNAME_'] = 'buttonPush'
payload['_CBHTMLFRAG_'] = 'true'
payload['_VIKEY_'] = re.findall(r"viewInstanceKey:'(.*?)',", res.text)[0].strip()
payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
payload['_CBNODE_'] = re.findall(r"Callback\('(.*?)','buttonPush", res.text)[-1]
payload['_CBHTMLFRAGNODEID_'] = re.findall(r"AsyncWrapper(W\d.+?)'",res.text)[0].strip()
res = s.post(target_url,data=payload)
soup = BeautifulSoup(res.text,"lxml")
payload.pop('_CBHTMLFRAGNODEID_')
payload.pop('_CBHTMLFRAG_')
payload.pop('_CBHTMLFRAGID_')
for item in soup.select("a[class*='ItemBox-resultLeft-viewMenu']"):
payload['_CBNAME_'] = 'invokeMenuCb'
payload['_CBVALUE_'] = ''
payload['_CBNODE_'] = item['id'].replace('node','')
res = s.post(target_url,data=payload)
soup = BeautifulSoup(res.text,'lxml')
address_url = base_url.format(res.url.split("id=")[1])
node_id = re.findall(r"taba(.*)_",soup.select_one("a[aria-label='Addresses']")['id'])[0]
payload['_CBNODE_'] = node_id
payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
payload['_CBNAME_'] = 'tabSelect'
payload['_CBVALUE_'] = '1'
eng_name = soup.select_one(".appCompanyName + .appAttrValue").get_text()
yield from get_email(s,eng_name,address_url,payload)
def get_email(s,eng_name,url,payload):
res = s.post(url,data=payload)
soup = BeautifulSoup(res.text,'lxml')
email = soup.select_one(".EntityEmailAddresses:contains('Email') .appAttrValue").get_text()
yield eng_name,email
if __name__ == '__main__':
with requests.Session() as s:
for item in get_names(s):
print(item)
Output are like:
('AMY GEMS', 'amy.n.company#gmail.com')
('AHARATHAN LIN LIANJIN FOOD FLAVOR', 'skykoko344#gmail.com')
('AMETHYST DIAMOND KTV', 'twobrotherktv#gmail.com')

To get the Name (English) you can simply replace print(item.text) with print(item.text.split('/')[1].split('(')[0].strip()) which prints AMY GEMS

Incomprehensible parser behavior

Help me please! I programmed a simple parser, but it does not work correctly, and I do not know what this is connected with.
import requests
from bs4 import BeautifulSoup
URL = 'https://stopgame.ru//topgames'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0', 'accept': '*/*'}
HOST = 'https://stopgame.ru'
def get_html(url, params=None):
r = requests.get(url, headers=HEADERS, params=params)
return r
def get_content(html):
soup = BeautifulSoup(html, 'html.parser')
items = soup.find_all('a', class_="lent-block game-block")
print(items)
def parse():
html = get_html(URL)
if html.status_code == 200:
items = get_content(html.text)
else:
print('Error')
parse()
I've got this output :
[]
Process finished with exit code 0

items = soup.find_all('a', class_="lent-block game-block")
You are trying to find out "lent-block game-block" class for anchor
tag which actually is not there in html and hence you are getting
blank list.
Try with this div item you will get the list of matched items.
items = soup.find_all('div', class_="lent-block lent-main")

Unable to expand more... python

I can scrape all the reviews from the web page.But I am not getting full content.Only half review content i can scrape.I need to scrape the full content.
from bs4 import BeautifulSoup import requests import re
s = requests.Session()
def get_soup(url):
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}
r = s.get(url, headers=headers)
#with open('temp.html', 'wb') as f:
# f.write(r.content)
# webbrowser.open('temp.html')
if r.status_code != 200:
print('status code:', r.status_code)
else:
return BeautifulSoup(r.text, 'html.parser')
def parse(url, response):
if not response:
print('no response:', url)
return
# get number of reviews
# num_reviews = response.find('span', class_='reviews_header_count').text
# num_reviews = num_reviews[1:-1] # remove `( )`
# num_reviews = num_reviews.replace(',', '') # remove `,`
# num_reviews = int(num_reviews)
# print('num_reviews:', num_reviews, type(num_reviews))
num_reviews = (20)
# num_reviews = num_reviews[1:-1] # remove `( )`
# num_reviews = num_reviews.replace(',', '') # remove `,`
# num_reviews = int(num_reviews)
print('num_reviews:', num_reviews, type(num_reviews))
# create template for urls to pages with reviews
url = url.replace('Hilton_New_York_Grand_Central-New_York_City_New_York.html', 'or{}-Hilton_New_York_Grand_Central-New_York_City_New_York.html')
print('template:', url)
# add requests to list
for offset in range(0, num_reviews, 5):
print('url:', url.format(offset))
url_ = url.format(offset)
parse_reviews(url_, get_soup(url_))
#return # for test only - to stop after first page
def parse_reviews(url, response):
print('review:', url)
if not response:
print('no response:', url)
return
for idx, review in enumerate(response.find_all('div', class_='review-container')):
item = {
'hotel_name': response.find('h1', class_='heading_title').text,
'review_title': review.find('span', class_='noQuotes').text,
'review_body': review.find('p', class_='partial_entry').text,
'review_date': review.find('span', class_='relativeDate')['title'],#.text,#[idx],
# 'num_reviews_reviewer': review.find('span', class_='badgetext').text,
'reviewer_name': review.find('span', class_='scrname').text,
'bubble_rating': review.select_one('div.reviewItemInline span.ui_bubble_rating')['class'][1][7:],
}
#~ yield item
results.append(item)
for key,val in item.items():
print(key, ':', val)
print('----')
#return # for test only - to stop after first review
start_urls = [
'https://www.tripadvisor.in/Hotel_Review-g60763-d93339-Reviews-Hilton_New_York_Grand_Central-New_York_City_New_York.html',
#'https://www.tripadvisor.com/Hotel_Review-g60795-d102542-Reviews-Courtyard_Philadelphia_Airport-Philadelphia_Pennsylvania.html',
#'https://www.tripadvisor.com/Hotel_Review-g60795-d122332-Reviews-The_Ritz_Carlton_Philadelphia-Philadelphia_Pennsylvania.html', ]
results = []
for url in start_urls:
parse(url, get_soup(url))
import pandas as pd
df = pd.DataFrame(results) # <--- convert list to DataFrame df.to_csv('output.csv')
I am getting an output sample in csv file from review like:
I went on a family trip and it was amazing, I hope to come back soon. The room was small but what can you expect from New York. It was close to many things and the staff was perfect.I will come back again soon.More...
I just want to expand that more. I need a help..I really have no clue to do it.Please help.
I have written one more code but unable to pull the id from next page.Code is given below
import re
import urllib
#import webbrowser``
s = requests.Session()
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}
for i in range(0,10,5):
url = ("https://www.tripadvisor.in/Hotel_Review-g60763-d93339-Reviews-or{}-Hilton_New_York_Grand_Central-New_York_City_New_York.html").format(i)
print(url)
r = s.get(url,headers=headers)
html = BeautifulSoup(r.text, 'html.parser')
pattern = re.compile(r"UID_(\w+)\-SRC_(\w+)")
id = soup.find("div", id=pattern)["id"]
uid = pattern.match(id).group(2)
print(uid)
url1 ="https://www.tripadvisor.in/ShowUserReviews-g60763-d93339-r"+str(uid)+"-Hilton_New_York_Grand_Central-New_York_City_New_York.html#CHECK_RATES_CONT"
print(url1)
url2 = ('"' + url1 + '"')`enter code here`
print(url2)

The site uses ajax to expand the review content. The full content is not downloaded until the More link is clicked.
One way to access the content would be to figure out the ajax request format and then issue a HTTP request for the same. That might be difficult, perhaps not.
Another, easier, way is by noticing that the review title is a clickable link which loads the full review in a new page. You can therefore scrape the URL for each review and send a similar GET request. Then scrape the data from the response.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

BeautifulSoup loop isn't iterating through other nodes - python

Related

Parsing in python for all pages of the site section

Issue while extracting table data using beautifulsoup in python

Unable to scrape the name from the inner page of each result using requests

Incomprehensible parser behavior

Unable to expand more... python

Categories

Resources