I am trying to scrape this website: voxnews.info
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import pandas as pd

web = 'https://voxnews.info'

def main(req, num, web):
    r = req.get(web+"/page/{}/".format(num))
    soup = BeautifulSoup(r.content, 'html.parser')
    goal = [(x.time.text, x.h1.a.get_text(strip=True), x.select_one("span.cat-links").get_text(strip=True), x.p.get_text(strip=True))
            for x in soup.select("div.site-content")]
    return goal

with ThreadPoolExecutor(max_workers=30) as executor:
    with requests.Session() as req:
        fs = [executor.submit(main, req, num, web) for num in range(1, 2)]  # need to scrape all the webpages in the website
        allin = []
        for f in fs:
            allin.extend(f.result())
        df = pd.DataFrame.from_records(
            allin, columns=["Date", "Title", "Category", "Content"])
        print(df)
But the code has two problems:
The first one is that I am not scraping all the pages (I currently put 1 and 2 in the range, but I would need all the pages);
the second is that it does not save the dates correctly.
If you could have a look at the code and tell me how to improve it in order to fix these two issues, it would be awesome.
Some minor changes.
First, it isn't necessary to use requests.Session() for single requests - you aren't trying to save data between requests.
A minor change to how you had your with statement; I don't know if it's more correct, or just how I do it, but you don't need all of the code to run with the executor still open.
I gave you two options for parsing the date, either as it's written on the site, a string in Italian, or as a datetime object.
I didn't see any "p" tag within the articles, so I removed that part. It seems that in order to get the "content" of the articles, you would have to actually navigate to each one and scrape it individually, so I removed that line from the code.
In your original code, you weren't getting every single article on the page, just the first one of each. There is only one "div.site-content" tag per page, but multiple "article" tags. That's what that change is.
And finally, I prefer find over select, but that's just my style choice. This worked for me for the first three pages; I didn't try the entire site. Be careful when you do run this: 78 blocks of 30 requests might get you blocked...
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import datetime

def main(num, web):
    r = requests.get(web+"/page/{}/".format(num))
    soup = BeautifulSoup(r.content, 'html.parser')
    html = soup.find("div", class_="site-content")
    articles = html.find_all("article")

    # Date as a string in Italian
    goal = [(x.time.get_text(), x.h1.a.get_text(strip=True), x.find("span", class_="cat-links").get_text(strip=True)) for x in articles]

    # OR as a datetime object
    goal = [(datetime.datetime.strptime(x.time["datetime"], "%Y-%m-%dT%H:%M:%S%z"), x.h1.a.get_text(strip=True), x.find("span", class_="cat-links").get_text(strip=True)) for x in articles]

    return goal

web = 'https://voxnews.info'

r = requests.get(web)
soup = BeautifulSoup(r.text, "html.parser")
last_page = soup.find_all("a", class_="page-numbers")[1].get_text()
last_int = int(last_page.replace(".", ""))

### BE CAREFUL HERE WITH TESTING, DON'T USE ALL 2,320 PAGES ###
with ThreadPoolExecutor(max_workers=30) as executor:
    fs = [executor.submit(main, num, web) for num in range(1, last_int)]

allin = []
for f in fs:
    allin.extend(f.result())

df = pd.DataFrame.from_records(
    allin, columns=["Date", "Title", "Category"])
print(df)
In order to fetch results from all pages, not just one or ten hardcoded pages, the best solution is to use an infinite while loop and test for something (a button or element) that will cause it to exit.
This solution is better than a hardcoded for loop since the while loop will go through all pages, no matter how many there are, until a certain condition is fulfilled. In our case, this is the presence of a button on the page (the .next CSS selector):
if soup.select_one(".next"):
page_num += 1
else:
break
You can also add a limit on the number of pages; once it is reached, the loop will also stop:
limit = 20  # paginate through 20 pages

if page_num == limit:
    break
from bs4 import BeautifulSoup
import requests, json, lxml

# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
}

data = []
page_num = 1
limit = 20  # page limit

while True:
    html = requests.get(f"https://voxnews.info/page/{page_num}", headers=headers, timeout=30)
    soup = BeautifulSoup(html.text, "lxml")

    print(f"Extracting page: {page_num}")
    print("-" * 10)

    for result in soup.select(".entry-header"):
        title = result.select_one(".entry-title a").text
        category = result.select_one(".entry-meta:nth-child(1)").text.strip()
        date = result.select_one(".entry-date").text

        data.append({
            "title": title,
            "category": category,
            "date": date
        })

    # Condition for exiting the loop when the specified number of pages is reached.
    if page_num == limit:
        break

    if soup.select_one(".next"):
        page_num += 1
    else:
        break

print(json.dumps(data, indent=2, ensure_ascii=False))
Example output:
[
  {
    "title": "Italia invasa dai figli degli immigrati: “Italiani pezzi di merda” – VIDEO",
    "category": "BREAKING NEWS, INVASIONE, MILANO, VIDEO",
    "date": "Novembre 23, 2022"
  },
  {
    "title": "Soumahoro accusato di avere fatto sparire altri 200mila euro – VIDEO",
    "category": "BREAKING NEWS, POLITICA, VIDEO",
    "date": "Novembre 23, 2022"
  },
  {
    "title": "Città invase da immigrati: “Qui comandiamo noi” – VIDEO",
    "category": "BREAKING NEWS, INVASIONE, VENEZIA, VIDEO",
    "date": "Novembre 23, 2022"
  },
  # ...
]
I am interested in scraping PDFs from any of the speakers on this page. How might I go about this: https://www.nas.gov.sg/archivesonline/speeches/search-result?search-type=advanced&speaker=Amy+Khor
The website has changed since previous occasions, and the code used previously, such as this:
import requests
from bs4 import BeautifulSoup

url = 'http://www.nas.gov.sg/archivesonline/speeches/search-result?search-type=advanced&speaker='
search_term = 'Amy+Khor'

data = {
    'keywords': search_term,
    'search-type': 'basic',
    'keywords-type': 'all',
    'page-num': 1
}

soup = BeautifulSoup(requests.post(url, data=data).text, 'lxml')

cnt = 1
while True:
    print()
    print('Page no. {}'.format(cnt))
    print('-' * 80)

    for a in soup.select('a[href$=".pdf"]'):
        print(a['href'])

    if soup.select_one('span.next-10'):
        data['page-num'] += 10
        cnt += 1
        soup = BeautifulSoup(requests.post(url, data=data).text, 'lxml')
    else:
        break
The code above no longer works...
To get all PDF links from the pages, you can use the following example:
import requests
from bs4 import BeautifulSoup

url = "https://www.nas.gov.sg/archivesonline/speeches/search-result"

params = {
    "search-type": "advanced",
    "speaker": "Amy Khor",
    "page-num": "1",
}

for params["page-num"] in range(1, 3):  # <--- increase number of pages here
    soup = BeautifulSoup(
        requests.get(url, params=params).content, "html.parser"
    )

    for a in soup.select('a[href$="pdf"]'):
        print("https:" + a["href"])

    print("-" * 80)
Prints:
https://www.nas.gov.sg/archivesonline/data/pdfdoc/MINDEF_20171123001_2.pdf
https://www.nas.gov.sg/archivesonline/data/pdfdoc/MSE_20151126001.pdf
https://www.nas.gov.sg/archivesonline/data/pdfdoc/MSE_20160229002.pdf
...and so on.
Here's how I'd do it if I were to start from scratch.
Google Search is actually pretty powerful, and I feel like this query gets your pdfs:
"Amy Khor" site:https://www.nas.gov.sg/archivesonline/data/pdfdoc filetype:pdf
Then, I'd use either BeautifulSoup or, even better, something like googlesearch-python to get the results and process them into your desired format.
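As a rough sketch of that second approach (assuming the googlesearch-python package, which exposes a search() generator; the query string and result count here are just illustrative):

# pip install googlesearch-python
from googlesearch import search

# Hypothetical query mirroring the one above; adjust the site/filetype filters to taste.
query = '"Amy Khor" site:https://www.nas.gov.sg/archivesonline/data/pdfdoc filetype:pdf'

# search() yields result URLs; keep num_results modest to stay polite.
for url in search(query, num_results=20):
    if url.lower().endswith(".pdf"):
        print(url)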
Hi, I am a newbie to programming. I spent 4 days trying to learn Python, and I invented some new swear words too.
I was particularly interested in trying some web scraping as an exercise, to learn something new and get some exposure to how it all works.
This is what I came up with; see the code at the end. It works (to a degree).
But what's missing?
This website has pagination on it, in this case 11 pages' worth. How would you go about adding to this script so that Python goes and looks at those other pages too and carries out the same scrape, i.e. scrape page 1, scrape pages 2, 3 ... 11, and post the results to a CSV?
https://www.organicwine.com.au/vegan/?pgnum=1
https://www.organicwine.com.au/vegan/?pgnum=2
https://www.organicwine.com.au/vegan/?pgnum=3
https://www.organicwine.com.au/vegan/?pgnum=4
https://www.organicwine.com.au/vegan/?pgnum=5
https://www.organicwine.com.au/vegan/?pgnum=6
https://www.organicwine.com.au/vegan/?pgnum=7
8, 9, 10, and 11.
On these pages the images are actually thumbnail images, something like 251px by 251px.
How would you go about adding to this script so that, while it's at it, it follows the links to the detailed product page, captures the image link from there, where the images are 1600px by 1600px, and posts those links to the CSV?
https://www.organicwine.com.au/mercer-wines-preservative-free-shiraz-2020
Once we have identified those links, let's also download those larger images to a folder.
CSV writer: I also don't understand line 58,
for i in range(23)
How would I know how many products there were without counting them (e.g. there are 24 products on page one)?
So this is what I want to learn how to do. Not asking for much (he says sarcastically). I could pay someone on Upwork to do it, but where's the fun in that? And that would not teach me how to 'fish'.
Where is a good place to learn Python? A master class on web scraping? It seems to be trial and error, blog posts, and wherever else you can pick up bits of information to piece it all together.
Maybe I need a mentor.
I wish there had been someone I could have reached out to, to tell me what BeautifulSoup was all about. I worked it out by trial and error and mostly guessing; no real understanding of it, but it just works.
Anyway, any help in pulling this all together to produce a decent script would be greatly appreciated.
Hopefully there is someone out there who would not mind helping me.
Apologies to organicwine for using their website as a learning tool. I do not wish to cause any harm or be a nuisance to the site.
Thank you in advance
John
code:
import requests
import csv
from bs4 import BeautifulSoup

URL = "https://www.organicwine.com.au/vegan/?pgnum=1"
response = requests.get(URL)
website_html = response.text
soup = BeautifulSoup(website_html, "html.parser")

product_title = soup.find_all('div', class_="caption")
# print(product_title)

winename = []
for wine in product_title:
    winetext = wine.a.text
    winename.append(winetext)
    print(f'''Wine Name: {winetext}''')
# print(f'''\nWine Name: {winename}\n''')

product_price = soup.find_all('div', class_='wrap-thumb-mob')
# print(product_price.text)

price = []
for wine in product_price:
    wineprice = wine.span.text
    price.append(wineprice)
    print(f'''Wine Price: {wineprice}''')
# print(f'''\nWine Price: {price}\n''')

image = []
product_image_link = soup.find_all('div', class_='thumbnail-image')
# print(product_image_link)
for imagelink in product_image_link:
    wineimagelink = imagelink.a['href']
    image.append(wineimagelink)
    # image.append(imagelink)
    print(f'''Wine Image Link: {wineimagelink}''')
# print(f'''\nWine Image: {image}\n''')

# """ writing data to CSV """
# open OrganicWine2.csv file in "write" mode
# newline stops a blank line appearing in csv
with open('OrganicWine2.csv', 'w', newline='') as file:
    # create a "writer" object
    writer = csv.writer(file, delimiter=',')
    # use "writer" obj to write
    # you should give a "list"
    writer.writerow(["Wine Name", "Wine Price", "Wine Image Link"])
    for i in range(23):
        writer.writerow([
            winename[i],
            price[i],
            image[i],
        ])
In this case, to handle pagination, instead of for i in range(1, 100), which is a hardcoded way of paging, it's better to use a while loop to dynamically paginate through all possible pages.
The while loop runs until moving to the next page is no longer possible; in this case it checks for the presence of the next-page button, for which the CSS selector ".fa-chevron-right" is responsible:
if soup.select_one(".fa-chevron-right"):
params["pgnum"] += 1 # go to the next page
else:
break
To extract the full-size image, an additional request is required; the CSS selector ".main-image a" is responsible for full-size images:
full_image_html = requests.get(link, headers=headers, timeout=30)
image_soup = BeautifulSoup(full_image_html.text, "lxml")

try:
    original_image = f'https://www.organicwine.com.au{image_soup.select_one(".main-image a")["href"]}'
except:
    original_image = None
An additional step to avoid being blocked is to rotate user-agents. Ideally, it would be better to use residential proxies with random user-agent.
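A minimal sketch of user-agent rotation, assuming a small hand-maintained list of desktop user-agent strings (proxies are left out here, and the URL just reuses the page from this question):

import random
import requests

# A few example desktop user-agent strings; in practice, keep a larger, up-to-date list.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
]

# Pick a different user-agent for each request.
headers = {"User-Agent": random.choice(user_agents)}
page = requests.get("https://www.organicwine.com.au/vegan/", params={"pgnum": 1}, headers=headers, timeout=30)
print(page.status_code)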
pandas can be used to save the extracted data in CSV format:
pd.DataFrame(data=data).to_csv("<csv_file_name>.csv", index=False)
For a quick and easy search for CSS selectors, you can use the SelectorGadget Chrome extension (not always work perfectly if the website is rendered via JavaScript).
from bs4 import BeautifulSoup
import requests, json, lxml
import pandas as pd

# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
}

params = {
    'pgnum': 1  # page number, 1 by default
}

data = []

while True:
    page = requests.get(
        "https://www.organicwine.com.au/vegan/?",
        params=params,
        headers=headers,
        timeout=30,
    )
    soup = BeautifulSoup(page.text, "lxml")

    print(f"Extracting page: {params['pgnum']}")

    for products in soup.select(".price-btn-conts"):
        try:
            title = products.select_one(".new-h3").text
        except:
            title = None
        try:
            price = products.select_one(".price").text.strip()
        except:
            price = None
        try:
            snippet = products.select_one(".price-btn-conts p a").text
        except:
            snippet = None
        try:
            link = products.select_one(".new-h3 a")["href"]
        except:
            link = None

        # an additional request is needed to extract the full-size image
        full_image_html = requests.get(link, headers=headers, timeout=30)
        image_soup = BeautifulSoup(full_image_html.text, "lxml")
        try:
            original_image = f'https://www.organicwine.com.au{image_soup.select_one(".main-image a")["href"]}'
        except:
            original_image = None

        data.append(
            {
                "title": title,
                "price": price,
                "snippet": snippet,
                "link": link,
                "original_image": original_image
            }
        )

    if soup.select_one(".fa-chevron-right"):
        params["pgnum"] += 1
    else:
        break

# save to CSV (install and import pandas as pd)
pd.DataFrame(data=data).to_csv("<csv_file_name>.csv", index=False)

print(json.dumps(data, indent=2, ensure_ascii=False))
Example output:
[
  {
    "title": "Yangarra McLaren Vale GSM 2016",
    "price": "$29.78 in a straight 12\nor $34.99 each",
    "snippet": "The Yangarra GSM is a careful blending of Grenache, Shiraz and Mourvèdre in which the composition varies from year to year, conveying the traditional estate blends of the southern Rhône. The backbone of the wine comes fr...",
    "link": "https://www.organicwine.com.au/yangarra-mclaren-vale-gsm-2016",
    "original_image": "https://www.organicwine.com.au/assets/full/YG_GSM_16.png?20211110083637"
  },
  {
    "title": "Yangarra Old Vine Grenache 2020",
    "price": "$37.64 in a straight 12\nor $41.99 each",
    "snippet": "Produced from the fruit of dry grown bush vines planted high up in the Estate's elevated vineyards in deep sandy soils. These venerated vines date from 1946 and produce a wine that is complex, perfumed and elegant with a...",
    "link": "https://www.organicwine.com.au/yangarra-old-vine-grenache-2020",
    "original_image": "https://www.organicwine.com.au/assets/full/YG_GRE_20.jpg?20210710165951"
  },
  #...
]
Create the URL by putting the page number in it, then put the rest of your code into a for loop, and you can use len(winename) to count how many results you have. You should do the writing outside the for loop. Here's your code with those changes:
import requests
import csv
from bs4 import BeautifulSoup

num_pages = 11
result = []

for pgnum in range(num_pages):
    url = f"https://www.organicwine.com.au/vegan/?pgnum={pgnum+1}"
    response = requests.get(url)
    website_html = response.text

    soup = BeautifulSoup(website_html, "html.parser")

    product_title = soup.find_all("div", class_="caption")
    winename = []
    for wine in product_title:
        winetext = wine.a.text
        winename.append(winetext)

    product_price = soup.find_all("div", class_="wrap-thumb-mob")
    price = []
    for wine in product_price:
        wineprice = wine.span.text
        price.append(wineprice)

    image = []
    product_image_link = soup.find_all("div", class_="thumbnail-image")
    for imagelink in product_image_link:
        winelink = imagelink.a["href"]
        response = requests.get(winelink)
        wine_page_soup = BeautifulSoup(response.text, "html.parser")
        main_image = wine_page_soup.find("a", class_="fancybox")
        image.append(main_image['href'])

    for i in range(len(winename)):
        result.append([winename[i], price[i], image[i]])

with open("/tmp/OrganicWine2.csv", "w", newline="") as file:
    writer = csv.writer(file, delimiter=",")
    writer.writerow(["Wine Name", "Wine Price", "Wine Image Link"])
    writer.writerows(result)
And here's how I would rewrite your code to accomplish this task. It's more Pythonic (you should basically never write range(len(something)); there's always a cleaner way), and it doesn't require knowing how many pages of results there are:
import csv
import time

import requests
from bs4 import BeautifulSoup

data = []

# Try opening 100 pages at most, in case the scraping code is broken,
# which can happen because websites change.
for pgnum in range(1, 100):
    url = f"https://www.organicwine.com.au/vegan/?pgnum={pgnum}"
    response = requests.get(url)
    website_html = response.text

    soup = BeautifulSoup(website_html, "html.parser")
    search_results = soup.find_all("div", class_="thumbnail")

    for search_result in search_results:
        name = search_result.find("div", class_="caption").a.text
        price = search_result.find("p", class_="price").span.text

        # link to the product's page
        link = search_result.find("div", class_="thumbnail-image").a["href"]

        # get the full resolution product image
        response = requests.get(link)
        time.sleep(1)  # rate limit
        wine_page_soup = BeautifulSoup(response.text, "html.parser")
        main_image = wine_page_soup.find("a", class_="fancybox")
        image_url = main_image["href"]

        # or you can just "guess" it from the thumbnail's URL
        # thumbnail = search_result.find("div", class_="thumbnail-image").a.img['src']
        # image_url = thumbnail.replace('/thumbL/', '/full/')

        data.append([name, price, link, image_url])

    # if there's no "next page" button or no search results on the current page,
    # stop scraping
    if not soup.find("i", class_="fa-chevron-right") or not search_results:
        break

    # rate limit
    time.sleep(1)

with open("/tmp/OrganicWine3.csv", "w", newline="") as file:
    writer = csv.writer(file, delimiter=",")
    writer.writerow(["Wine Name", "Wine Price", "Wine Link", "Wine Image Link"])
    writer.writerows(data)
I am trying to scrape this website to get the reviews, but I am facing an issue:
The page loads only 50 reviews.
To load more, you have to click "Show More Reviews", and I don't know how to get all the data, as there is no page link; "Show More Reviews" doesn't have a URL to explore either, and the address remains the same.
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

url = "https://www.capterra.com/p/134048/HiMama-Preschool-Child-Care-App/#reviews"

a = []

url = requests.get(url)
html = url.text
soup = BeautifulSoup(html, "html.parser")

table = soup.findAll("div", {"class": "review-comments"})
# print(table)

for x in table:
    a.append(x.text)

df = pd.DataFrame(a)
df.to_csv("review.csv", sep='\t')
I know this is not pretty code, but I am just trying to get the review text first.
Kindly help, as I am a little new to this.
Looking at the website, the "Show more reviews" button makes an ajax call that returns the additional info; all you have to do is find its link and send a GET request to it (which I've done with some simple regex):
import requests
import re
from bs4 import BeautifulSoup

headers = {
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) snap Chromium/74.0.3729.169 Chrome/74.0.3729.169 Safari/537.36"
}

url = "https://www.capterra.com/p/134048/HiMama-Preschool-Child-Care-App/#reviews"

Data = []

# Each page is equivalent to 50 comments:
MaximumCommentPages = 3

with requests.Session() as session:
    info = session.get(url)

    # Get product ID, needed for getting more comments
    productID = re.search(r'"product_id":(\w*)', info.text).group(1)

    # Extract info from main data
    soup = BeautifulSoup(info.content, "html.parser")
    table = soup.findAll("div", {"class": "review-comments"})
    for x in table:
        Data.append(x)

    # Number of pages to get:
    # Get additional data:
    params = {
        "page": "",
        "product_id": productID
    }
    while(MaximumCommentPages > 1):  # number 1 because one of them was the main page data which we already extracted!
        MaximumCommentPages -= 1
        params["page"] = str(MaximumCommentPages)

        additionalInfo = session.get("https://www.capterra.com/gdm_reviews", params=params)
        print(additionalInfo.url)
        # print(additionalInfo.text)

        # Extract info from additional data:
        soup = BeautifulSoup(additionalInfo.content, "html.parser")
        table = soup.findAll("div", {"class": "review-comments"})
        for x in table:
            Data.append(x)

# Extract data the old fashioned way:
counter = 1
with open('review.csv', 'w') as f:
    for one in Data:
        f.write(str(counter))
        f.write(one.text)
        f.write('\n')
        counter += 1
Notice how I'm using a session to preserve cookies for the ajax call.
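As a rough illustration of what the session buys you (a sketch against httpbin.org rather than the Capterra endpoint): cookies set on one response are sent back automatically on later requests made through the same Session object.

import requests

# Without a session, each call starts with an empty cookie jar:
print(requests.get("https://httpbin.org/cookies", timeout=30).json())  # {'cookies': {}}

with requests.Session() as session:
    # The first request sets a cookie on the session...
    session.get("https://httpbin.org/cookies/set/example/123", timeout=30)
    # ...and later requests through the same session send it back automatically,
    # which is what the ajax call above relies on.
    print(session.get("https://httpbin.org/cookies", timeout=30).json())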
Edit 1: You can reload the webpage multiple times and call the ajax again to get even more data.
Edit 2: Save data using your own method.
Edit 3: Changed some stuff; it now gets any number of pages for you and saves to a file with good ol' open().
We are trying to scrape every product for every category on Forever 21's website. Given a product page, we know how to extract the information we need, and given a category, we can extract every product. However, we do not know how to crawl through every product category. Here is our code for a given category and getting every product:
import requests
from bs4 import BeautifulSoup
import json
#import re

params = {"action": "getcategory",
          "br": "f21",
          #"category": re.compile('\S+'),
          "category": "dress",
          "pageno": 1,
          "pagesize": "",
          "sort": "",
          "fsize": "",
          "fcolor": "",
          "fprice": "",
          "fattr": ""}

url = "http://www.forever21.com/Ajax/Ajax_Category.aspx"
js = requests.get(url, params=params).json()
soup = BeautifulSoup(js[u'CategoryHTML'], "html.parser")

i = 0
j = 0
while len(soup.select("div.item_pic a")) != 0:
    for a in soup.select("div.item_pic a"):
        #print a["href"]
        i = i + 1
    params["pageno"] = params["pageno"] + 1
    j = j + 1
    js = requests.get(url, params=params).json()
    soup = BeautifulSoup(js[u'CategoryHTML'], "html.parser")

print i
print j
As you can see in the comments, we tried to use regular expressions for the category but had no success. i and j are just product and page counters. Any suggestions on how to modify/add to this code to get every product category?
You can scrape the category page and get all subcategories from the navigation menu:
import requests
from bs4 import BeautifulSoup
url = "http://www.forever21.com/Product/Category.aspx?br=f21&category=app-main"
response = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36"})
soup = BeautifulSoup(response.content, "html.parser")
menues = [li["class"][0] for li in soup.select("#has_sub .white nav ul > li")]
print(menues)
Prints:
[u'women-new-arrivals', u'want_list', u'dress', u'top_blouses', u'outerwear_coats-and-jackets', u'bottoms', u'intimates_loungewear', u'activewear', u'swimwear_all', u'acc', u'shoes', u'branded-shop-women-clothing', u'sale_women|women', u'women-new-arrivals-clothing-dresses', u'women-new-arrivals-clothing-tops', u'women-new-arrivals-clothing-outerwear', u'women-new-arrivals-clothing-bottoms', u'women-new-arrivals-clothing-intimates-loungewear', u'women-new-arrivals-clothing-swimwear', u'women-new-arrivals-clothing-activewear', u'women-new-arrivals-accessories|women-new-arrivals', u'women-new-arrivals-shoes|women-new-arrivals', u'promo-web-exclusives', u'promo-best-sellers-app', u'backinstock-women', u'promo-shop-by-outfit-women', u'occasion-shop-wedding', u'contemporary-main', u'promo-basics', u'21_items', u'promo-summer-forever', u'promo-coming-soon', u'dress_casual', u'dress_romper', u'dress_maxi', u'dress_midi', u'dress_mini', u'occasion-shop-dress', u'top_blouses-off-shoulder', u'top_blouses-lace-up', u'top_bodysuits-bustiers', u'top_graphic-tops', u'top_blouses-crop-top', u'top_t-shirts', u'sweater', u'top_blouses-sweatshirts-hoodies', u'top_blouses-shirts', u'top_plaids', u'outerwear_bomber-jackets', u'outerwear_blazers', u'outerwear_leather-suede', u'outerwear_jean-jackets', u'outerwear_lightweight', u'outerwear_utility-jackets', u'outerwear_trench-coats', u'outerwear_faux-fur', u'promo-jeans-refresh|bottoms', u'bottoms_pants', u'bottoms_skirt', u'bottoms_shorts', u'bottoms_shorts-active', u'bottoms_leggings', u'bottoms_sweatpants', u'bottom_jeans|', u'intimates_loungewear-bras', u'intimates_loungewear-panties', u'intimates_loungewear-bodysuits-slips', u'intimates_loungewear-seamless', u'intimates_loungewear-accessories', u'intimates_loungewear-sets', u'activewear_top', u'activewear_sports-bra', u'activewear_bottoms', u'activewear_accessories', u'swimwear_tops', u'swimwear_bottoms', u'swimwear_one-piece', u'swimwear_cover-ups', u'acc_features', u'acc_jewelry', u'acc_handbags', u'acc_glasses', u'acc_hat', u'acc_hair', u'acc_legwear', u'acc_scarf-gloves', u'acc_home-and-gift-items', u'shoes_features', u'shoes_boots', u'shoes_high-heels', u'shoes_sandalsflipflops', u'shoes_wedges', u'shoes_flats', u'shoes_oxfords-loafers', u'shoes_sneakers', u'Shoes_slippers', u'branded-shop-new-arrivals-women', u'branded-shop-women-clothing-dresses', u'branded-shop-women-clothing-tops', u'branded-shop-women-clothing-outerwear', u'branded-shop-women-clothing-bottoms', u'branded-shop-women-clothing-intimates', u'branded-shop-women-accessories|branded-shop-women-clothing', u'branded-shop-women-accessories-jewelry|', u'branded-shop-shoes-women|branded-shop-women-clothing', u'branded-shop-sale-women', u'/brandedshop/brandlist.aspx', u'promo-branded-boho-me', u'promo-branded-rare-london', u'promo-branded-selfie-leslie', u'sale-newly-added', u'sale_dresses', u'sale_tops', u'sale_outerwear', u'sale_sweaters', u'sale_bottoms', u'sale_intimates', u'sale_swimwear', u'sale_activewear', u'sale_acc', u'sale_shoes', u'the-outlet', u'sale-under-5', u'sale-under-10', u'sale-under-15']
Note the values of br and category GET parameters. f21 is the "Women" category, app-main is the main page for a category.
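To tie this back to the question's code, here is a sketch of how the scraped slugs could be fed into the Ajax endpoint one category at a time. The endpoint, parameters, and selectors are taken from the question, menues comes from the snippet above, and the site has changed since this was written, so treat it as illustrative only (some entries in menues, such as those containing "|" or a full path, may need filtering first):

import requests
from bs4 import BeautifulSoup

ajax_url = "http://www.forever21.com/Ajax/Ajax_Category.aspx"
product_counts = {}

for category in menues:  # menues as scraped in the snippet above
    params = {"action": "getcategory", "br": "f21", "category": category,
              "pageno": 1, "pagesize": "", "sort": "", "fsize": "",
              "fcolor": "", "fprice": "", "fattr": ""}
    count = 0
    while True:
        js = requests.get(ajax_url, params=params).json()
        page_soup = BeautifulSoup(js["CategoryHTML"], "html.parser")
        links = page_soup.select("div.item_pic a")
        if not links:
            break
        count += len(links)
        params["pageno"] += 1
    product_counts[category] = count

print(product_counts)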
I'm doing a web scrape of a website with 122 different pages and 10 entries per page. The code breaks on random pages, on random entries, each time it is run. I can run the code on a URL one time and it works, while other times it does not.
def get_soup(url):
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    return soup

def from_soup(soup, myCellsList):
    cellsList = soup.find_all('li', {'class': 'product clearfix'})

    for i in range(len(cellsList)):
        ottdDict = {}
        ottdDict['Name'] = cellsList[i].h3.text.strip()
This is only a piece of my code, but this is where the error is occurring. The problem is that when I use this code, the h3 tag does not always appear in each item in cellsList. This results in a NoneType error when the last line of the code is run. However, the h3 tag is always there in the HTML when I inspect the webpage.
(Screenshots comparing cellsList with the page HTML, taken from two consecutive soup requests, are omitted here.)
What could be causing these differences, and how can I avoid this problem? I was able to run the code successfully for a time, and it seems to have suddenly stopped working. The code is able to scrape some pages without problems, but it randomly fails to register the h3 tags on random entries on random pages.
There are slight discrepancies in the HTML for various elements as you progress through the site pages; the best way to get the name is actually to select the outer div and extract the text from the anchor.
This will get all the info from each product and put it into dicts where the keys are 'Tissue', 'Cell', etc., and the values are the related descriptions:
import requests
from bs4 import BeautifulSoup
from time import sleep

def from_soup(url):
    with requests.Session() as s:
        s.headers.update({
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36"})

        # id for the next page anchor.
        id_ = "#layoutcontent_2_middlecontent_0_threecolumncontent_0_content_ctl00_rptCenterColumn_dcpCenterColumn_0_ctl00_0_productRecords_0_bottomPaging_0_liNextPage_0"

        soup = BeautifulSoup(s.get(url).content)

        for li in soup.select("ul.product-list li.product.clearfix"):
            name = li.select_one("div.product-header.clearfix a").text.strip()
            d = {"name": name}
            for div in li.select("div.search-item"):
                k = div.strong.text
                d[k.rstrip(":")] = " ".join(div.text.replace(k, "", 1).split())
            yield d

        # get the anchor for the next page and loop until it is no longer there.
        nxt = soup.select_one(id_)

        # loop until there is no more next page.
        while nxt:
            # sleep between requests
            sleep(.5)
            resp = s.get(nxt.a["href"])
            soup = BeautifulSoup(resp.content)
            for li in soup.select("ul.product-list li.product.clearfix"):
                name = li.select_one("div.product-header.clearfix a").text.strip()
                d = {"name": name}
                for div in li.select("div.search-item"):
                    k = div.strong.text
                    d[k.rstrip(":")] = " ".join(div.text.replace(k, "", 1).split())
                yield d
            # refresh the next-page anchor so the loop can terminate.
            nxt = soup.select_one(id_)
After running:
for ind, h in enumerate(from_soup(
        "https://www.lgcstandards-atcc.org/Products/Cells_and_Microorganisms/Cell_Lines/Human/Alphanumeric.aspx?geo_country=gb")):
    print(ind, h)
You will see 1211 dicts with all the data.
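If you want to persist those dicts, one option (a sketch, assuming pandas is acceptable here as in the other answers; the output filename is made up) is to collect the generator into a DataFrame and write it to CSV:

import pandas as pd

url = ("https://www.lgcstandards-atcc.org/Products/Cells_and_Microorganisms/"
       "Cell_Lines/Human/Alphanumeric.aspx?geo_country=gb")

# from_soup() is the generator defined above; pandas fills any missing keys with NaN.
df = pd.DataFrame(list(from_soup(url)))
df.to_csv("atcc_cell_lines.csv", index=False)
print(df.shape)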