Looping through multiple pages while scraping with Python

I am trying to parse multiple pages on IMDb, but the parser is stuck gathering information from one page. I have tried many forums to solve this, to no avail. I suspect it has something to do with not setting up my nested loop correctly, or with my initial request. Please help. Thanks.
Problem with this script: it only ever loops over one page.
#Basic libraries
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
from random import randint
#More advanced libraries
from time import sleep
from time import time
from IPython.core.display import clear_output
from warnings import warn

base_url = 'http://www.imdb.com/search/title?release_date=2000,2017&sort=num_votes,desc&page='
response = get(base_url)
soup = BeautifulSoup(response.text, 'lxml')

#data lists to append
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

#preparing the monitoring loop
pages = str(range(1,5))
start_time = time()
requests = 0

#for every page in the interval 1-4
for page in pages:
    #make a get request
    response = get(base_url + page)
    #pause the loop
    sleep(randint(8,15))
    #Monitor the requests
    requests += 1
    elapsed_time = time() - start_time
    if requests > 4:
        warn('Number of requests was greater than expected.')
        break
    elif response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))
    else:
        print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)
    page_html = BeautifulSoup(response.text, 'lxml')
    #root
    movie_containers = soup.find_all('div', class_ = 'lister-item mode-advanced')
    #looping through containers
    for container in movie_containers:
        if container.find('div', class_ = 'ratings-metascore') is not None:
            #The name
            name = container.h3.a.text
            #print(name)
            names.append(name)
            #The Year
            year = container.find('span', class_ = 'lister-item-year').text
            #print(year)
            years.append(year)
            #IMDb rating
            imdb = container.strong.text
            #print(imdb)
            imdb_ratings.append(imdb)
            #Metascore
            metascore = container.find('span', class_ = 'metascore').text
            #print(metascore)
            metascores.append(int(metascore))
            #Number of Votes
            vote = container.find('span', attrs = {'name':'nv'})['data-value']
            #print(vote)
            votes.append(int(vote))

#keeping track of data
test_df = pd.DataFrame({'Movie': names,
                        'Year': years,
                        'IMDb': imdb_ratings,
                        'Metascore': metascores,
                        'Votes': votes})
print(test_df.info())
test_df

Solution 1:
What you could do is, each time you finish collecting the data on a page, go to the next page by incrementing the page value in the URL by 1:
http://www.imdb.com/search/title?release_date=2000,2017&sort=num_votes,desc&page=2&ref_=adv_nxt
Solution 2: You can get the same behavior by following the "Next" link at the bottom of each page, i.e. extracting its href and requesting it, just as you would by scrolling to the bottom and clicking it.
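A minimal sketch of that approach, assuming the old IMDb list markup where the next-page link carries the class lister-page-next (verify against the live HTML before relying on it):

from urllib.parse import urljoin

from bs4 import BeautifulSoup
from requests import get

url = 'http://www.imdb.com/search/title?release_date=2000,2017&sort=num_votes,desc'
while url:
    soup = BeautifulSoup(get(url).text, 'lxml')
    # ... collect the movie containers from this page, as in the code below ...
    next_link = soup.find('a', class_='lister-page-next')  # assumed class name
    url = urljoin(url, next_link['href']) if next_link else None  # stop when there is no "Next"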
Here is your corrected code, which writes a test.csv at the end:
#Basic libraries
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
from random import randint
#More advanced libraries
from time import sleep
from time import time
from IPython.core.display import clear_output
from warnings import warn

base_url = 'http://www.imdb.com/search/title?release_date=2000,2017&sort=num_votes,desc&page='

#data lists to append
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

#preparing the monitoring loop
start_time = time()
requests = 0

#for every page in the interval 1-4
urls = [base_url + str(x) for x in range(1, 5)]
for url in urls:
    #make a get request
    response = get(url)
    #pause the loop
    sleep(randint(2,3))
    #Monitor the requests
    requests += 1
    elapsed_time = time() - start_time
    if requests > 4:
        warn('Number of requests was greater than expected.')
        break
    elif response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))
    else:
        print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)
    #parse the page that was just fetched, not a soup made before the loop
    page_html = BeautifulSoup(response.text, 'lxml')
    #root
    movie_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')
    #looping through containers
    for container in movie_containers:
        if container.find('div', class_ = 'ratings-metascore') is not None:
            #The name
            name = container.h3.a.text
            #print(name)
            names.append(name)
            #The Year
            year = container.find('span', class_ = 'lister-item-year').text
            #print(year)
            years.append(year)
            #IMDb rating
            imdb = container.strong.text
            #print(imdb)
            imdb_ratings.append(imdb)
            #Metascore
            metascore = container.find('span', class_ = 'metascore').text
            #print(metascore)
            metascores.append(int(metascore))
            #Number of Votes
            vote = container.find('span', attrs = {'name':'nv'})['data-value']
            #print(vote)
            votes.append(int(vote))

#keeping track of data
test_df = pd.DataFrame({'Movie': names,
                        'Year': years,
                        'IMDb': imdb_ratings,
                        'Metascore': metascores,
                        'Votes': votes})
print(test_df.info())
test_df.to_csv("test.csv", sep=",", encoding="utf-8")

I figured it out. You just have to add pages += 1 at the end of the loop and append a comma to the end of each data entry:
pages = 1
ranger = range(1,4)
requests = 0

for n in ranger:
    #make a get request
    response = get(base_url + str(pages))
    soup = BeautifulSoup(response.text, 'lxml')
    #pause the loop
    sleep(randint(2,3))
    #Monitor the requests
    requests += 1
    elapsed_time = time() - start_time
    if requests > 4:
        warn('Number of requests was greater than expected.')
        break
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))
    else:
        print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)
    #root
    movie_containers = soup.find_all('div', class_ = 'lister-item mode-advanced')
    #looping through containers
    for container in movie_containers:
        if container.find('div', class_ = 'ratings-metascore') is not None:
            #The name
            name = container.h3.a.text
            #print(name)
            names.append(name + ',')
            #The Year
            year = container.find('span', class_ = 'lister-item-year').text
            years.append(year + ',')
            #IMDb rating
            imdb = container.strong.text
            #print(imdb)
            imdb_ratings.append(imdb + ',')
            #Metascore
            metascore = container.find('span', class_ = 'metascore').text
            metascores.append(metascore + ',')
            #Number of Votes
            vote = container.find('span', attrs = {'name':'nv'})['data-value']
            votes.append(vote + ',')
    pages += 1
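As an aside, the trailing commas are probably unnecessary if the goal is a CSV file: pandas' to_csv (or the csv module) inserts the field separators itself, so the lists can simply hold the raw values. A minimal sketch under that assumption:

import pandas as pd

# to_csv writes the commas between fields, so no ',' suffix is needed on the values.
test_df = pd.DataFrame({'Movie': names,
                        'Year': years,
                        'IMDb': imdb_ratings,
                        'Metascore': metascores,
                        'Votes': votes})
test_df.to_csv('test.csv', index=False, encoding='utf-8')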

Related

Extract names in a custom <h2>, but they are extracted many times (BeautifulSoup)

I am trying to extract names in a custom <h2>, but the names I want are extracted many times.
How do I fix this problem so each name is extracted only once?
The page I am pulling data from is linked here.
import requests
import csv
from bs4 import BeautifulSoup
from itertools import zip_longest

lawy_name = []
page_num = 1
phone = []
logo = []
website = []
links = []

while True:
    try:
        result = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
        src = result.content
        soup = BeautifulSoup(src, "lxml")
        page_limit = int("126")
        if (page_num > page_limit // 25):
            print("page ended, terminate")
            break
        lawy_names = soup.select('div.poap.serp-container.lawyer h2.indigo_text')
        for i in range(len(lawy_names)):
            lawy_name.append(lawy_names[i].text.strip())
            links.append(lawy_names[i].find("a").attrs["href"])
        for link in links:
            result = requests.get(link)
            src = result.content
            soup = BeautifulSoup(src, "lxml")
            phones = soup.find("a", {"class":"profile-phone-header profile-contact-btn"})
            phone.append(phones["href"])
            logos = soup.find("div", {"class":"photo-container"})
            logo.append(logos.find('img')['src'])
            websites = soup.find("a", {"class":"profile-website-header","id":"firm_website"})
            website.append(websites.text.strip())
        page_num += 1
        print("page switched")
    except:
        print("error")
        break

file_list = [lawy_name, phone, website, logo]
exported = zip_longest(*file_list)
with open("/Users/dsoky/Desktop/fonts/Moaaz.csv", "w") as myfile:
    wr = csv.writer(myfile)
    wr.writerow(["lawyer name","phone","website","logo"])
    wr.writerows(exported)
Problem: the website does serve up a lot of duplicate entries. You can probably assume that all entries have unique names, so a dictionary can be used to hold all of your data; simply skip any entry whose name you have already seen. For example:
from bs4 import BeautifulSoup
import requests
import csv

lawyers = {}
page_num = 1

while True:
    print(f"Page {page_num}")
    req = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
    soup = BeautifulSoup(req.content, "lxml")
    found = False
    for id in ['sponsored_serps', 'ts_results', 'poap_results', 'basic_results']:
        div_results = soup.find('div', id=id)
        if div_results:
            for result in div_results.find_all('div', class_='lawyer'):
                name = result.h2.get_text(strip=True)
                if name not in lawyers:
                    print(' ', name)
                    link = result.h2.a['href']
                    req_details = requests.get(link)
                    soup_details = BeautifulSoup(req_details.content, "lxml")
                    a_phone = soup_details.find("a", {"class":"profile-phone-header profile-contact-btn"}, href=True)
                    if a_phone:
                        phone = a_phone['href']
                    else:
                        phone = None
                    div_logo = soup_details.find("div", {"class":"photo-container"})
                    if div_logo.img:
                        logo = div_logo.img['src']
                    else:
                        logo = None
                    a_website = soup_details.find("a", {"class":"profile-website-header","id":"firm_website"})
                    if a_website:
                        website = a_website.get_text(strip=True)
                    else:
                        website = None
                    lawyers[name] = [phone, logo, website]
                    found = True
    # Keep going until no new names found
    if found:
        page_num += 1
    else:
        break

with open('Moaaz.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['Name', 'Phone', 'Logo', 'Website'])
    for name, details in lawyers.items():
        csv_output.writerow([name, *details])

How to scrape texts from votesmart via BeautifulSoup

I am trying to scrape some statements made by U.S. politicians on votesmart.org.
The code runs, but I get errors when extracting the texts.
The code that I am using is as follows:
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import requests
import os

def main():
    df = pd.read_csv('https://theunitedstates.io/congress-legislators/legislators-current.csv')
    df = df[df.type=='sen']
    df = df[~df.votesmart_id.isna()]
    done_list = os.listdir('corpus')
    print("{} senators".format(len(df)))
    df = df[~df.full_name.isin(done_list)]
    print("{} after some already done".format(len(df)))
    df = df.sample(frac=1)
    df.apply(scrape_politician_speeches, axis=1)

def scrape_politician_speeches(row):
    print('Scraping {}...'.format(row.full_name))
    vs_url = 'https://justfacts.votesmart.org/candidate/public-statements/{}'.format(int(row.votesmart_id))
    vs_page = requests.get(vs_url) # fill in the last part of the url
    soup = BeautifulSoup(vs_page.content, features="lxml")
    n_pages = 1
    page_num = 1
    while page_num <= n_pages:
        print("\tPage {} of {}".format(page_num, n_pages))
        #speeches_url = vs_page.url + '?start=2019-01-01&speechType=14&p={}'.format(page_num)
        speeches_url = vs_page.url + '/?s=date&start=2020/01/01&end=&p={}'.format(page_num)
        speeches_page = requests.get(speeches_url)
        soup = BeautifulSoup(speeches_page.content, features="lxml")
        speech_table = soup.find('table', {'id':'statementsObjectsTables'})
        speech_table = soup.find('tbody')
        speech_links = speech_table.find_all('a', href=True)
        speech_hrefs = [a.get('href') for a in speech_links]
        for href in speech_hrefs:
            scrape_speech(person=row.full_name, speech_url=href)
        try:
            n_pages = int(soup.find('h7').text.split()[-1])
        except:
            print("\tNo page numbers")
            pass
        page_num += 1
        sleep(1)

def scrape_speech(person, speech_url):
    try:
        if not os.path.isdir('corpus/{}'.format(person)):
            os.mkdir('corpus/{}'.format(person))
        speech_page = requests.get(speech_url)
        soup = BeautifulSoup(speech_page.content, features="lxml")
        title = soup.find('h3').text
        date = soup.find('span', {'itemprop':'datePublished'}).text
        location = soup.find('span', {'itemprop':'contentLocation'}).text
        body = soup.find('div', {'class':"main clear"})
        p_list = body.find_all('p')
        text_list = [p.text for p in p_list]
        speech_text = '\n\n'.join(text_list)
        full_text = '{}\n\n\n{}'.format(title, speech_text)
        file_name = '{}, {}, {}.txt'.format(title.split(',')[0], date, location)
        file_name = file_name.replace('/', ' ')
        with open('corpus/{}/{}'.format(person, file_name), 'w') as f:
            f.write(full_text)
    except:
        print("\tError with {}".format(speech_url))

if __name__ == '__main__':
    main()
The errors look like this:
95 senators
95 after some already done
Scraping Tammy Duckworth...
Page 1 of 1
Error with https://votesmart.org/public-statement/1570841/durbin-duckworth-announce-135-million-for-springfield-rail-improvement-project
Error with https://votesmart.org/public-statement/1570825/durbin-duckworth-statement-on-nomination-of-ladon-reynolds-to-serve-as-us-marshal-for-the-northern-district-of-illinois
Error with https://votesmart.org/public-statement/1570826/durbin-duckworth-announce-16-million-in-telehealth-funding-for-illinois-health-care-providers
Thank you so much for your time and attention. I hope to learn more from this wonderful community.
scrape_speech is outdated; the pages' design has probably changed since the script was written. There is no <div class="main clear"> in the HTML, no <span itemprop="datePublished">, and so on. You need to rewrite it using the current CSS selectors.
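Until the selectors are updated, it also helps to guard each lookup instead of wrapping the whole function in a bare except, so you can see exactly which selector broke. A minimal sketch of that pattern in a hypothetical scrape_speech_text helper; the selectors below are the old, now-stale ones and are placeholders to be replaced with current ones:

from bs4 import BeautifulSoup
import requests

def scrape_speech_text(speech_url):
    soup = BeautifulSoup(requests.get(speech_url).content, features="lxml")
    # select_one returns None instead of raising, so a broken selector is
    # reported by name rather than swallowed by a bare except.
    parts = {
        'title': soup.select_one('h3'),                             # stale/placeholder selector
        'date': soup.select_one('span[itemprop="datePublished"]'),  # stale/placeholder selector
        'body': soup.select_one('div.main.clear'),                  # stale/placeholder selector
    }
    missing = [name for name, node in parts.items() if node is None]
    if missing:
        print("\tMissing {} on {}".format(', '.join(missing), speech_url))
        return None
    paragraphs = parts['body'].find_all('p')
    return '{}\n\n\n{}'.format(parts['title'].text,
                               '\n\n'.join(p.text for p in paragraphs))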

Python web scraping empty result

I followed a YouTube tutorial on web scraping to scrape this website, https://books.toscrape.com/, but I'm getting an empty result.
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

all_books = []
url = "http://books.toscrape.com/catalogue/page-1.html"

def get_page(url):
    page = requests.get(url)
    status = page.status_code
    soup = bs(page.text, "lxml")
    return [soup, status]
def get_links(soup):
    links = []
    listings = soup.find_all(class_="product_pod")
    def extract_info(links):
        for listing in listings:
            bk_lnk = listing.find("h5").a.get("href")
            base_url = "http://books.toscrape.com/catalogue"
            cmplt_lnk = base_url + bk_lnk
            links.append(cmplt_lnk)
        return links

def extract_info(links):
    for link in links:
        res = requests.get(link).text
        book_soup = bs(res, "lxml")
        title = book_soup.find(class_ = "col-sm-6 product_main").h1.text.strip()
        price = book_soup.find(class_ = "col-sm-6 product_main").p.text.strip()
        book = {"title": title, "price": price}
        all_books.append(book)
pg = 1
while True:
    url = f"http://books.toscrape.com/catalogue/page-{pg}.html"
    soup_status = get_page(url)
    if soup_status[1] == 200:
        print(f"scraping page {pg}")
        extract_info(get_links(soup_status[0]))
        pg += 1
    else:
        print("The End")
        break

df = pd.DataFrame(all_books)
print(df)
Here's the result I am getting:
Empty DataFrame
Columns: []
Index: []
my colab notebook link
https://colab.research.google.com/drive/1Lyvwt_WLpE9tqy1qheZg80N70CFSsk-E?usp=sharing
def get_links(soup):
    links = []
    listings = soup.find_all(class_="product_pod")
    def extract_links():
        for listing in listings:
            bk_lnk = listing.find("h3").a.get("href")
            base_url = "https://books.toscrape.com/catalogue/"
            cmplt_lnk = base_url + bk_lnk
            links.append(cmplt_lnk)
        return links
    return extract_links()

def extract_info(links):
    for link in links:
        res = requests.get(link).text
        book_soup = bs(res, "lxml")
        title = book_soup.find(class_ = "col-sm-6 product_main").h1.text.strip()
        price = book_soup.find(class_ = "col-sm-6 product_main").p.text.strip()
        book = {"title": title, "price": price}
        all_books.append(book)

pg = 45
while True:
    url = f"https://books.toscrape.com/catalogue/page-{pg}.html"
    soup_status = get_page(url)
    if soup_status[1] == 200:
        print(f"scraping page {pg}")
        extract_info(get_links(soup_status[0]))
        pg += 1
    else:
        print("The End")
        break
Your list is empty because you never call your functions. For example, get_page(url) returns a [soup, status] list, and you can then pass that soup on to your subsequent functions.

Why can't I break from this get requests loop?

I'm web scraping, and when I run the code the requests keep going even though I've told it to break once the count reaches 72. Help?
I've tried adding a print(variable) call, and that didn't help either.
# Redeclaring the lists to store data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Preparing the monitoring of the loop
start_time = time.time()
requests = 0

# For every year in the interval 2000-2017
for year_url in years_url:
    # For every page in the interval 1-4
    for page in pages:
        # Make a get request
        response = get('http://www.imdb.com/search/title?release_date=' + year_url +
                       '&sort=num_votes,desc&page=' + page, headers = headers)
        # Pause the loop
        sleep(randint(8,15))
        # Monitor the requests
        requests += 1
        elapsed_time = time.time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)
        # Throw a warning for non-200 status codes
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(requests, response.status_code))
        # Break the loop if the number of requests is greater than expected
        if requests > 72:
            warn('Number of requests was greater than expected.')
            break
        # Parse the content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'html.parser')
        # Select all the 50 movie containers from a single page
        mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')
        # For every movie of these 50
        for container in mv_containers:
            # If the movie has a Metascore, then:
            if container.find('div', class_ = 'ratings-metascore') is not None:
                # Scrape the name
                name = container.h3.a.text
                names.append(name)
                # Scrape the year
                year = container.h3.find('span', class_ = 'lister-item-year').text
                years.append(year)
                # Scrape the IMDB rating
                imdb = float(container.strong.text)
                imdb_ratings.append(imdb)
                # Scrape the Metascore
                m_score = container.find('span', class_ = 'metascore').text
                metascores.append(int(m_score))
                # Scrape the number of votes
                vote = container.find('span', attrs = {'name':'nv'})['data-value']
                votes.append(int(vote))
The requests keep running and won't stop. I'm doing this exactly as shown in the tutorial, and I've been trying to figure out what went wrong for hours.
You could set a boolean variable when breaking from the inner loop, and then break out of the outer loop when that boolean is true. I'm sure there is a more elegant solution, but I can't post my suggestion as a comment.
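A minimal sketch of that suggestion; the request/parsing body is elided and the loop variables only mirror the snippet above:

from warnings import warn

years_url = [str(year) for year in range(2000, 2018)]  # same ranges as the snippet above
pages = [str(page) for page in range(1, 5)]
request_count = 0
stop = False                      # set when the inner loop breaks

for year_url in years_url:
    for page in pages:
        request_count += 1
        if request_count > 72:
            warn('Number of requests was greater than expected.')
            stop = True
            break                 # this only leaves the inner loop...
        # ... make the request and parse the page here ...
    if stop:
        break                     # ...so propagate it to the outer loop

Alternatively, wrap both loops in a function and use return, which exits both levels at once.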

How to make a request to a new URL?

I already have this code (a friend helped me with it before), and I already get all the links on the site. I want to get the name, merk (brand), price, picture, and description of each product, plus the link to the product. The product's description only appears when we click the product.
I'm a beginner in Python.
from bs4 import BeautifulSoup
import urllib.request

count = 1
url = "https://www.sociolla.com/155-foundation?p=%d"

def get_url(url):
    req = urllib.request.Request(url)
    return urllib.request.urlopen(req)

expected_url = url % count
response = get_url(expected_url)

link = []
name = []
merk = []
price = []
pic = []
description = []

while (response.url == expected_url):
    #print("GET {0}".format(expected_url))
    soup = BeautifulSoup(response.read(), "html.parser")
    products = soup.find("div",{"id":"product-list-grid"})
    for i in products:
        data = products.findAll("div",{"class":"product-item"})
        for j in range(0, len(data)):
            link.append(data[j]["data-eec-href"])
    count += 1
    expected_url = url % count
    response = get_url(expected_url)

print(len(link))

"""
import csv
dataset = zip(link, merk, name, pic, price, description)
with open("foundation_sociolla.csv","w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    header = ['link', 'merk', 'name', 'pic', 'price', 'description']
    writer.writerow(header)
    writer.writerows(dataset)
"""
You need to make a request to the URL. Parse the content of that request and extract the data you want.
from bs4 import BeautifulSoup
import urllib.request

count = 1
url = "https://www.sociolla.com/155-foundation?p=%d"

def get_url(url):
    req = urllib.request.Request(url)
    return urllib.request.urlopen(req)

expected_url = url % count
response = get_url(expected_url)

link = []
name = []
make = []
price = []
pic = []
description = []

while response.url == expected_url:
    soup = BeautifulSoup(response.read(), "html.parser")
    for product in soup.select("div.product-item"):
        product_url = product['data-eec-href']
        link.append(product_url)
        product_response = get_url(product_url)
        product_soup = BeautifulSoup(product_response.read(), "html.parser")
        product_pic = product_soup.select('img#bigpic')[0]['src']
        pic.append(product_pic)
        product_price = product_soup.select('span#our_price_display')[0].text.strip()
        price.append(product_price)
        product_name = product_soup.select('div.detail-product-logo p')[0].text.strip()
        name.append(product_name)
        product_make = product_soup.select('div.detail-product-logo h3')[0].text.strip()
        make.append(product_make)
        product_description = product_soup.select('div#Details article')[0].text.strip()
        description.append(product_description)
        print(product_url, product_pic, product_price, product_name, product_make, product_description)
    count += 1
    expected_url = url % count
    response = get_url(expected_url)
But if you're going to scrape a lot of pages, you are much better off using something like Scrapy: https://scrapy.org/
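For completeness, a rough Scrapy sketch of the same crawl; the CSS selectors are the same assumptions about the site's markup as in the BeautifulSoup version above and are not verified against the live pages:

import scrapy

class FoundationSpider(scrapy.Spider):
    name = "foundation"
    start_urls = ["https://www.sociolla.com/155-foundation?p=1"]

    def parse(self, response):
        # Product links on the listing page (selector assumed, as above).
        product_urls = response.css("div.product-item::attr(data-eec-href)").getall()
        for product_url in product_urls:
            yield response.follow(product_url, callback=self.parse_product)
        # Keep paginating only while the listing still has products.
        if product_urls:
            page = response.meta.get("page", 1) + 1
            yield response.follow("https://www.sociolla.com/155-foundation?p={}".format(page),
                                  callback=self.parse, meta={"page": page})

    def parse_product(self, response):
        yield {
            "link": response.url,
            "pic": response.css("img#bigpic::attr(src)").get(),
            "price": response.css("span#our_price_display::text").get(default="").strip(),
            "name": response.css("div.detail-product-logo p::text").get(default="").strip(),
            "make": response.css("div.detail-product-logo h3::text").get(default="").strip(),
        }

Saved as, say, foundation_spider.py, it can be run with scrapy runspider foundation_spider.py -o foundation.csv, and Scrapy handles the request scheduling, retries, and CSV export for you.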
