I'm web scraping, and when I run the code the requests keep going even though I've told it to break once it reaches 72. Help?
I've tried adding a print(variable) call and that didn't work either.
# Redeclaring the lists to store data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Preparing the monitoring of the loop
start_time = time.time()
requests = 0

# For every year in the interval 2000-2017
for year_url in years_url:

    # For every page in the interval 1-4
    for page in pages:

        # Make a get request
        response = get('http://www.imdb.com/search/title?release_date=' + year_url +
                       '&sort=num_votes,desc&page=' + page, headers = headers)

        # Pause the loop
        sleep(randint(8,15))

        # Monitor the requests
        requests += 1
        elapsed_time = time.time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)

        # Throw a warning for non-200 status codes
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(requests, response.status_code))

        # Break the loop if the number of requests is greater than expected
        if requests > 72:
            warn('Number of requests was greater than expected.')
            break

        # Parse the content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'html.parser')

        # Select all the 50 movie containers from a single page
        mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')

        # For every movie of these 50
        for container in mv_containers:
            # If the movie has a Metascore, then:
            if container.find('div', class_ = 'ratings-metascore') is not None:

                # Scrape the name
                name = container.h3.a.text
                names.append(name)

                # Scrape the year
                year = container.h3.find('span', class_ = 'lister-item-year').text
                years.append(year)

                # Scrape the IMDB rating
                imdb = float(container.strong.text)
                imdb_ratings.append(imdb)

                # Scrape the Metascore
                m_score = container.find('span', class_ = 'metascore').text
                metascores.append(int(m_score))

                # Scrape the number of votes
                vote = container.find('span', attrs = {'name':'nv'})['data-value']
                votes.append(int(vote))
The requests keep running and won't stop. I'm following this tutorial exactly, and I've been trying to figure out what went wrong for hours.
break only exits the innermost loop, so the outer loop over the years keeps issuing requests. You could set a boolean variable when you break out of the inner loop, and break out of the outer loop when that variable is true. I'm sure there is a more elegant solution, but I can't post my suggestion as a comment.
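For example, a minimal sketch of that flag; the request itself is stubbed out here, and years_url and pages just hold placeholder values standing in for the ones from the question:

from warnings import warn

years_url = [str(year) for year in range(2000, 2018)]   # placeholder: years 2000-2017
pages = [str(page) for page in range(1, 5)]             # placeholder: pages 1-4
requests = 0
stop_scraping = False                                   # flag set once the request budget is used up

for year_url in years_url:
    for page in pages:
        requests += 1                                   # stands in for making the real request
        if requests > 72:
            warn('Number of requests was greater than expected.')
            stop_scraping = True
            break                                       # only exits the inner "for page" loop
    if stop_scraping:
        break                                           # exits the outer "for year_url" loop as well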
I am lost making a loop that goes through all of the pages on this book site. The url ends in 'all?page=' followed by the page number, so I thought it would be easy, but I'm stuck. All the info gathering works fine; I just don't know how to move to the next pages. Any help would be appreciated.
import requests
from bs4 import BeautifulSoup

URL = 'https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page=' + str(page)
page = 1
page += 1

for page in max_pages:
    html = requests.get(URL)
    soup = BeautifulSoup(html.content, "html.parser")
    # ^This part I need help with^

    # results = all books present on page
    # books = each individual book on the page
    results = soup.find(class_='tab search')
    books = results.find_all('div', class_='book-item')

    for book in books:
        title = book.h3.a
        author = book.p.span

        # in case there is no rating on a book
        if len(book.find('div','rating-wrap').findAll('span', 'full-star')) == None:
            pass
        else:
            rating = len(book.find('div','rating-wrap').findAll('span', 'full-star'))

        publish_date = book.find(class_='published')
        format = book.find(class_='format')
        price = book.find('span', class_='sale-price').text.strip()

        # if there is no discount
        if book.find(class_='rrp') == None:
            pass
        else:
            original_price = book.find(class_='rrp').text.strip()

        if book.find(class_='price-save') == None:
            pass
        else:
            discount = book.find(class_='price-save').text.strip()

        # unneeded text removed such as 'US' before the price shown
        price = price.replace('US', '')
        original_price = original_price.replace('US', '')
        discount = discount.replace('Save US', '')

        # .text.strip() gets text and rids of empty spaces
        print(title.text.strip())
        print(author.text.strip())
        print(rating, 'stars')
        print(publish_date.text.strip())
        print(format.text.strip())
        print(price)
        print(original_price)
        print(discount, 'in savings!')
What the code does is loop 5 times in this case, with page going up by one every single time.
max_pages = 5
for page in range(max_pages):
    URL = f"https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page={page}"
    html = requests.get(URL)
    soup = BeautifulSoup(html.content, "html.parser")
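If it helps, here is a rough sketch combining that loop with the scraping part of the question: the URL is built inside the loop each time, and the selectors (tab search, book-item, sale-price) are taken from the question and assumed to still match the site:

import requests
from bs4 import BeautifulSoup

max_pages = 5
for page in range(1, max_pages + 1):      # pages 1 to 5; adjust the range as needed
    # build the URL for the current page inside the loop
    URL = f"https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page={page}"
    html = requests.get(URL)
    soup = BeautifulSoup(html.content, "html.parser")

    results = soup.find(class_='tab search')
    if results is None:                   # layout changed or no results: skip this page
        continue
    for book in results.find_all('div', class_='book-item'):
        title = book.h3.a.text.strip()
        author = book.p.span.text.strip()
        price = book.find('span', class_='sale-price').text.strip().replace('US', '')
        print(title, author, price)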
Here I have some code that scrapes data from a website, and I want to collect that data from every page, so I made a while True: loop. There are 550 pages, but I only want to scrape 10 or 20 of them. How do I add a condition so it pulls only 10, 20, or 100 pages?
import requests
from bs4 import BeautifulSoup
import pandas as pd

re = requests.get("https://katmoviehd.sk/")
soup = BeautifulSoup(re.text, "html.parser")

while True:
    page = soup.find_all('h2')[1:]
    Category = soup.find_all('span', class_ = 'meta-category')

    Category_list = []
    for i in Category:
        Category2 = i.text
        Category_list.append(Category2)

    link_list = []
    for i in page:
        link = (i.find("a")['href'])
        link_list.append(link)

    title_list = []
    for i in page:
        title = (i.find("a")['title'])
        title_list.append(title)

    Table = pd.DataFrame({'Links': link_list, 'Title': title_list, 'Category': Category_list})

    next_page = soup.find('a', class_ = 'next page-numbers').get('href')
    url = next_page
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'lxml')
Add an if and use break:

while True:
    ....
    if time_to_quit:
        break

Or use a variable instead of True:

keep_going = True
while keep_going:
    ...
    keep_going = not am_i_done()  # or whatever fits

Or a page count:

pages = 0
while pages < 20:
    ...
    pages += 1
Well, you could have a simple counter variable outside your loop, and every time you successfully read a page, update the counter by 1.
Then, instead of a while loop with a True condition, you can do something like this:

counter = 0
TOTAL_PAGES = 20  # or 100, whatever you decide
...
while counter < TOTAL_PAGES:
    ...
    counter += 1
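Applied to the loop from the question, a minimal sketch might look like this; the "next page-numbers" link is taken from the question and assumed correct, and the actual collection of links, titles and categories stays as in the question:

import requests
from bs4 import BeautifulSoup

TOTAL_PAGES = 20                          # scrape only the first 20 pages
url = "https://katmoviehd.sk/"
counter = 0

while counter < TOTAL_PAGES:
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    # ... collect the links, titles and categories here, exactly as in the question ...
    next_link = soup.find('a', class_='next page-numbers')
    if next_link is None:                 # no "next" link means we reached the last page
        break
    url = next_link.get('href')
    counter += 1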
I am trying to webscrape knowyourcity.info, which has information on many settlements. This is my current loop:
for u in urllist:
    response = get(u)
    html_soup = BeautifulSoup(response.text, "html.parser")

    headers_containers = html_soup.find('div', class_ = 'settlement-base-status section text-center')
    names = headers_containers.h2.text
    name.append(names)
    year_established = headers_containers.h3.text
    year.append(year_established)

    headers1_containers = html_soup.find('div', class_ = 'col-xs-12 text-center')
    countries = headers1_containers.h4.a.text
    country.append(countries)

    headers2_containers = html_soup.find('div', class_ = 'bold-it', id = "population")
    populations = headers2_containers.text
    population.append(populations)

    headers3_containers = html_soup.find('div', class_ = 'bold-it', id = 'sharedTaps')
    tap = headers3_containers.text
    taps.append(tap)

    headers4_containers = html_soup.find_all('div', class_ = 'bold-it')
    toiletSeat_toPerson = headers4_containers[7].text
    toiletsToPerson.append(toiletSeat_toPerson)
However, for some settlements some of these attributes are not available. How do I add an "if available" check to this loop?
If you want to skip a loop iteration based on a condition, you can use the continue keyword.

for url in urllist:
    if condition:
        continue

This skips the rest of the current iteration if condition is True and continues with the next url in urllist.
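For example, a rough sketch applied to the first section of the question's loop; the selector is taken from the question, and the same None check can be repeated for the other sections:

from requests import get
from bs4 import BeautifulSoup

name, year = [], []                       # result lists, as in the question
urllist = []                              # fill with the settlement urls as before

for u in urllist:
    response = get(u)
    html_soup = BeautifulSoup(response.text, "html.parser")

    headers_containers = html_soup.find('div', class_='settlement-base-status section text-center')
    if headers_containers is None:        # this section is missing for the settlement
        continue                          # skip it and move on to the next url
    name.append(headers_containers.h2.text)
    year.append(headers_containers.h3.text)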
I would like to scrape a website. The website shows 10 complaint previews on each page. I wrote this script to get the links to those 10 complaints and some info from inside each link. When I run the script I get the error "RecursionError: maximum recursion depth exceeded".
Can someone tell me what the problem is? Thank you in advance!!
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

# Create list objects for each information section
C_date = []
C_title = []
C_text = []
U_name = []
U_id = []
C_count = []
R_name = []
R_date = []
R_text = []

# Get 10 links for preview of complaints
def getLinks(url):
    response = get(url)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    c_containers = html_soup.find_all('div', class_='media')

    # Store wanted links in a list
    allLinks = []
    for link in c_containers:
        find_tag = link.find('a')
        find_links = find_tag.get('href')
        full_link = "".join((url, find_links))
        allLinks.append(full_link)

    # Get total number of links
    print(len(allLinks))
    return allLinks

def GetData(Each_Link):
    each_complaint_page = get(Each_Link)
    html_soup = BeautifulSoup(each_complaint_page.text, 'html.parser')

    # Get date of complaint
    dt = html_soup.main.find('span')
    date = dt['title']
    C_date.append(date)

    # Get Title of complaint
    TL = html_soup.main.find('h1', {'class': 'title'})
    Title = TL.text
    C_title.append(Title)

    # Get main text of complaint
    Tx = html_soup.main.find('div', {'class': 'description'})
    Text = Tx.text
    C_text.append(Text)

    # Get user name and id
    Uname = html_soup.main.find('span', {'class': 'user'})
    User_name = Uname.span.text
    User_id = Uname.attrs['data-memberid']
    U_name.append(User_name)
    U_id.append(User_id)

    # Get view count of complaint
    Vcount = html_soup.main.find('span', {'view-count-detail'})
    View_count = Vcount.text
    C_count.append(View_count)

    # Get reply for complaint
    Rpnm = html_soup.main.find('h4', {'name'})
    Reply_name = Rpnm.next
    R_name.append(Reply_name)

    # Get reply date
    Rpdt = html_soup.main.find('span', {'date-tips'})
    Reply_date = Rpdt.attrs['title']
    R_date.append(Reply_date)

    # Get reply text
    Rptx = html_soup.main.find('p', {'comment-content-msg company-comment-msg'})
    Reply_text = Rptx.text
    R_text.append(Reply_text)

link_list = getLinks('https://www.sikayetvar.com/arcelik')
for i in link_list:
    z = GetData(i)
    print(z)
PS: My next step will be to put all information in a data frame
Your GetData() method calls itself, with no base-case: this causes infinite recursion:

def GetData(data):
    for i in GetData(data):

You're also calling response = get(i) but then ignoring the result... perhaps you meant to say

def GetData(link):
    i = get(link)
    ...
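For illustration, a recursive function needs a base case that stops the calls, otherwise Python eventually raises RecursionError. In this scraper, plain iteration over the collected links (which the question's own driver loop already does) means GetData never needs to call itself:

# A recursive function must have a base case that stops the calls.
def count_down(n):
    if n == 0:                            # base case: stop recursing
        return
    count_down(n - 1)                     # each call moves toward the base case

# For the scraper, iterate instead of recursing:
# link_list = getLinks('https://www.sikayetvar.com/arcelik')
# for link in link_list:
#     GetData(link)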
I am trying to parse multiple pages on IMDb. The parser is stuck gathering information from one page. I have tried many forums to solve this, to no avail. I suspect it has something to do with not setting up my nested loop correctly, or with my initial request. Please help. Thanks.
Problem with this script: it loops on one page.
#Basic libraries
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
from random import randint

#More advanced libraries
from time import sleep
from time import time
from IPython.core.display import clear_output
from warnings import warn

base_url = 'http://www.imdb.com/search/title?release_date=2000,2017&sort=num_votes,desc&page='
response = get(base_url)
soup = BeautifulSoup(response.text, 'lxml')

#data lists to append
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

#preparing the monitoring loop
pages = str(range(1,5))
start_time = time()
requests = 0

#for every page in the interval 1-4
for page in pages:
    #make a get request
    response = get(base_url + page)
    #pause the loop
    sleep(randint(8,15))
    #Monitor the requests
    requests += 1
    elapsed_time = time() - start_time
    if requests > 4:
        warn:('Number of requests was greater than expected.')
        break
    elif response.status_code != 200:
        warn('Request: {}; Frequency: {} requests/s'.format(requests, response.status_code))
    else:
        print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)

    page_html = BeautifulSoup(response.text, 'lxml')
    #root
    movie_containers = soup.find_all('div', class_= 'lister-item mode-advanced')
    #looping through containers
    for container in movie_containers:
        if container.find('div', class_ = 'ratings-metascore') is not None:
            #The name
            name = container.h3.a.text
            #print(name)
            names.append(name)
            #The Year
            year = container.find('span', class_ = 'lister-item-year').text
            #print(year)
            years.append(year)
            #IMDb rating
            imdb = container.strong.text
            #print(imdb)
            imdb_ratings.append(imdb)
            #Metascore
            metascore = container.find('span', class_= 'metascore').text
            #print(metascore)
            metascores.append(int(metascore))
            #Number of Votes
            vote = container.find('span', attrs = {'name':'nv'})['data-value']
            #print(vote)
            votes.append(int(vote))

#keeping track of data
test_df = pd.DataFrame({'Movie': names,
                        'Year': years,
                        'IMDb': imdb_ratings,
                        'Metascore': metascores,
                        'Votes': votes})
print(test_df.info())
test_df
Solution 1:
What you could do is, any time you have collected your data on a page and are done, go to the next page by increasing the url's page value by 1:
http://www.imdb.com/search/title?release_date=2000,2017&sort=num_votes,desc&page=2&ref_=adv_nxt
Solution 2: You can get the same behavior by following the next url at the bottom of the page. To find it, you have to scroll down to the bottom of the page.
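A rough sketch of Solution 2 with BeautifulSoup, following the "next" link from page to page instead of clicking it; the class name lister-page-next is an assumption about IMDb's markup at the time and may need adjusting:

from requests import get
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from random import randint
from time import sleep

url = 'http://www.imdb.com/search/title?release_date=2000,2017&sort=num_votes,desc'
for _ in range(4):                                        # follow the "next" link a few times
    response = get(url)
    page_html = BeautifulSoup(response.text, 'lxml')
    # ... scrape the movie containers from page_html here ...
    next_link = page_html.find('a', class_='lister-page-next')   # assumed class name
    if next_link is None:
        break                                             # no further pages
    url = urljoin(url, next_link['href'])
    sleep(randint(2, 3))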
Here is your corrected code, which at the end writes out a test.csv:
#Basic libraries
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
from random import randint

#More advanced libraries
from time import sleep
from time import time
from IPython.core.display import clear_output
from warnings import warn

base_url = 'http://www.imdb.com/search/title?release_date=2000,2017&sort=num_votes,desc&page='
response = get(base_url)
soup = BeautifulSoup(response.text, 'lxml')

#data lists to append
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

#preparing the monitoring loop
start_time = time()
requests = 0

#for every page in the interval 1-4
urls = [base_url + str(x) for x in range(0, 10)]
for url in urls:
    #make a get request
    response = get(url)
    #pause the loop
    sleep(randint(2,3))
    #Monitor the requests
    requests += 1
    elapsed_time = time() - start_time
    if requests > 4:
        warn('Number of requests was greater than expected.')
        break
    elif response.status_code != 200:
        warn('Request: {}; Frequency: {} requests/s'.format(requests, response.status_code))
    else:
        print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)

    page_html = BeautifulSoup(response.text, 'lxml')
    #root: parse the page that was just fetched (page_html), not the initial soup
    movie_containers = page_html.find_all('div', class_= 'lister-item mode-advanced')
    #looping through containers
    for container in movie_containers:
        if container.find('div', class_ = 'ratings-metascore') is not None:
            #The name
            name = container.h3.a.text
            #print(name)
            names.append(name)
            #The Year
            year = container.find('span', class_ = 'lister-item-year').text
            #print(year)
            years.append(year)
            #IMDb rating
            imdb = container.strong.text
            #print(imdb)
            imdb_ratings.append(imdb)
            #Metascore
            metascore = container.find('span', class_= 'metascore').text
            #print(metascore)
            metascores.append(int(metascore))
            #Number of Votes
            vote = container.find('span', attrs = {'name':'nv'})['data-value']
            #print(vote)
            votes.append(int(vote))

#keeping track of data
test_df = pd.DataFrame({'Movie': names,
                        'Year': years,
                        'IMDb': imdb_ratings,
                        'Metascore': metascores,
                        'Votes': votes})
print(test_df.info())
test_df.to_csv("test.csv", sep=",", encoding="utf-8")
I figured it out. You just have to add pages += 1 at the end of the loop and add commas to the end of each data entry:
pages = 1
ranger = range(1,4)
requests = 0

for n in ranger:
    #make a get request
    response = get(base_url + str(pages))
    soup = BeautifulSoup(response.text, 'lxml')
    #pause the loop
    sleep(randint(2,3))
    #Monitor the requests
    requests += 1
    elapsed_time = time() - start_time
    if requests > 4:
        warn('Number of requests was greater than expected.')
        break
    if response.status_code != 200:
        warn('Request: {}; Frequency: {} requests/s'.format(requests, response.status_code))
    else:
        print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)

    #root
    movie_containers = soup.find_all('div', class_= 'lister-item mode-advanced')
    #looping through containers
    for container in movie_containers:
        if container.find('div', class_ = 'ratings-metascore') is not None:
            #The name
            name = container.h3.a.text
            #print(name)
            names.append(name + ',')
            #The Year
            year = container.find('span', class_ = 'lister-item-year').text
            years.append(year + ',')
            #IMDb rating
            imdb = container.strong.text
            #print(imdb)
            imdb_ratings.append(imdb + ',')
            #Metascore
            metascore = container.find('span', class_= 'metascore').text
            metascores.append(metascore + ',')
            #Number of Votes
            vote = container.find('span', attrs = {'name':'nv'})['data-value']
            votes.append(vote + ',')

    pages += 1