Looping through all the pages in Python web scraping error - python

I am trying to scrape a web page and loop through all the pages within a link. When I loop through all the pages, the code below gives many duplicates:
import requests
from bs4 import BeautifulSoup

lst = []
comments = []
urls = ['https://www.f150forum.com/f118/2019-adding-adaptive-cruise-454662/','https://www.f150forum.com/f118/adaptive-cruise-control-module-300894/']

for url in urls:
    with requests.Session() as req:
        for item in range(1, 33):
            response = req.get(f"{url}index{item}/")
            soup = BeautifulSoup(response.content, "html.parser")
            threadtitle = soup.find('h1', attrs={"class": "threadtitle"})
            for item in soup.findAll('a', attrs={"class": "bigusername"}):
                lst.append([threadtitle.text])
            for div in soup.find_all('div', class_="ism-true"):
                try:
                    div.find('div', class_="panel alt2").extract()
                except AttributeError:
                    pass
                try:
                    div.find('label').extract()
                except AttributeError:
                    pass
                result = [div.get_text(strip=True, separator=" ")]
                comments.append(result)
Modifying the code as shown below does not give duplicates, but it skips the last page of each url:
comments = []
for url in urls:
    with requests.Session() as req:
        index = 1
        while True:
            response = req.get(url + "index{}/".format(index))
            index = index + 1
            soup = BeautifulSoup(response.content, "html.parser")
            if 'disabled' in soup.select_one('a#mb_pagenext').attrs['class']:
                break
            posts = soup.find(id="posts")
            threadtitle = soup.find('h1', attrs={"class": "threadtitle"})
            for item in soup.findAll('a', attrs={"class": "bigusername"}):
                lst.append([threadtitle.text])
            for div in soup.find_all('div', class_="ism-true"):
                try:
                    div.find('div', class_="panel alt2").extract()
                except AttributeError:
                    pass  # sometimes there is no 'panel alt2'
                try:
                    div.find('label').extract()
                except AttributeError:
                    pass  # sometimes there is no 'Quote'
                result = [div.get_text(strip=True, separator=" ")]
                comments.append(result)
removing " if 'disabled' in soup.select_one('a#mb_pagenext').attrs['class']: break" this code gives infinite loop. How can I loop through pages without getting duplicates

Just move the if condition to the bottom of the loop, so that the check for the disabled next-page link only happens after all the items on the current page have been grabbed. If you put it at the top, the loop breaks before capturing the values from the last page.
comments = []
for url in urls:
    with requests.Session() as req:
        index = 1
        while True:
            response = req.get(url + "index{}/".format(index))
            index = index + 1
            soup = BeautifulSoup(response.content, "html.parser")
            posts = soup.find(id="posts")
            threadtitle = soup.find('h1', attrs={"class": "threadtitle"})
            for item in soup.findAll('a', attrs={"class": "bigusername"}):
                lst.append([threadtitle.text])
            for div in soup.find_all('div', class_="ism-true"):
                try:
                    div.find('div', class_="panel alt2").extract()
                except AttributeError:
                    pass  # sometimes there is no 'panel alt2'
                try:
                    div.find('label').extract()
                except AttributeError:
                    pass  # sometimes there is no 'Quote'
                result = [div.get_text(strip=True, separator=" ")]
                comments.append(result)
            if 'disabled' in soup.select_one('a#mb_pagenext').attrs['class']:
                break
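As a side note, if some page happens not to contain the pager link at all, soup.select_one('a#mb_pagenext') returns None and the .attrs lookup raises an AttributeError. A slightly more defensive variant of the same bottom-of-loop check (just a sketch, using the same selector as above) would be:

next_link = soup.select_one('a#mb_pagenext')
if next_link is None or 'disabled' in next_link.get('class', []):
    break  # no pager link, or the next-page button is disabled: this was the last page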

Related

Webscraping when some attributes aren't available for all pages

I am trying to scrape a website called knowyourcity.info, which has information on many settlements. This is my current loop:
for u in urllist:
    response = get(u)
    html_soup = BeautifulSoup(response.text, "html.parser")
    headers_containers = html_soup.find('div', class_='settlement-base-status section text-center')
    names = headers_containers.h2.text
    name.append(names)
    year_established = headers_containers.h3.text
    year.append(year_established)
    headers1_containers = html_soup.find('div', class_='col-xs-12 text-center')
    countries = headers1_containers.h4.a.text
    country.append(countries)
    headers2_containers = html_soup.find('div', class_='bold-it', id="population")
    populations = headers2_containers.text
    population.append(populations)
    headers3_containers = html_soup.find('div', class_='bold-it', id='sharedTaps')
    tap = headers3_containers.text
    taps.append(tap)
    headers4_containers = html_soup.find_all('div', class_='bold-it')
    toiletSeat_toPerson = headers4_containers[7].text
    toiletsToPerson.append(toiletSeat_toPerson)
However, for some settlements, some attributes are not available. How do I add an "if true" check to this loop?
If you want to skip a loop cycle based on a condition, you can use the continue keyword.

for url in urllist:
    if condition:
        continue

This skips the rest of the current loop cycle if condition is True and starts with the next url in urllist.
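Applied to the loop above, a minimal sketch of that idea (assuming the same lists and imports as in the question) would check whether a container was actually found before reading from it, and skip the settlement otherwise:

for u in urllist:
    response = get(u)
    html_soup = BeautifulSoup(response.text, "html.parser")
    headers_containers = html_soup.find('div', class_='settlement-base-status section text-center')
    if headers_containers is None:
        continue  # this settlement page lacks the attribute, move on to the next url
    name.append(headers_containers.h2.text)
    year.append(headers_containers.h3.text)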

What's the best way to grab attributes of a particular td in html table?

Hi everybody.
So, I'm trying to write this function as part of my Python course. What it should do is go to a wiki page, parse the table of Greek philosophers there, and return a list of tuples, each containing the name of a philosopher and a link to his wiki page. Below is what I've got:
def get_philosophers():
    url = "https://en.wikipedia.org/wiki/List_of_ancient_Greek_philosophers"
    philosophers = []
    import requests
    from bs4 import BeautifulSoup
    try:
        response = requests.get(url)
        if not response.status_code == 200:
            return 'Main page error'
        page = BeautifulSoup(response.content, "lxml")
        table = page.find('table', class_='wikitable')
        trs = table.find_all('tr')
        bigname = ()
        for tr in trs:
            tds = tr.find_all('td')
            name = tds[0].find('a').get('title')
            link = "https://wikipedia.org" + tds[0].find('a').get('href')
            bigname = (name, link)
            philosophers.append(bigname)
        return len(philosophers)
    except:
        print('Scraping error')
I've tried the commands via the console and they mostly worked, except for the for loop, which raised an 'index out of range' error on the name = tds[0].find('a').get('title') line. When I ran the same commands earlier outside the loop, for just one of the elements, they worked fine.
UPD: modified the function:
def get_philosophers():
    url = "https://en.wikipedia.org/wiki/List_of_ancient_Greek_philosophers"
    philosophers = []
    import requests
    from bs4 import BeautifulSoup
    try:
        response = requests.get(url)
        if not response.status_code == 200:
            return 'Main page error'
        page = BeautifulSoup(response.content, "lxml")
        table = page.find('table', class_='wikitable')
        trs = table.find_all('tr')
        bigname = ()
        for tr in trs[1:]:  # skip the thead tr element
            try:
                tds = tr.find_all('td')
                name = tds[0].find('a').get('title')
                link = "https://wikipedia.org" + tds[0].find('a').get('href')
                bigname = (name, link)
                philosophers.append(bigname)
                # return philosophers
            except:
                print('Loop error')
        return philosophers
    except:
        print('Scraping error')
works as intended.
It was the position of the try/except that created the issue. Try:
def get_philosophers():
    url = "https://en.wikipedia.org/wiki/List_of_ancient_Greek_philosophers"
    philosophers = []
    import requests
    from bs4 import BeautifulSoup
    response = requests.get(url)
    if not response.status_code == 200:
        return 'Main page error'
    page = BeautifulSoup(response.content, "lxml")
    table = page.find('table', class_='wikitable')
    trs = table.find_all('tr')
    bigname = ()
    for tr in trs:
        try:
            tds = tr.find_all('td')
            name = tds[0].find('a').get('title')
            link = "https://wikipedia.org" + tds[0].find('a').get('href')
            bigname = (name, link)
            philosophers.append(bigname)
        except:
            pass
    return len(philosophers)
Now call it:
x = get_philosophers()
print(x)
What this does is skip the error-causing tr while iterating.
Or just delete the row that causes the error in the first place:
def get_philosophers():
    url = "https://en.wikipedia.org/wiki/List_of_ancient_Greek_philosophers"
    philosophers = []
    import requests
    from bs4 import BeautifulSoup
    try:
        response = requests.get(url)
        if not response.status_code == 200:
            return 'Main page error'
        page = BeautifulSoup(response.content, "lxml")
        table = page.find('table', class_='wikitable')
        trs = table.find_all('tr')
        bigname = ()
        del trs[0]  # deletion
        for tr in trs:
            tds = tr.find_all('td')
            name = tds[0].find('a').get('title')
            link = "https://wikipedia.org" + tds[0].find('a').get('href')
            bigname = (name, link)
            print(bigname)
            philosophers.append(bigname)
        return len(philosophers)
    except:
        print('Scraping error')
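For context, the IndexError in the original loop comes from the header row of the wikitable: it contains <th> cells rather than <td> cells, so tr.find_all('td') returns an empty list and tds[0] fails. An equivalent sketch that makes this explicit, instead of relying on try/except or deleting the first row, would be:

for tr in trs:
    tds = tr.find_all('td')
    if not tds:
        continue  # header rows only contain <th> cells, so skip them
    name = tds[0].find('a').get('title')
    link = "https://wikipedia.org" + tds[0].find('a').get('href')
    philosophers.append((name, link))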

Appending items Multiprocessing

In the function get_links, I am fetching the links of URLs. And in the scrape function, I am getting the content of each URL using the text_from_html function (not shown in the code). I want to append the url and visible_text into two lists containing the urls and the visible_text of each url. Here each list contains only one item and the previous one gets replaced. I want to keep the previous values as well.
I'm getting the output as:
['https://www.scrapinghub.com']
['https://www.goodreads.com/quotes']
I need them in a single list.
def get_links(url):
    visited_list.append(url)
    try:
        source_code = requests.get(url)
    except Exception:
        get_links(fringe.pop(0))
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "lxml")
    for link in soup.findAll(re.compile(r'(li|a)')):
        href = link.get('href')
        if (href is None) or (href in visited_list) or (href in fringe) or (('http://' not in href) and ('https://' not in href)):
            continue
        else:
            subs = href.split('/')[2]
            fstr = repr(fringe)
            if subs in fstr:
                continue
            else:
                if ('blah' in href):
                    if ('www' not in href):
                        href = href.split(":")[0] + ':' + "//" + "www." + href.split(":")[1][2:]
                    fringe.append(href)
                else:
                    fringe.append(href)
    return fringe

def test(url):
    try:
        res = requests.get(url)
        plain_text = res.text
        soup = BeautifulSoup(plain_text, "lxml")
        visible_text = text_from_html(plain_text)
        URL.append(url)
        paragraph.append(visible_text)
    except Exception:
        print("CHECK the URL {}".format(url))

if __name__ == "__main__":
    p = Pool(10)
    p.map(test, fringe)
    p.terminate()
    p.join()
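Since Pool workers run in separate processes, the appends made inside test go to copies of URL and paragraph in each worker, not to the lists in the parent process, which is why only single-item lists show up. A minimal sketch of one way around this (keeping text_from_html and fringe as defined in the question) is to return the pair from each worker and collect the results of p.map in the parent:

def test(url):
    try:
        res = requests.get(url)
        visible_text = text_from_html(res.text)  # text_from_html as in the question
        return (url, visible_text)               # return instead of appending to globals
    except Exception:
        print("CHECK the URL {}".format(url))
        return None

if __name__ == "__main__":
    p = Pool(10)
    results = p.map(test, fringe)  # results are sent back to the parent process
    p.terminate()
    p.join()
    pairs = [r for r in results if r is not None]
    URL = [u for u, _ in pairs]
    paragraph = [t for _, t in pairs]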

Pagination with BeautifulSoup

I am trying to get some data from the following website. https://www.drugbank.ca/drugs
For every drug in the table, I need to go into its page and get the name and some other specific features, like categories and structured indication (please click on a drug name to see the features I will use).
I wrote the following code, but the issue is that I can't make my code handle pagination (as you can see, there are more than 2000 pages!).
import requests
from bs4 import BeautifulSoup

def drug_data():
    url = 'https://www.drugbank.ca/drugs/'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    for link in soup.select('name-head a'):
        href = 'https://www.drugbank.ca/drugs/' + link.get('href')
        pages_data(href)

def pages_data(item_url):
    r = requests.get(item_url)
    soup = BeautifulSoup(r.text, "lxml")
    g_data = soup.select('div.content-container')
    for item in g_data:
        print item.contents[1].text
        print item.contents[3].findAll('td')[1].text
        try:
            print item.contents[5].findAll('td', {'class': 'col-md-2 col-sm-4'})[0].text
        except:
            pass
        print item_url

drug_data()
How can I scrape all of the data and handle pagination properly?
This page uses almost the same url for all pages, so you can use a for loop to generate them:
def drug_data(page_number):
    url = 'https://www.drugbank.ca/drugs/?page=' + str(page_number)
    #... rest ...

# --- later ---

for x in range(1, 2001):
    drug_data(x)
Or use while and try/except to get more than 2000 pages:
def drug_data(page_number):
    url = 'https://www.drugbank.ca/drugs/?page=' + str(page_number)
    #... rest ...

# --- later ---

page = 0
while True:
    try:
        page += 1
        drug_data(page)
    except Exception as ex:
        print(ex)
        print("probably last page:", page)
        break  # exit `while` loop
You can also find the url of the next page in the HTML:
<a rel="next" class="page-link" href="/drugs?approved=1&c=name&d=up&page=2">›</a>
so you can use BeautifulSoup to get this link and follow it.
The code below displays the current url, finds the link to the next page (using class="page-link" rel="next") and loads it:
import requests
from bs4 import BeautifulSoup

def drug_data():
    url = 'https://www.drugbank.ca/drugs/'
    while url:
        print(url)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "lxml")

        #data = soup.select('name-head a')
        #for link in data:
        #    href = 'https://www.drugbank.ca/drugs/' + link.get('href')
        #    pages_data(href)

        # next page url
        url = soup.findAll('a', {'class': 'page-link', 'rel': 'next'})
        print(url)
        if url:
            url = 'https://www.drugbank.ca' + url[0].get('href')
        else:
            break

drug_data()
BTW: never use a bare except: pass, because you can get an error you didn't expect and you will not know why it doesn't work. Better to display the error:
except Exception as ex:
    print('Error:', ex)

Execute loop based on a list - obtain result for each page (subpage)

I am trying to obtain the number of pages for each url from a list of urls. My code works as long as I have only one url, but as soon as I try it with a list of urls, I only get the result from one url. I guess the problem is related to my loop. Given that I am new to Python and BeautifulSoup, I don't manage to spot the mistake myself.
base_url = 'https://www.holidaycheck.de'
main_page = 'https://www.holidaycheck.de/dh/hotels-tunesien/e10cef63-45d4-3511-92f1-43df5cbd9fe1?p={}'

urls = []

##Change URL into object (soup)
r = requests.get(main_page.format(0))
soup = BeautifulSoup(r.text, "html5lib")

#get max page number
soup = BeautifulSoup(r.text, 'lxml')
data = soup.find_all('a', {'class': 'link'})
res = []
for i in data:
    res.append(i.text)  #writing each value to res list
res_int = []
for i in res:
    try:
        res_int.append(int(i))
    except:
        print("current value is not a number")
last_page = max(res_int)
#print(last_page)

for i in range(1, last_page):
    page = main_page.format(i)
    for link in soup.find_all('div', {'class': 'hotel-reviews-bar'}):
        urls = base_url + link.find('a').get('href') + "/-/p/{}"
        print(urls)
So far everything works: I obtain the max page number and get all the urls from each page. The problem lies in the code below (I believe):
for url in urls:  #to loop through the list of urls
    r = requests.get(url.format(0))
    soup = BeautifulSoup(r.text, 'lxml')
    daten = soup.find_all('a', {'class': 'link'})
    tes = []
    for z in daten:
        tes.append(z.text)  #writing each value to res list
    print(tes)
    tes_int = []
    for z in tes:
        try:
            tes_int.append(int(z))
        except:
            print("current value is not a number")
    anzahl = max(tes_int)
    print(anzahl)
I am trying to apply the same concept as in the code above for each url from the list urls, but instead of obtaining the max page number for each url I obtain 241 every time, as if I were caught in a loop...
Any thoughts on that? Help is highly appreciated.
You're assigning urls to the last link generated by the loop.
To build a valid list of urls, you need to replace the = assignment with append():
urls = []
for i in range(1, last_page):
    page = main_page.format(i)
    r = requests.get(page)                #these 2 rows added
    soup = BeautifulSoup(r.text, 'lxml')  #these 2 rows added
    for link in soup.find_all('div', {'class': 'hotel-reviews-bar'}):
        try:
            urls.append(base_url + link.find('a').get('href') + "/-/p/{}")
        except:
            print('no link available', i)
print(urls)
EDIT: okay, as far as I can see you have several issues in your code. Along with my initial fix, I'm outlining my vision and understanding of how your code is meant to work:
import requests
from bs4 import BeautifulSoup

base_url = 'https://www.holidaycheck.de'
main_page = 'https://www.holidaycheck.de/dh/hotels-tunesien/e10cef63-45d4-3511-92f1-43df5cbd9fe1?p={}'

##Change URL into object (soup)
r = requests.get(main_page.format(0))
soup = BeautifulSoup(r.text, "html5lib")

#get max page number
soup = BeautifulSoup(r.text, 'lxml')
data = soup.find_all('a', {'class': 'link'})
res = []
for i in data:
    res.append(i.text)  #writing each value to res list
res_int = []
for i in res:
    try:
        res_int.append(int(i))
    except:
        print("current value is not a number")
last_page = max(res_int)
#print(last_page)

urls = []
for i in range(1, last_page):
    page = main_page.format(i)
    r = requests.get(page)                #these 2 rows added
    soup = BeautifulSoup(r.text, 'lxml')  #these 2 rows added
    for link in soup.find_all('div', {'class': 'hotel-reviews-bar'}):
        try:  #also adding try-except for escaping broken/unavailable links
            urls.append(base_url + link.find('a').get('href') + "/-/p/{}")
        except:
            print('no link available', i)

urls = list(set(urls))  #check and drop duplicates in links list

for url in urls:  #to loop through the list of urls
    try:
        r = requests.get(url.format(0))
        print(url.format(0))
        soup = BeautifulSoup(r.text, 'lxml')
        daten = soup.find_all('a', {'class': 'link'})
    except:
        print('broken link')
    tes = []
    for z in daten:
        tes.append(z.text)  #writing each value to res list
    # print(tes)
    tes_int = []
    for z in tes:
        try:
            tes_int.append(int(z))
        except:
            print("current value is not a number")
    try:
        anzahl = max(tes_int)
        print(anzahl)
    except:
        print('maximum cannot be calculated')
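One caveat with the last loop (a small adjustment, not part of the original answer): if requests.get fails, the except branch only prints 'broken link' and execution falls through, so daten is either undefined on the first iteration or still holds the links from the previous url. A safer sketch is to skip that url entirely:

for url in urls:
    try:
        r = requests.get(url.format(0))
        soup = BeautifulSoup(r.text, 'lxml')
        daten = soup.find_all('a', {'class': 'link'})
    except:
        print('broken link', url)
        continue  # do not reuse daten from the previous url
    tes_int = []
    for z in daten:
        try:
            tes_int.append(int(z.text))
        except:
            pass  # not a page number
    if tes_int:
        print(max(tes_int))
    else:
        print('maximum cannot be calculated')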
