Appending items with multiprocessing - Python

In the function get_links, I am fetching the links from URLs, and in the scrape function (test in the code below) I am getting the content of each URL using the text_from_html function (not shown in the code). I want to append the url and visible_text to two lists that hold the URLs and the visible text of every URL. At the moment each list contains only one item and the previous one gets replaced; I want to keep the previous values as well.
I'm getting the output as:
['https://www.scrapinghub.com']
['https://www.goodreads.com/quotes']
I need them in a single list.
def get_links(url):
    visited_list.append(url)
    try:
        source_code = requests.get(url)
    except Exception:
        get_links(fringe.pop(0))
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "lxml")
    for link in soup.findAll(re.compile(r'(li|a)')):
        href = link.get('href')
        if (href is None) or (href in visited_list) or (href in fringe) or (('http://' not in href) and ('https://' not in href)):
            continue
        else:
            subs = href.split('/')[2]
            fstr = repr(fringe)
            if subs in fstr:
                continue
            else:
                if ('blah' in href):
                    if ('www' not in href):
                        href = href.split(":")[0] + ':' + "//" + "www." + href.split(":")[1][2:]
                    fringe.append(href)
                else:
                    fringe.append(href)
    return fringe
def test(url):
    try:
        res = requests.get(url)
        plain_text = res.text
        soup = BeautifulSoup(plain_text, "lxml")
        visible_text = text_from_html(plain_text)
        URL.append(url)
        paragraph.append(visible_text)
    except Exception:
        print("CHECK the URL {}".format(url))

if __name__ == "__main__":
    p = Pool(10)
    p.map(test, fringe)
    p.terminate()
    p.join()
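
For context, multiprocessing.Pool runs test in separate worker processes, so URL.append(...) and paragraph.append(...) only modify each worker's own copy of the lists; the parent process never sees those appends, which is why each list seems to hold a single item. One common fix is to return the data from the worker and collect it from the value Pool.map gives back. A minimal sketch of that pattern (not from the original post; it assumes the text_from_html helper and the fringe list from the question are defined in the same module, and the scrape name here is only illustrative):

from multiprocessing import Pool

import requests

def scrape(url):
    # Runs in a worker process: return the data instead of appending to a
    # module-level list, because each process has its own copy of that list.
    try:
        res = requests.get(url)
        visible_text = text_from_html(res.text)  # helper from the original post
        return url, visible_text
    except Exception:
        print("CHECK the URL {}".format(url))
        return url, None

if __name__ == "__main__":
    p = Pool(10)
    results = p.map(scrape, fringe)  # one (url, text) tuple per input URL
    p.close()
    p.join()
    URL = [u for u, text in results if text is not None]
    paragraph = [text for u, text in results if text is not None]

Both lists are built in the parent process from the mapped results, so nothing is lost between workers.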

Related

What's the best way to grab attributes of a particular td in an HTML table?

Hi everybody. I'm trying to write this function as part of my Python course. It should go to a wiki page, parse the table of Greek philosophers there, and return a list of tuples, each containing the name of a philosopher and a link to his wiki page. Below is what I've got:
def get_philosophers():
    url = "https://en.wikipedia.org/wiki/List_of_ancient_Greek_philosophers"
    philosophers = []
    import requests
    from bs4 import BeautifulSoup
    try:
        response = requests.get(url)
        if not response.status_code == 200:
            return 'Main page error'
        page = BeautifulSoup(response.content, "lxml")
        table = page.find('table', class_='wikitable')
        trs = table.find_all('tr')
        bigname = ()
        for tr in trs:
            tds = tr.find_all('td')
            name = tds[0].find('a').get('title')
            link = "https://wikipedia.org" + tds[0].find('a').get('href')
            bigname = (name, link)
            philosophers.append(bigname)
        return len(philosophers)
    except:
        print('Scraping error')
I've tried the commands in the console and they mostly worked, except for the for loop, which returned an 'index out of range' error on the name = tds[0].find('a').get('title') line; when I earlier ran the same commands not as a loop but just for one of the elements, they worked fine.
UPD: I modified the function:
url="https://en.wikipedia.org/wiki/List_of_ancient_Greek_philosophers"
philosophers = []
import requests
from bs4 import BeautifulSoup
try:
response = requests.get(url)
if not response.status_code == 200:
return 'Main page error'
page = BeautifulSoup(response.content, "lxml")
table = page.find('table',class_='wikitable')
trs = table.find_all('tr')
bigname = ()
for tr in trs[1:]: #skip the thead tr element
try:
tds = tr.find_all('td')
name = tds[0].find('a').get('title')
link = "https://wikipedia.org" + tds[0].find('a').get('href')
bigname = (name, link)
philosophers.append(bigname)
# return philosophers
except:
print('Loop error')
return philosophers
except:
print('Scraping error')
Now it works as intended.
It was the position of the try/except that created the issue: the table's header row contains <th> cells rather than <td>, so tds is empty for that row, tds[0] raises the IndexError, and your outer except then aborts the whole loop. Try:
def get_philosophers():
    url = "https://en.wikipedia.org/wiki/List_of_ancient_Greek_philosophers"
    philosophers = []
    import requests
    from bs4 import BeautifulSoup
    response = requests.get(url)
    if not response.status_code == 200:
        return 'Main page error'
    page = BeautifulSoup(response.content, "lxml")
    table = page.find('table', class_='wikitable')
    trs = table.find_all('tr')
    bigname = ()
    for tr in trs:
        try:
            tds = tr.find_all('td')
            name = tds[0].find('a').get('title')
            link = "https://wikipedia.org" + tds[0].find('a').get('href')
            bigname = (name, link)
            philosophers.append(bigname)
        except:
            pass
    return len(philosophers)
Now call it:
x = get_philosophers()
print(x)
What this does is skip the error-causing tr while iterating.
Or just delete the row that causes the error (the header row) up front:
def get_philosophers():
    url = "https://en.wikipedia.org/wiki/List_of_ancient_Greek_philosophers"
    philosophers = []
    import requests
    from bs4 import BeautifulSoup
    try:
        response = requests.get(url)
        if not response.status_code == 200:
            return 'Main page error'
        page = BeautifulSoup(response.content, "lxml")
        table = page.find('table', class_='wikitable')
        trs = table.find_all('tr')
        bigname = ()
        del trs[0]  # deletion
        for tr in trs:
            tds = tr.find_all('td')
            name = tds[0].find('a').get('title')
            link = "https://wikipedia.org" + tds[0].find('a').get('href')
            bigname = (name, link)
            print(bigname)
            philosophers.append(bigname)
        return len(philosophers)
    except:
        print('Scraping error')
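
As a small alternative sketch (not in the original answers), the header row can be skipped with an explicit guard instead of a bare except or del, which also avoids hiding unrelated errors; this drops into the same loop over trs:

for tr in trs:
    tds = tr.find_all('td')
    if not tds:  # the header row uses <th> cells, so skip it
        continue
    anchor = tds[0].find('a')
    if anchor is None:  # guard against cells without a link
        continue
    philosophers.append((anchor.get('title'), "https://wikipedia.org" + anchor.get('href')))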

Looping through all the pages in Python web scraping error

I am trying to scrape a webpage and loop through all the pages within a link. When I loop through all the pages, the code below gives many duplicates:
lst = []
urls = ['https://www.f150forum.com/f118/2019-adding-adaptive-cruise-454662/','https://www.f150forum.com/f118/adaptive-cruise-control-module-300894/']

for url in urls:
    with requests.Session() as req:
        for item in range(1, 33):
            response = req.get(f"{url}index{item}/")
            soup = BeautifulSoup(response.content, "html.parser")
            threadtitle = soup.find('h1', attrs={"class": "threadtitle"})
            for item in soup.findAll('a', attrs={"class": "bigusername"}):
                lst.append([threadtitle.text])
            for div in soup.find_all('div', class_="ism-true"):
                try:
                    div.find('div', class_="panel alt2").extract()
                except AttributeError:
                    pass
                try:
                    div.find('label').extract()
                except AttributeError:
                    pass
                result = [div.get_text(strip=True, separator=" ")]
                comments.append(result)
The modified code below does not give duplicates, but it skips the last page of each url:
comments = []
for url in urls:
    with requests.Session() as req:
        index = 1
        while True:
            response = req.get(url + "index{}/".format(index))
            index = index + 1
            soup = BeautifulSoup(response.content, "html.parser")
            if 'disabled' in soup.select_one('a#mb_pagenext').attrs['class']:
                break
            posts = soup.find(id="posts")
            threadtitle = soup.find('h1', attrs={"class": "threadtitle"})
            for item in soup.findAll('a', attrs={"class": "bigusername"}):
                lst.append([threadtitle.text])
            for div in soup.find_all('div', class_="ism-true"):
                try:
                    div.find('div', class_="panel alt2").extract()
                except AttributeError:
                    pass  # sometimes there is no 'panel alt2'
                try:
                    div.find('label').extract()
                except AttributeError:
                    pass  # sometimes there is no 'Quote'
                result = [div.get_text(strip=True, separator=" ")]
                comments.append(result)
removing " if 'disabled' in soup.select_one('a#mb_pagenext').attrs['class']: break" this code gives infinite loop. How can I loop through pages without getting duplicates
Just move the if condition to the bottom of the loop, so that all the items on a page are grabbed before the 'disabled' check runs. If you keep it at the top, the loop breaks without capturing the values from the last page.
comments = []
for url in urls:
    with requests.Session() as req:
        index = 1
        while True:
            response = req.get(url + "index{}/".format(index))
            index = index + 1
            soup = BeautifulSoup(response.content, "html.parser")
            posts = soup.find(id="posts")
            threadtitle = soup.find('h1', attrs={"class": "threadtitle"})
            for item in soup.findAll('a', attrs={"class": "bigusername"}):
                lst.append([threadtitle.text])
            for div in soup.find_all('div', class_="ism-true"):
                try:
                    div.find('div', class_="panel alt2").extract()
                except AttributeError:
                    pass  # sometimes there is no 'panel alt2'
                try:
                    div.find('label').extract()
                except AttributeError:
                    pass  # sometimes there is no 'Quote'
                result = [div.get_text(strip=True, separator=" ")]
                comments.append(result)
            if 'disabled' in soup.select_one('a#mb_pagenext').attrs['class']:
                break
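
Moving the check to the bottom effectively turns the loop into a do-while: every page is scraped first, and only then does the loop decide whether there is a next page. If some pages might lack the next link entirely, a slightly more defensive variant of that check (my assumption about the page structure, not part of the original answer) avoids an AttributeError when select_one returns None:

            next_link = soup.select_one('a#mb_pagenext')
            if next_link is None or 'disabled' in next_link.get('class', []):
                break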

Using BeautifulSoup to find links related to a specific keyword

I have to modify this code so the scraping keeps only the links that contain a specific keyword. In my case I'm scraping a newspaper page to find news related to the term 'Brexit'.
I've tried modifying the method parse_links so it only keeps the links (or 'a' tags) that contain 'Brexit' in them, but it doesn't seem to work.
Where should I place the condition?
import requests
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse


class MultiThreadScraper:

    def __init__(self, base_url):
        self.base_url = base_url
        self.root_url = '{}://{}'.format(urlparse(self.base_url).scheme, urlparse(self.base_url).netloc)
        self.pool = ThreadPoolExecutor(max_workers=20)
        self.scraped_pages = set([])
        self.to_crawl = Queue(10)
        self.to_crawl.put(self.base_url)

    def parse_links(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=True)
        for link in links:
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)

    def scrape_info(self, html):
        return

    def post_scrape_callback(self, res):
        result = res.result()
        if result and result.status_code == 200:
            self.parse_links(result.text)
            self.scrape_info(result.text)

    def scrape_page(self, url):
        try:
            res = requests.get(url, timeout=(3, 30))
            return res
        except requests.RequestException:
            return

    def run_scraper(self):
        while True:
            try:
                target_url = self.to_crawl.get(timeout=60)
                if target_url not in self.scraped_pages:
                    print("Scraping URL: {}".format(target_url))
                    self.scraped_pages.add(target_url)
                    job = self.pool.submit(self.scrape_page, target_url)
                    job.add_done_callback(self.post_scrape_callback)
            except Empty:
                return
            except Exception as e:
                print(e)
                continue


if __name__ == '__main__':
    s = MultiThreadScraper("https://elpais.com/")
    s.run_scraper()
You need to import the re module to match on the link text. Try the code below:
import re
links = soup.find_all('a', text=re.compile("Brexit"))
This should return only the links whose text contains 'Brexit'.
You can get the text of the element using the getText() method and check whether the string actually contains "Brexit":
if "Brexit" in link.getText().split():
url = link["href"]
I added a check in this function. See if that does the trick for you:
def parse_links(self, html):
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a', href=True)
    for link in links:
        if 'BREXIT' in link.text.upper():  # <------ new if statement
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)
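
If you prefer to let BeautifulSoup do the filtering, the two ideas can be combined into one parse_links sketch (an illustration, not the original code; note that string= only matches anchors whose entire content is a plain string, so links that wrap other tags are skipped):

import re

def parse_links(self, html):
    soup = BeautifulSoup(html, 'html.parser')
    # Match the link text case-insensitively while also requiring an href.
    for link in soup.find_all('a', href=True, string=re.compile(r'brexit', re.I)):
        url = link['href']
        if url.startswith('/') or url.startswith(self.root_url):
            url = urljoin(self.root_url, url)
            if url not in self.scraped_pages:
                self.to_crawl.put(url)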

Execute loop based on a list - obtain result for each page (subpage)

I am trying to obtain the number of pages for each url in a list of urls. My code works as long as I have only one url, but as soon as I try it with a list of urls I only get the result from one url. I guess the problem is related to my loop; given that I am new to Python and BeautifulSoup, I can't spot the mistake myself.
base_url = 'https://www.holidaycheck.de'
main_page = 'https://www.holidaycheck.de/dh/hotels-tunesien/e10cef63-45d4-3511-92f1-43df5cbd9fe1?p={}'

urls = []

##Change URL into object (soup)
r = requests.get(main_page.format(0))
soup = BeautifulSoup(r.text, "html5lib")

#get max page number
soup = BeautifulSoup(r.text, 'lxml')
data = soup.find_all('a', {'class': 'link'})
res = []
for i in data:
    res.append(i.text)  # writing each value to res list

res_int = []
for i in res:
    try:
        res_int.append(int(i))
    except:
        print("current value is not a number")
last_page = max(res_int)
#print(last_page)

for i in range(1, last_page):
    page = main_page.format(i)
    for link in soup.find_all('div', {'class': 'hotel-reviews-bar'}):
        urls = base_url + link.find('a').get('href') + "/-/p/{}"
        print(urls)
So far everything works: I obtain the max page number and get all the urls from each page. The problem lies (I believe) in the code below:
for url in urls:  # to loop through the list of urls
    r = requests.get(url.format(0))
    soup = BeautifulSoup(r.text, 'lxml')
    daten = soup.find_all('a', {'class': 'link'})
    tes = []
    for z in daten:
        tes.append(z.text)  # writing each value to res list
    print(tes)
    tes_int = []
    for z in tes:
        try:
            tes_int.append(int(z))
        except:
            print("current value is not a number")
    anzahl = max(tes_int)
    print(anzahl)
I am trying to apply the same concept as in the code above to each url from the list urls, but instead of obtaining the max page number for each url I obtain 241 every time, as if I were caught in a loop.
Any thoughts on that? Help is highly appreciated.
You're assigning urls to the last link generated by the loop, so it ends up as a single string rather than a list.
To build a valid list of urls you need to replace the = with append():
urls = []
for i in range(1, last_page):
    page = main_page.format(i)
    r = requests.get(page)                # these 2 rows added
    soup = BeautifulSoup(r.text, 'lxml')  # these 2 rows added
    for link in soup.find_all('div', {'class': 'hotel-reviews-bar'}):
        try:
            urls.append(base_url + link.find('a').get('href') + "/-/p/{}")
        except:
            print('no link available', i)
print(urls)
EDIT: As far as I can see, you have several issues in your code. Along with my initial fix, I'm outlining my understanding of how your code is intended to work:
import requests
from bs4 import BeautifulSoup

base_url = 'https://www.holidaycheck.de'
main_page = 'https://www.holidaycheck.de/dh/hotels-tunesien/e10cef63-45d4-3511-92f1-43df5cbd9fe1?p={}'

##Change URL into object (soup)
r = requests.get(main_page.format(0))
soup = BeautifulSoup(r.text, "html5lib")

#get max page number
soup = BeautifulSoup(r.text, 'lxml')
data = soup.find_all('a', {'class': 'link'})
res = []
for i in data:
    res.append(i.text)  # writing each value to res list

res_int = []
for i in res:
    try:
        res_int.append(int(i))
    except:
        print("current value is not a number")
last_page = max(res_int)
#print(last_page)

urls = []
for i in range(1, last_page):
    page = main_page.format(i)
    r = requests.get(page)                # these 2 rows added
    soup = BeautifulSoup(r.text, 'lxml')  # these 2 rows added
    for link in soup.find_all('div', {'class': 'hotel-reviews-bar'}):
        try:  # also adding try-except for escaping broken/unavailable links
            urls.append(base_url + link.find('a').get('href') + "/-/p/{}")
        except:
            print('no link available', i)

urls = list(set(urls))  # check and drop duplicated in links list

for url in urls:  # to loop through the list of urls
    try:
        r = requests.get(url.format(0))
        print(url.format(0))
        soup = BeautifulSoup(r.text, 'lxml')
        daten = soup.find_all('a', {'class': 'link'})
    except:
        print('broken link')
    tes = []
    for z in daten:
        tes.append(z.text)  # writing each value to res list
    # print(tes)
    tes_int = []
    for z in tes:
        try:
            tes_int.append(int(z))
        except:
            print("current value is not a number")
    try:
        anzahl = max(tes_int)
        print(anzahl)
    except:
        print('maximum cannot be calculated')
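
As a small follow-up (my own simplification, not part of the answer), Python 3.4+ lets max take a default value, which removes the need for the final try/except when a page yields no numeric links:

    anzahl = max(tes_int, default=0)  # 0 when no page numbers were found
    print(anzahl)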

How to scrape a whole website using BeautifulSoup

I am trying to get all the unique urls of the website by calling the all_pages function recursively, but this function is not giving me all the urls of the website.
All I want to do is get all the unique urls of the website using BeautifulSoup. My code looks like this:
base_url = "http://www.readings.com.pk/"
unique_urls=[]
def all_pages(base_url,unique_urls=[]):
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "html.parser")
for link in soup.find_all("a"):
url = link["href"]
absolute_url = urljoin(base_url, url)
if absolute_url not in unique_urls:
if base_url in absolute_url:
unique_urls.append(absolute_url)
print (absolute_url)
all_pages(absolute_url,unique_urls,book_urls)
all_pages(base_url,unique_urls)
Use response.text instead of response.content.
Also, you need to return at some point, and instead of making unique_urls a list, make it a set so the entries are always unique.
Finally, your method is recursive and Python has a maximum recursion depth, so maybe you should do this instead:
base_url = "http://www.readings.com.pk/"
def all_pages(base_url):
response = requests.get(base_url)
unique_urls = {base_url}
visited_urls = set()
while len(unique_urls) > len(visited_urls)
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.find_all("a"):
try:
url = link["href"]
except:
continue
absolute_url = base_url + url
unique_urls.add(absolute_url)
unvisited_url = (unique_urls - visited_urls).pop()
visited_urls.add(unvisited_url)
response = requests.get(unvisited_url)
return unique_urls
all_pages(base_url)
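
One caveat with this version: absolute_url = base_url + url only produces a valid address when every href is a path relative to the site root. The question's code already uses urljoin, and applying it here handles relative paths, absolute paths, and full URLs alike; a sketch of that one line plus a same-site check (my addition, not part of the answer):

from urllib.parse import urljoin, urlparse

# inside the for-link loop, replacing the base_url + url line:
absolute_url = urljoin(base_url, url)
if urlparse(absolute_url).netloc == urlparse(base_url).netloc:  # stay on the same site
    unique_urls.add(absolute_url)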
