I have to modify this code so that the scraper keeps only the links that contain a specific keyword. In my case I'm scraping a newspaper page to find news related to the term 'Brexit'.
I've tried modifying the parse_links method so it only keeps the links (the 'a' tags) that contain 'Brexit', but it doesn't seem to work.
Where should I place the condition?
import requests
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse


class MultiThreadScraper:

    def __init__(self, base_url):
        self.base_url = base_url
        self.root_url = '{}://{}'.format(urlparse(self.base_url).scheme, urlparse(self.base_url).netloc)
        self.pool = ThreadPoolExecutor(max_workers=20)
        self.scraped_pages = set([])
        self.to_crawl = Queue(10)
        self.to_crawl.put(self.base_url)

    def parse_links(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=True)
        for link in links:
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)

    def scrape_info(self, html):
        return

    def post_scrape_callback(self, res):
        result = res.result()
        if result and result.status_code == 200:
            self.parse_links(result.text)
            self.scrape_info(result.text)

    def scrape_page(self, url):
        try:
            res = requests.get(url, timeout=(3, 30))
            return res
        except requests.RequestException:
            return

    def run_scraper(self):
        while True:
            try:
                target_url = self.to_crawl.get(timeout=60)
                if target_url not in self.scraped_pages:
                    print("Scraping URL: {}".format(target_url))
                    self.scraped_pages.add(target_url)
                    job = self.pool.submit(self.scrape_page, target_url)
                    job.add_done_callback(self.post_scrape_callback)
            except Empty:
                return
            except Exception as e:
                print(e)
                continue


if __name__ == '__main__':
    s = MultiThreadScraper("https://elpais.com/")
    s.run_scraper()
You need to import the re module to match on the specific text value. Try the code below:
import re
links = soup.find_all('a', text=re.compile("Brexit"))
This should return only the links whose text contains "Brexit".
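For completeness, here is a minimal sketch of how that filter could be dropped into the existing parse_links method while keeping the href handling from the question. The case-insensitive re.I flag is an assumption, not part of the answer above:
import re  # add at the top of the module

def parse_links(self, html):
    soup = BeautifulSoup(html, 'html.parser')
    # keep only anchors that have an href AND whose text mentions 'Brexit'
    links = soup.find_all('a', href=True, text=re.compile("Brexit", re.I))
    for link in links:
        url = link['href']
        if url.startswith('/') or url.startswith(self.root_url):
            url = urljoin(self.root_url, url)
            if url not in self.scraped_pages:
                self.to_crawl.put(url)
Note that the text filter only matches anchors whose content is a single string; for headlines wrapped in nested tags, the get_text()-based check shown in the answers below is more robust.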
You can get the text of the element with the getText() method and check whether the string actually contains "Brexit":
if "Brexit" in link.getText().split():
    url = link["href"]
I added a check in this function. See if that does the trick for you:
def parse_links(self, html):
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a', href=True)
    for link in links:
        if 'BREXIT' in link.text.upper():  # <------ new if statement
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)
This is the website I am trying to scrape:
(https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage)
Below is the code that I have tried, but it repeatedly returns only the first and third pages.
from bs4 import BeautifulSoup
from urllib.request import urlopen


def parse():
    base_url = 'https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage'
    url = "https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=3"
    while True:
        html = urlopen(url)
        soup = BeautifulSoup(html, "html.parser")
        for link in soup.find_all('div', class_='entry-content'):
            try:
                shops = soup.find_all('div', class_="col-9")
                names = soup.find_all('tr', class_="clickable")
                for n, k in zip(names, shops):
                    name = n.find_all('td')[1].text.replace(' ', '')
                    desc = k.text.replace(' ', '')
                    print(name + "\n")
                    print(desc)
            except AttributeError as e:
                print(e)
        next_button = soup.find('a', href=True)
        if next_button:
            url = base_url + next_button['href']
        else:
            break


parse()
Select your elements more specifically. A CSS selector is used here to get the <a> that is a child of an element with class="PagedList-skipToNext":
next_button = soup.select_one('.PagedList-skipToNext a')
Also check the result of your selection; base_url is not needed here, because the href already holds the full URL (as the output below shows):
url = next_button.get('href')
Example
from bs4 import BeautifulSoup
import requests


def parse():
    url = 'https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage'

    while True:
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        print(url)  # to see what you are working on, or put the scraping code here

        next_button = soup.select_one('.PagedList-skipToNext a')
        if next_button:
            url = next_button.get('href')
        else:
            break


parse()
Output
https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=2
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=3
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=4
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=5
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=6
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=7
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=8
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=9
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=10
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=11
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=12
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=13
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=14
Here is the website I am trying to scrape: https://books.toscrape.com/
Below are my functions. scrape_all_pages() is not working. Is there a better way to get the number of pages from the website directly, so I can use the range function instead?
I did check out Finding number of pages using Python BeautifulSoup.
import requests
from bs4 import BeautifulSoup


def get_soup(url):
    """Takes a URL and returns a BeautifulSoup() instance representing the HTML of the page."""
    response = requests.get(url)
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    return soup


def scrape_page(num):
    """Takes a page and returns a list of links to the book that are on the page."""
    BASE_URL = 'http://books.toscrape.com/catalogue/'
    PAGE_URL = BASE_URL + str('page-')
    book_url = []
    soup = get_soup(PAGE_URL + str(num) + '.html')
    for x in soup.findAll("article", class_="product_pod"):
        url = x.div.a.get('href')
        link = BASE_URL + url
        if x not in book_url:
            book_url.append(link)
    return book_url


def scrape_all_pages():
    """Scrapes all pages, returning a list of book links."""
    page_num = 0
    all_urls = []
    while True:
        url = scrape_page(page_num)
        if not url:
            break
        all_urls += url
        page_num += 1
    return all_urls
You do not need range() in most cases. I would recommend changing strategy and checking whether a link to the next page is available or not:
if soup.select_one('li.next a[href]'):
    nextPage = BASE_URL + soup.select_one('li.next a[href]')['href']
else:
    nextPage = None
or, from Python 3.8 onwards, with an assignment expression:
nextPage = BASE_URL + a['href'] if (a := soup.select_one('li.next a[href]')) else None
Example
Note: it starts from https://books.toscrape.com/catalogue/page-45.html to keep the demo short. You could simply change it to https://books.toscrape.com/ to get all pages scraped.
import requests
from bs4 import BeautifulSoup


def get_soup(url):
    """Takes a URL and returns a BeautifulSoup() instance representing the HTML of the page."""
    response = requests.get(url)
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    return soup


def scrape_page(url):
    """Takes a page and appends the links of the books that are on the page to the global list."""
    BASE_URL = 'http://books.toscrape.com/catalogue/'
    soup = get_soup(url)
    for x in soup.find_all("article", class_="product_pod"):
        url = x.div.a.get('href')
        link = BASE_URL + url
        if x not in book_url:
            book_url.append(link)
    if soup.select_one('li.next a[href]'):
        nextPage = BASE_URL + soup.select_one('li.next a[href]')['href']
    else:
        nextPage = None
    return nextPage


def scrape_all_pages(url):
    """Scrapes all pages, returning a list of book links."""
    while True:
        if url:
            print(url)
            url = scrape_page(url)
        else:
            break
    return book_url


book_url = []
scrape_all_pages('https://books.toscrape.com/catalogue/page-45.html')
Output
['http://books.toscrape.com/catalogue/annie-on-my-mind_120/index.html',
'http://books.toscrape.com/catalogue/and-then-there-were-none_119/index.html',
'http://books.toscrape.com/catalogue/a-walk-in-the-woods-rediscovering-america-on-the-appalachian-trail_118/index.html',
'http://books.toscrape.com/catalogue/a-visit-from-the-goon-squad_117/index.html',
'http://books.toscrape.com/catalogue/a-storm-of-swords-a-song-of-ice-and-fire-3_116/index.html',
'http://books.toscrape.com/catalogue/a-heartbreaking-work-of-staggering-genius_115/index.html',
'http://books.toscrape.com/catalogue/8-keys-to-mental-health-through-exercise_114/index.html',
'http://books.toscrape.com/catalogue/girlboss_113/index.html',
'http://books.toscrape.com/catalogue/the-suffragettes-little-black-classics-96_112/index.html',
'http://books.toscrape.com/catalogue/the-sense-of-an-ending_111/index.html',
'http://books.toscrape.com/catalogue/the-sandman-vol-2-the-dolls-house-the-sandman-volumes-2_110/index.html',
'http://books.toscrape.com/catalogue/the-course-of-love_109/index.html',
'http://books.toscrape.com/catalogue/sugar-rush-offensive-line-2_108/index.html',
'http://books.toscrape.com/catalogue/saga-volume-2-saga-collected-editions-2_107/index.html',
'http://books.toscrape.com/catalogue/run-spot-run-the-ethics-of-keeping-pets_106/index.html',...]
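If you do want the total page count up front, as the question asks, so that range() can be used, the pager text can be parsed instead. A minimal sketch, assuming the pager on books.toscrape.com is an li.current element containing text like "Page 1 of 50":
import re
import requests
from bs4 import BeautifulSoup


def get_page_count(url):
    """Reads 'Page 1 of N' from the pager and returns N (falls back to 1 if no pager is found)."""
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    pager = soup.select_one("li.current")
    if pager is None:
        return 1
    match = re.search(r"of\s+(\d+)", pager.get_text())
    return int(match.group(1)) if match else 1


total = get_page_count("https://books.toscrape.com/catalogue/page-1.html")
page_urls = [f"https://books.toscrape.com/catalogue/page-{n}.html" for n in range(1, total + 1)]
The next-link approach above is still the more robust choice, since it does not depend on the pager text format.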
I am trying to scrape Airbnb data using BeautifulSoup and Pandas. I checked a lot of tutorials and found the one I followed. The step in which the soup should scrape the data from the next page is not working: out of 15 pages, it scrapes only the first 2 or 3, or sometimes even none (even though the URLs of the pages are correct).
I cannot seem to understand why this happens and how to solve it. Can someone help out?
import requests
import bs4
import pandas as pd
import numpy as np
import csv
import time

url = 'https://www.airbnb.it/s/Italy/homes?checkin=2021-08-01&checkout=2021-08-02'


def get_page(url):
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    return soup


def get_listings(soup):
    result = []
    result.extend(soup.find_all("div", {"class": "_8ssblpx"}))
    return result


def get_listing_title(listing):
    for l in listing:
        try:
            return str(l.find('div', {'class': '_1tanv1h'}).text)
        except:
            return None


def get_listing_subtitle(listing):
    for l in listing:
        try:
            return str(l.find('span', {'class': '_1whrsux9'}).text)
        except:
            return None


def get_listing_info(listing):
    for l in listing:
        try:
            return str(l.find_all('div', {'class': '_3c0zz1'})[0].text.lower())
        except:
            return None


def find_next_page(page):
    base_url = "https://www.airbnb.it"
    try:
        nextpage = base_url + get_page(url).find_all("div", attrs={"class": "_jro6t0"})[0].find("a", attrs={'class': '_za9j7e'})['href']
    except:
        nextpage = None
    return nextpage


title = []
subtitle = []
info = []

while url is not None:
    soup = get_page(url)
    listings = get_listings(soup)
    for l in listings:
        title.append(get_listing_title(l))
        subtitle.append(get_listing_subtitle(l))
        info.append(get_listing_info(l))
    time.sleep(5)
    url = find_next_page(soup)
    print(url)

airbnb_data = pd.DataFrame(data={'title': title,
                                 'subtitle': subtitle,
                                 'info': info})
airbnb_data
I am trying to build a web crawler to extract all the links on a webpage. I have created two Python files (the class in scanner.py and the object in vulnerability-scanner.py). When I run the script, it runs continuously without stopping. I am unable to find the error. Please help me solve this.
Here is my source code:
scanner.py
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama


class Scanner:
    colorama.init()

    def __init__(self, url):
        self.target_url = url
        self.target_links = []

    def is_valid(self, url):
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

    def get_all_website_links(self, url):
        GREEN = colorama.Fore.GREEN
        WHITE = colorama.Fore.WHITE
        RESET = colorama.Fore.RESET
        urls = set()
        internal_urls = set()
        external_urls = set()
        domain_name = urlparse(url).netloc
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        for a_tag in soup.findAll("a"):
            href = a_tag.attrs.get("href")
            if href == "" or href is None:
                continue
            href = urljoin(url, href)
            parsed_href = urlparse(href)
            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
            if not self.is_valid(href):
                continue
            if href in internal_urls:
                continue
            if domain_name not in href:
                if href not in external_urls:
                    print(f"{WHITE}[*] External link: {href}{RESET}")
                    external_urls.add(href)
                continue
            print(f"{GREEN}[*] Internal link: {href}{RESET}")
            urls.add(href)
            internal_urls.add(href)
        return urls

    def crawl(self, url):
        href_links = self.get_all_website_links(url)
        for link in href_links:
            print(link)
            self.crawl(link)
vulnerability-scanner.py
import argu
target_url = "https://hack.me/"
vul_scanner = argu.Scanner(target_url)
vul_scanner.crawl(target_url)
The following part is (almost) an infinite recursion:
for link in href_links:
    print(link)
    self.crawl(link)
I believe you added this with the intention of crawling the links found on each page, but you didn't add a stopping condition. (Currently, the only stopping condition is reaching a crawled page that has no links at all.)
One stopping condition might be to set a predefined number of "max" levels to crawl.
Something like this in your __init__ function:
def __init__(self, url):
    self.target_url = url
    self.target_links = []
    self.max_parse_levels = 5  # you can go a step further and take this as an input to the constructor (i.e. the __init__ function)
    self.cur_parse_levels = 0

...

def crawl(self, url):
    if self.cur_parse_levels > self.max_parse_levels:
        return
    self.cur_parse_levels += 1  # going one level deeper
    href_links = self.get_all_website_links(url)
    for link in href_links:
        print(link)
        self.crawl(link)
    self.cur_parse_levels -= 1  # done with this level
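A level cap stops the recursion from running forever, but the crawler can still revisit the same pages many times. Another common stopping condition, not shown in the answer above, is to remember which URLs have already been crawled and skip them. A minimal sketch under that assumption, reusing the existing get_all_website_links method of the class:
class Scanner:

    def __init__(self, url):
        self.target_url = url
        self.target_links = []
        self.visited = set()  # URLs already crawled (new attribute, not in the original class)

    # is_valid() and get_all_website_links() stay exactly as they are

    def crawl(self, url):
        if url in self.visited:
            return  # already crawled this page: stop here, which breaks the infinite recursion
        self.visited.add(url)
        for link in self.get_all_website_links(url):
            print(link)
            self.crawl(link)
This bounds the crawl by the number of distinct internal URLs instead of by depth; both checks can be combined if needed.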
I'm trying to create a Python script that applies multithreading (a ThreadPoolExecutor) to fetch the links of different users from a webpage. Although the users' links are available on the landing page, I'm trying to dig them out from their inner pages. When I use yield within the get_links() function and print() within get_target_link(), I get the results as expected.
My question is: how can I achieve the same using yield within both of the functions?
I've tried:
import requests
import concurrent.futures
from urllib.parse import urljoin
from bs4 import BeautifulSoup


def get_links(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select(".summary .question-hyperlink"):
        yield urljoin(base, item.get("href"))


def get_target_link(targeturl):
    res = requests.get(targeturl)
    soup = BeautifulSoup(res.text, "lxml")
    name_link = urljoin(base, soup.select_one(".user-details > a").get("href"))
    yield name_link


if __name__ == '__main__':
    base = 'https://stackoverflow.com'
    mlink = "https://stackoverflow.com/questions/tagged/web-scraping"
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {executor.submit(get_target_link, url): url for url in get_links(mlink)}
        concurrent.futures.as_completed(future_to_url)
The above script produces no result at all.
A few problems with your initial approach are causing the "no result at all":
BeautifulSoup(res.text, "lxml") - change the parser to html.parser (you are parsing HTML web pages)
There is no benefit in making get_target_link a generator, since it is not meant to be iterated over and it already produces a single result at a time.
concurrent.futures.as_completed returns an iterator over the Future instances, not the final results; you still have to call result() on each future.
The corrected approach would look like this:
import requests
import concurrent.futures as futures
from urllib.parse import urljoin
from bs4 import BeautifulSoup


def get_links(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    for link in soup.select(".summary .question-hyperlink"):
        yield urljoin(base, link.get("href"))


def get_target_link(target_url):
    res = requests.get(target_url)
    soup = BeautifulSoup(res.text, "html.parser")
    name_link = urljoin(base, soup.select_one(".user-details a").get("href"))
    return name_link


if __name__ == '__main__':
    base = 'https://stackoverflow.com'
    mlink = "https://stackoverflow.com/questions/tagged/web-scraping"
    with futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {executor.submit(get_target_link, url): url for url in get_links(mlink)}
        for future in futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
            except Exception as ex:
                print(f'Failed to extract user details from url: {url}')
            else:
                print(data)
The output:
https://stackoverflow.com/users/10035985/andrej-kesely
https://stackoverflow.com/users/11520568/rachit-gupta
https://stackoverflow.com/users/10568531/robots-txt
https://stackoverflow.com/users/10664939/logan-anderson
https://stackoverflow.com/users/688393/c%c3%a9sar
https://stackoverflow.com/users/903061/gregor
https://stackoverflow.com/users/9950503/saraherceg
https://stackoverflow.com/users/80851/gmile
https://stackoverflow.com/users/11793150/saurabh-rawat
https://stackoverflow.com/users/11793061/xzatar
https://stackoverflow.com/users/11759292/rachel9866
https://stackoverflow.com/users/2628114/user2628114
https://stackoverflow.com/users/9810397/bart
https://stackoverflow.com/users/838355/ir2pid
https://stackoverflow.com/users/10629482/shreya
https://stackoverflow.com/users/11669928/thor-is
https://stackoverflow.com/users/7660288/acro2142
https://stackoverflow.com/users/3342430/freddiev4
https://stackoverflow.com/users/11767045/k-%c3%96sterlund
https://stackoverflow.com/users/11781213/mohamed-shire
https://stackoverflow.com/users/5412619/a-nonymous
https://stackoverflow.com/users/4354477/forcebru
https://stackoverflow.com/users/10568531/robots-txt
https://stackoverflow.com/users/6622587/eyllanesc
https://stackoverflow.com/users/10568531/robots-txt
https://stackoverflow.com/users/3273177/casabonita
https://stackoverflow.com/users/1540328/dipesh-parmar
https://stackoverflow.com/users/6231957/perth
https://stackoverflow.com/users/11400264/workin-4weekend
https://stackoverflow.com/users/1000551/vadim-kotov
https://stackoverflow.com/users/331508/brock-adams
https://stackoverflow.com/users/11300154/helloworld1990
https://stackoverflow.com/users/11786268/mohsine-jirou
https://stackoverflow.com/users/9707561/fatima-tt
https://stackoverflow.com/users/11759292/rachel9866
https://stackoverflow.com/users/6622587/eyllanesc
https://stackoverflow.com/users/11485683/titan
https://stackoverflow.com/users/11593630/supek
https://stackoverflow.com/users/11717116/raja-kishore-patnayakuni
https://stackoverflow.com/users/975887/madushan
https://stackoverflow.com/users/10568531/robots-txt
https://stackoverflow.com/users/283366/phil
https://stackoverflow.com/users/8677101/bpdesilva
https://stackoverflow.com/users/3504096/programmerper
https://stackoverflow.com/users/6303216/akhlaq-ahmed
https://stackoverflow.com/users/11457578/sh-student
https://stackoverflow.com/users/11783947/alexis-cruz-cruz
https://stackoverflow.com/users/3579212/adnanmuttaleb
https://stackoverflow.com/users/1060350/anony-mousse
https://stackoverflow.com/users/8100732/khadija-saeed