Scraping multiple webpages with Python

from bs4 import BeautifulSoup
import urllib.request, time

class scrape(object):
    def __init__(self):
        self.urls = ['https://www.onthemarket.com/for-sale/property/wigan/', 'https://www.onthemarket.com/for-sale/property/wigan/?page=1', 'https://www.onthemarket.com/for-sale/property/wigan/?page=2', 'https://www.onthemarket.com/for-sale/property/wigan/?page=3', 'https://www.onthemarket.com/for-sale/property/wigan/?page=4', 'https://www.onthemarket.com/for-sale/property/wigan/?page=6']
        self.telephones = []

    def extract_info(self):
        for link in self.urls:
            data = urllib.request.urlopen(link).read()
            soup = BeautifulSoup(data, "lxml")
            for tel in soup.findAll("span", {"class": "call"}):
                self.telephones.append(tel.text.strip())
            time.sleep(1)
        return self.telephones
to = scrape()
print(to.extract_info())
What is wrong? This code hangs after the second website. It should extract the phone numbers from each webpage in the list self.urls.

All you need to do is add headers to your request and give it a go. Try this:
from bs4 import BeautifulSoup
import requests, time

class scrape(object):
    def __init__(self):
        self.urls = ['https://www.onthemarket.com/for-sale/property/wigan/', 'https://www.onthemarket.com/for-sale/property/wigan/?page=1', 'https://www.onthemarket.com/for-sale/property/wigan/?page=2', 'https://www.onthemarket.com/for-sale/property/wigan/?page=3', 'https://www.onthemarket.com/for-sale/property/wigan/?page=4', 'https://www.onthemarket.com/for-sale/property/wigan/?page=6']
        self.telephones = []

    def extract_info(self):
        for link in self.urls:
            data = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})  # it should do the trick
            soup = BeautifulSoup(data.text, "lxml")
            for tel in soup.find_all("span", {"class": "call"}):
                self.telephones.append(tel.text.strip())
            time.sleep(1)
        return self.telephones
crawl = scrape()
print(crawl.extract_info())
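As a side note, if you would rather keep urllib instead of switching to requests, the same User-Agent header can be passed through a urllib.request.Request object. A minimal sketch of the body of the loop:
import urllib.request

req = urllib.request.Request(link, headers={"User-Agent": "Mozilla/5.0"})
data = urllib.request.urlopen(req).read()
soup = BeautifulSoup(data, "lxml")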

Related

Beautiful soup doesn't scrape the data from the "next" pages

I am trying to scrape Airbnb data using BeautifulSoup and Pandas. I checked a lot of tutorials and followed one of them. The step in which the soup should scrape the data from the next page is not working: out of 15 pages, it scrapes only the first 2 or 3, and sometimes none at all (even though the URLs of the pages are correct).
I cannot seem to understand why this happens and how to solve it. Can someone help out?
import requests
import bs4
import pandas as pd
import numpy as np
import csv
import time

url = 'https://www.airbnb.it/s/Italy/homes?checkin=2021-08-01&checkout=2021-08-02'

def get_page(url):
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    return soup

def get_listings(soup):
    result = []
    result.extend(soup.find_all("div", {"class": "_8ssblpx"}))
    return result

def get_listing_title(listing):
    for l in listing:
        try:
            return str(l.find('div', {'class': '_1tanv1h'}).text)
        except:
            return None

def get_listing_subtitle(listing):
    for l in listing:
        try:
            return str(l.find('span', {'class': '_1whrsux9'}).text)
        except:
            return None

def get_listing_info(listing):
    for l in listing:
        try:
            return str(l.find_all('div', {'class': '_3c0zz1'})[0].text.lower())
        except:
            return None

def find_next_page(page):
    base_url = "https://www.airbnb.it"
    try:
        nextpage = base_url + get_page(url).find_all("div", attrs={"class": "_jro6t0"})[0].find("a", attrs={'class': '_za9j7e'})['href']
    except:
        nextpage = None
    return nextpage

title = []
subtitle = []
info = []

while url is not None:
    soup = get_page(url)
    listings = get_listings(soup)
    for l in listings:
        title.append(get_listing_title(l))
        subtitle.append(get_listing_subtitle(l))
        info.append(get_listing_info(l))
    time.sleep(5)
    url = find_next_page(soup)
    print(url)

airbnb_data = pd.DataFrame(data={'title': title,
                                 'subtitle': subtitle,
                                 'info': info})
airbnb_data
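One likely culprit in the code above, offered as a guess rather than a confirmed fix: find_next_page(page) ignores its page argument and calls get_page(url) on the module-level url again, so the "next" link is always looked up on a freshly re-downloaded copy of whatever url currently points at. Combined with Airbnb rendering much of the listing markup client-side, the _jro6t0 / _za9j7e elements may simply not be present in the static HTML. A minimal sketch that at least reuses the soup already fetched in the loop (class names copied unchanged from the question):
def find_next_page(page):
    base_url = "https://www.airbnb.it"
    try:
        # Look for the pagination block in the soup we already have,
        # instead of downloading the page a second time.
        nav = page.find_all("div", attrs={"class": "_jro6t0"})[0]
        nextpage = base_url + nav.find("a", attrs={"class": "_za9j7e"})["href"]
    except (IndexError, TypeError, KeyError):
        nextpage = None
    return nextpage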

I want to scrape all business links from this page

I want to extract all the links (not the titles) of the companies. Please guide me! Thanks!
Here is the URL of the web page: https://hipages.com.au/find/antenna_services/nsw/sydney
Here is my code:
import requests
from bs4 import BeautifulSoup
import re

def get_index_data(soup):
    try:
        links = soup.find_all('a', {'class': 'sc-bZQynM sc-iwsKbI dpKmnV'}).get('href')
    except:
        links = []
    print(links)

def Main():
    r = requests.get("https://hipages.com.au/find/antenna_services/nsw/sydney")
    get_index_data(r)

Main()
import requests
from bs4 import BeautifulSoup

r = requests.get("https://hipages.com.au/find/antenna_services/nsw/sydney")
soup = BeautifulSoup(r.text, 'html.parser')
for item in soup.findAll("h3", {'class': 'sc-bZQynM sc-iwsKbI dpKmnV'}):
    print(f"https://hipages.com.au{item.previous_element.get('href')}")
Output:
https://hipages.com.au/connect/glencoelectricalbuildingmaintenanceairconditioningsecurityalarmscctv
https://hipages.com.au/connect/emcoelectricalservices
https://hipages.com.au/connect/abcelectricservicespl/service/126298
https://hipages.com.au/connect/ozyblindsnscreens
https://hipages.com.au/connect/samedaytvantennaservice
https://hipages.com.au/connect/langenelectricalnsw
https://hipages.com.au/connect/allprohandymanmaintenance
https://hipages.com.au/connect/amateairconditioningrefrigerationservices
https://hipages.com.au/connect/makeurmove
https://hipages.com.au/connect/uberantennas/service/184323
https://hipages.com.au/connect/cmkelectricalanddata
https://hipages.com.au/connect/antennadistributionservicesptyltd
https://hipages.com.au/connect/sydneysparky
https://hipages.com.au/connect/bluediamond
https://hipages.com.au/connect/digiproantennas
https://hipages.com.au/connect/vascom
https://hipages.com.au/connect/sparkyselectricalanddataptyltd
https://hipages.com.au/connect/prosparksolutions
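For what it's worth, two things go wrong in the original get_index_data: Main passes the raw requests Response instead of a BeautifulSoup object, and find_all returns a ResultSet (a list of tags), which has no .get() method; the bare except then hides both errors and prints an empty list. Here is a sketch that folds the answer's approach back into the question's structure (the class names are copied from the question and look auto-generated, so they may change):
import requests
from bs4 import BeautifulSoup

def get_index_data(soup):
    # find_all returns a ResultSet, so iterate over the h3 titles and walk
    # back to each company's parent <a> via previous_element, as in the answer above.
    links = []
    for item in soup.find_all("h3", {"class": "sc-bZQynM sc-iwsKbI dpKmnV"}):
        href = item.previous_element.get("href")
        if href:
            links.append(f"https://hipages.com.au{href}")
    return links

def Main():
    r = requests.get("https://hipages.com.au/find/antenna_services/nsw/sydney")
    soup = BeautifulSoup(r.text, "html.parser")  # parse the response before querying it
    print(get_index_data(soup))

Main()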

The python script is continuously running

I am trying to build a web crawler to extract all the links on a webpage. I have created two Python files (the class in scanner.py and the object that uses it in vulnerability-scanner.py). When I run the script, it keeps running without stopping, and I am unable to find the error. Help me solve this.
Here is my source code:
scanner.py
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

class Scanner:
    colorama.init()

    def __init__(self, url):
        self.target_url = url
        self.target_links = []

    def is_valid(self, url):
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

    def get_all_website_links(self, url):
        GREEN = colorama.Fore.GREEN
        WHITE = colorama.Fore.WHITE
        RESET = colorama.Fore.RESET
        urls = set()
        internal_urls = set()
        external_urls = set()
        domain_name = urlparse(url).netloc
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        for a_tag in soup.findAll("a"):
            href = a_tag.attrs.get("href")
            if href == "" or href is None:
                continue
            href = urljoin(url, href)
            parsed_href = urlparse(href)
            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
            if not self.is_valid(href):
                continue
            if href in internal_urls:
                continue
            if domain_name not in href:
                if href not in external_urls:
                    print(f"{WHITE}[*] External link: {href}{RESET}")
                    external_urls.add(href)
                continue
            print(f"{GREEN}[*] Internal link: {href}{RESET}")
            urls.add(href)
            internal_urls.add(href)
        return urls

    def crawl(self, url):
        href_links = self.get_all_website_links(url)
        for link in href_links:
            print(link)
            self.crawl(link)
vulnerability-scanner.py
import argu
target_url = "https://hack.me/"
vul_scanner = argu.Scanner(target_url)
vul_scanner.crawl(target_url)
The following part is (almost) an infinite recursion:
for link in href_links:
    print(link)
    self.crawl(link)
I believe you added this with the idea of crawling the links found on each page, but you didn't add a stopping condition. (Currently, it seems your only stopping condition is reaching a crawled page with no links at all.)
One stopping condition might be to set a predefined number of "max" levels to crawl.
Something like this in your init function:
def __init__(self, url):
    self.target_url = url
    self.target_links = []
    self.max_parse_levels = 5  # you could go a step further and take this as an argument to the constructor (i.e. the __init__ function)
    self.cur_parse_levels = 0
.
.
.
def crawl(self, url):
    if self.cur_parse_levels > self.max_parse_levels:
        return
    self.cur_parse_levels += 1  # count this level so the check above eventually stops the recursion
    for link in href_links:
        print(link)
        self.crawl(link)
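An alternative stopping condition, offered as a sketch rather than part of the original answer: track which URLs have already been crawled and skip them. This also stops the crawler from bouncing back and forth between pages that link to each other (the crawled_urls attribute is my addition, not in the original code):
def __init__(self, url):
    self.target_url = url
    self.target_links = []
    self.crawled_urls = set()  # hypothetical attribute, added for this sketch

def crawl(self, url):
    if url in self.crawled_urls:
        return  # already visited, so stop here
    self.crawled_urls.add(url)
    href_links = self.get_all_website_links(url)
    for link in href_links:
        print(link)
        self.crawl(link)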

How do I automatically turn a page and crawl more data on Stack Overflow (Python)?

I want to crawl the questions/answers URLs on Stack Overflow with the Android Studio tag for research purposes. There should be around 55,628 questions on the site:
https://stackoverflow.com/questions/tagged/android-studio?sort=newest&page=1&pagesize=15
However, I can only crawl 50 questions so far. Crawling stops in the middle of the fourth page.
I feel like I should write a for loop to iterate over the page URLs, but I cannot picture where to start revising.
How do I revise my program?
import requests
from bs4 import BeautifulSoup
import re
import json

class Stack(object):
    def __init__(self):
        self.baseurl = "https://stackoverflow.com"
        self.starturl = "https://stackoverflow.com/questions/tagged/android-studio"
        # The second page's URL: https://stackoverflow.com/questions/tagged/android-studio?sort=newest&page=2&pagesize=15
        # The third page's URL: https://stackoverflow.com/questions/tagged/android-studio?sort=newest&page=3&pagesize=15

    def start_requests(self, url):
        r = requests.get(url)
        return r.content

    def parse(self, text):
        soup = BeautifulSoup(text, 'html.parser')
        divs = soup.find_all('div', class_='question-summary')
        for div in divs:
            div.find('div', class_='summary').find_all('div')[1].find_all('a')
            yield {
                'title': div.h3.a.text,
                'url': self.baseurl + div.h3.a.get('href')
            }

    def start(self):
        text = self.start_requests(self.starturl)  # call the start_requests function
        items = self.parse(text)
        s = json.dumps(list(items), indent=4, ensure_ascii=False)
        with open('stackoverflow.json', 'w', encoding='utf-8') as f:
            # If answer is nonempty
            f.write(s)

stack = Stack()
stack.start()
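The question already points in the right direction: a loop over the page URLs. Below is a minimal sketch of a replacement start method that iterates using the URL pattern shown in the comments above. The page count (roughly 55,628 questions at 15 per page gives about 3,709 pages) and the one-second delay are assumptions, and Stack Overflow may still throttle rapid unauthenticated requests:
import time  # used for a polite delay between requests

def start(self):
    all_items = []
    # About 55,628 questions at 15 per page -> roughly 3,709 pages (an assumption).
    for page in range(1, 3710):
        url = f"{self.starturl}?sort=newest&page={page}&pagesize=15"
        text = self.start_requests(url)
        all_items.extend(self.parse(text))
        time.sleep(1)  # slow down to reduce the chance of being rate-limited
    s = json.dumps(all_items, indent=4, ensure_ascii=False)
    with open('stackoverflow.json', 'w', encoding='utf-8') as f:
        f.write(s)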

Using BeautifulSoup to find links related to specific keyword

I have to modify this code so the scraping keeps only the links that contain a specific keyword. In my case I'm scraping a newspaper page to find news related to the term 'Brexit'.
I've tried modifying the method parse_links so that it only keeps the links (or 'a' tags) that contain 'Brexit' in them, but it doesn't seem to work.
Where should I place the condition?
import requests
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse

class MultiThreadScraper:
    def __init__(self, base_url):
        self.base_url = base_url
        self.root_url = '{}://{}'.format(urlparse(self.base_url).scheme, urlparse(self.base_url).netloc)
        self.pool = ThreadPoolExecutor(max_workers=20)
        self.scraped_pages = set([])
        self.to_crawl = Queue(10)
        self.to_crawl.put(self.base_url)

    def parse_links(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=True)
        for link in links:
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)

    def scrape_info(self, html):
        return

    def post_scrape_callback(self, res):
        result = res.result()
        if result and result.status_code == 200:
            self.parse_links(result.text)
            self.scrape_info(result.text)

    def scrape_page(self, url):
        try:
            res = requests.get(url, timeout=(3, 30))
            return res
        except requests.RequestException:
            return

    def run_scraper(self):
        while True:
            try:
                target_url = self.to_crawl.get(timeout=60)
                if target_url not in self.scraped_pages:
                    print("Scraping URL: {}".format(target_url))
                    self.scraped_pages.add(target_url)
                    job = self.pool.submit(self.scrape_page, target_url)
                    job.add_done_callback(self.post_scrape_callback)
            except Empty:
                return
            except Exception as e:
                print(e)
                continue

if __name__ == '__main__':
    s = MultiThreadScraper("https://elpais.com/")
    s.run_scraper()
You need to import the re module to match on the link text. Try the code below:
import re
links = soup.find_all('a', text=re.compile("Brexit"))
This should return only the links that contain 'Brexit'.
You can get the text of the element with the getText() method and check whether the string actually contains "Brexit":
if "Brexit" in link.getText().split():
    url = link["href"]
I added a check in this function. See if that does the trick for you:
def parse_links(self, html):
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a', href=True)
    for link in links:
        if 'BREXIT' in link.text.upper():  # <------ new if statement
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)
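For completeness, the first suggestion can also be folded straight into parse_links as a drop-in replacement for the method in the class above (it relies on the same imports). The case-insensitive regex is my addition, and note that find_all's text filter only matches anchors whose text is not split across nested tags:
import re

def parse_links(self, html):
    soup = BeautifulSoup(html, 'html.parser')
    # Keep only anchors whose own text mentions Brexit, case-insensitively.
    for link in soup.find_all('a', href=True, text=re.compile("brexit", re.IGNORECASE)):
        url = link['href']
        if url.startswith('/') or url.startswith(self.root_url):
            url = urljoin(self.root_url, url)
            if url not in self.scraped_pages:
                self.to_crawl.put(url)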
