I want to use Python to obtain all the links in a domain, given a 'root' URL, as a list. Given a URL like http://www.example.com, it should return all the links on that page that belong to the same domain as the root URL, then recurse on each of those links, visiting them and extracting all same-domain links, and so on. By "same domain" I mean that, given http://www.example.com, the only links I want back are http://www.example.com/something, http://www.example.com/somethingelse, and so on. Anything external such as http://www.otherwebsite.com should be discarded. How can I do this with Python?
EDIT: I made an attempt using lxml. I don't think it works fully, and I am not sure how to take into account links to already-processed pages (which would cause an infinite loop).
import urllib
import lxml.html

# given a url, return a list of all sublinks within the same domain
def getLinks(url):
    urlList = []
    urlList.append(url)
    sublinks = getSubLinks(url)
    for link in sublinks:
        absolute = url + '/' + link
        urlList.extend(getLinks(absolute))
    return urlList

# determine whether two links are within the same domain
def sameDomain(url, dom):
    return url.startswith(dom)

# get tree of sublinks in same domain, url is root
def getSubLinks(url):
    sublinks = []
    connection = urllib.urlopen(url)
    dom = lxml.html.fromstring(connection.read())
    for link in dom.xpath('//a/@href'):
        if not (link.startswith('#') or link.startswith('http') or link.startswith('mailto:')):
            sublinks.append(link)
    return sublinks
import sys
import requests
import hashlib
from bs4 import BeautifulSoup
from datetime import datetime


def get_soup(link):
    """
    Return the BeautifulSoup object for input link
    """
    request_object = requests.get(link, auth=('user', 'pass'))
    soup = BeautifulSoup(request_object.content)
    return soup


def get_status_code(link):
    """
    Return the error code for any url
    param: link
    """
    try:
        error_code = requests.get(link).status_code
    except requests.exceptions.ConnectionError:
        error_code = -1  # sentinel value when the connection fails
    return error_code


def find_internal_urls(lufthansa_url, depth=0, max_depth=2):
    all_urls_info = []
    status_dict = {}
    soup = get_soup(lufthansa_url)
    a_tags = soup.findAll("a", href=True)

    if depth > max_depth:
        return {}
    else:
        for a_tag in a_tags:
            if "http" not in a_tag["href"] and "/" in a_tag["href"]:
                url = "http://www.lufthansa.com" + a_tag['href']
            elif "http" in a_tag["href"]:
                url = a_tag["href"]
            else:
                continue
            status_dict["url"] = url
            status_dict["status_code"] = get_status_code(url)
            status_dict["timestamp"] = datetime.now()
            status_dict["depth"] = depth + 1
            all_urls_info.append(status_dict)
    return all_urls_info


if __name__ == "__main__":
    depth = 2  # suppose
    all_page_urls = find_internal_urls("someurl", 2, 2)
    if depth > 1:
        for status_dict in all_page_urls:
            find_internal_urls(status_dict['url'])
The above snippet contains the necessary modules for scraping urls from the Lufthansa airlines website. The only additional thing here is that you can specify the depth to which you want to scrape recursively.
Here is what I've done, only following full urls like http://domain[xxx]. Quick but a bit dirty.
import requests
import re

domain = u"stackoverflow.com"
http_re = re.compile(u"(http:\/\/" + domain + "[\/\w \.-]*\/?)")

visited = set([])

def visit(url):
    visited.add(url)
    extracted_body = requests.get(url).text
    matches = re.findall(http_re, extracted_body)
    for match in matches:
        if match not in visited:
            visit(match)

visit(u"http://" + domain)
print(visited)
There are some bugs in the code of #namita. I modified it and it works well now.
import sys
import requests
import hashlib
from bs4 import BeautifulSoup
from datetime import datetime


def get_soup(link):
    """
    Return the BeautifulSoup object for input link
    """
    request_object = requests.get(link, auth=('user', 'pass'))
    soup = BeautifulSoup(request_object.content, "lxml")
    return soup


def get_status_code(link):
    """
    Return the error code for any url
    param: link
    """
    try:
        error_code = requests.get(link).status_code
    except requests.exceptions.ConnectionError:
        error_code = -1
    return error_code


def find_internal_urls(main_url, depth=0, max_depth=2):
    all_urls_info = []
    soup = get_soup(main_url)
    a_tags = soup.findAll("a", href=True)

    if main_url.endswith("/"):
        domain = main_url
    else:
        domain = "/".join(main_url.split("/")[:-1])
    print(domain)

    if depth > max_depth:
        return {}
    else:
        for a_tag in a_tags:
            if "http://" not in a_tag["href"] and "https://" not in a_tag["href"] and "/" in a_tag["href"]:
                url = domain + a_tag['href']
            elif "http://" in a_tag["href"] or "https://" in a_tag["href"]:
                url = a_tag["href"]
            else:
                continue
            # print(url)
            status_dict = {}
            status_dict["url"] = url
            status_dict["status_code"] = get_status_code(url)
            status_dict["timestamp"] = datetime.now()
            status_dict["depth"] = depth + 1
            all_urls_info.append(status_dict)
    return all_urls_info


if __name__ == "__main__":
    url = "https://www.example.com"  # your domain here
    depth = 1
    all_page_urls = find_internal_urls(url, 0, 2)
    # print("\n\n", all_page_urls)
    if depth > 1:
        for status_dict in all_page_urls:
            find_internal_urls(status_dict['url'])
The following code worked for me, but I don't know if it's 100% correct. It extracts all the internal urls of a website:
import requests
from bs4 import BeautifulSoup


def get_soup(link):
    """
    Return the BeautifulSoup object for input link
    """
    request_object = requests.get(link, auth=('user', 'pass'))
    soup = BeautifulSoup(request_object.content, "lxml")
    return soup


visited = set([])

def visit(url, domain):
    visited.add(url)
    soup = get_soup(url)
    a_tags = soup.findAll("a", href=True)
    for a_tag in a_tags:
        if "http://" not in a_tag["href"] and "https://" not in a_tag["href"] and "/" in a_tag["href"]:
            url = domain + a_tag['href']
        elif "http://" in a_tag["href"] or "https://" in a_tag["href"]:
            url = a_tag["href"]
        else:
            continue
        if url not in visited and domain in url:
            # print(url)
            visit(url, domain)


url = input("Url: ")
domain = input("domain: ")
visit(u"" + url, domain)
print(visited)
From the tags of your question, I assume you are using Beautiful Soup.
First, you need to download the webpage, for example with urllib.request. Once you have the contents as a string, you pass it to Beautiful Soup. Then you can find all links with soup.find_all('a'), assuming soup is your BeautifulSoup object. Finally, you need to check the hrefs:
The simplest version would be to just check whether "http://www.example.com" is in the href, but that won't catch relative links. I guess some wild regular expression would do (find everything containing "www.example.com", or starting with "/", or starting with "?" (PHP)), or you might look for everything that contains a "www" but is not www.example.com and discard it, etc. The correct strategy may depend on the website you are scraping and its coding style.
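For illustration, here is a minimal sketch of that href check, assuming urllib.request and Beautiful Soup are available and using http://www.example.com as a placeholder root; urljoin resolves relative links so they are not lost:

# Minimal sketch, assuming Beautiful Soup; http://www.example.com is a placeholder root URL.
from urllib.request import urlopen
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

root = "http://www.example.com"
root_netloc = urlparse(root).netloc

soup = BeautifulSoup(urlopen(root).read(), "html.parser")

same_domain_links = []
for a in soup.find_all("a", href=True):
    absolute = urljoin(root, a["href"])           # resolves relative links like "/something"
    if urlparse(absolute).netloc == root_netloc:  # keep only links on the same domain
        same_domain_links.append(absolute)

print(same_domain_links)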
You can use a regular expression to filter out such links, e.g.:
<a\shref\=\"(http\:\/\/example\.com[^\"]*)\"
Take the above regex as a reference and start writing a script based on it.
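For reference, a minimal sketch (assuming the requests library, with example.com as a placeholder domain) that applies a regex of this shape to a downloaded page:

import re
import requests

# Placeholder domain; the pattern captures absolute links pointing back at example.com.
pattern = re.compile(r'<a\s+href="(http://example\.com[^"]*)"')

html = requests.get("http://example.com").text
internal_links = pattern.findall(html)
print(internal_links)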
This is my code:
https://pastebin.com/R11qiTF4
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as req
from urllib.parse import urljoin
import re

urls = ["https://www.helios-gesundheit.de"]
domain_list = ["https://www.helios-gesundheit.de/kliniken/schwerin/"]
prohibited = ["info", "news"]
text_keywords = ["Helios", "Helios"]
url_list = []
desired = "https://www.helios-gesundheit.de/kliniken/schwerin/unser-angebot/unsere-fachbereiche-klinikum/allgemein-und-viszeralchirurgie/team-allgemein-und-viszeralchirurgie/"

for x in range(len(domain_list)):
    url_list.append(urls[x] + domain_list[x].replace(urls[x], ""))
print(url_list)


def prohibitedChecker(prohibited_list, string):
    for x in prohibited_list:
        if x in string:
            return True
        else:
            return False
            break


def parseHTML(url):
    requestHTML = req(url)
    htmlPage = requestHTML.read()
    requestHTML.close()
    parsedHTML = soup(htmlPage, "html.parser")
    return parsedHTML


searched_word = "Helios"

for url in url_list:
    parsedHTML = parseHTML(url)
    href_crawler = parsedHTML.find_all("a", href=True)
    for href in href_crawler:
        crawled_url = urljoin(url, href.get("href"))
        print(crawled_url)
        if "www" not in crawled_url:
            continue
        parsedHTML = parseHTML(crawled_url)
        results = parsedHTML.body.find_all(string=re.compile('.*{0}.*'.format(searched_word)), recursive=True)
        for single_result in results:
            keyword_text_check = prohibitedChecker(text_keywords, single_result.string)
            if keyword_text_check != True:
                continue
            print(single_result.string)
I'm trying to print the contents of the "desired" variable. The problem is the following: my code never even requests the URL in "desired", because it is not in the website scope. The "desired" href link is inside another href link that is inside the page I'm currently scraping. I thought I'd fix this by adding another for loop inside the line 39 for loop that requests every href found by my first one, but this is too messy and not efficient.
Is there a way to get a list of every directory of a website URL?
I have to modify this code so the scraping keeps only the links that contain a specific keyword. In my case I'm scraping a newspaper page to find news related to the term 'Brexit'.
I've tried modifying the method parse_links so it only keeps the links (or 'a' tags) that contain 'Brexit' in them, but it doesn't seem to work.
Where should I place the condition?
import requests
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse


class MultiThreadScraper:

    def __init__(self, base_url):
        self.base_url = base_url
        self.root_url = '{}://{}'.format(urlparse(self.base_url).scheme, urlparse(self.base_url).netloc)
        self.pool = ThreadPoolExecutor(max_workers=20)
        self.scraped_pages = set([])
        self.to_crawl = Queue(10)
        self.to_crawl.put(self.base_url)

    def parse_links(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=True)
        for link in links:
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)

    def scrape_info(self, html):
        return

    def post_scrape_callback(self, res):
        result = res.result()
        if result and result.status_code == 200:
            self.parse_links(result.text)
            self.scrape_info(result.text)

    def scrape_page(self, url):
        try:
            res = requests.get(url, timeout=(3, 30))
            return res
        except requests.RequestException:
            return

    def run_scraper(self):
        while True:
            try:
                target_url = self.to_crawl.get(timeout=60)
                if target_url not in self.scraped_pages:
                    print("Scraping URL: {}".format(target_url))
                    self.scraped_pages.add(target_url)
                    job = self.pool.submit(self.scrape_page, target_url)
                    job.add_done_callback(self.post_scrape_callback)
            except Empty:
                return
            except Exception as e:
                print(e)
                continue


if __name__ == '__main__':
    s = MultiThreadScraper("https://elpais.com/")
    s.run_scraper()
You need to import the re module to match on the specific text value. Try the code below.
import re
links = soup.find_all('a', text=re.compile("Brexit"))
This should return only links which contain "Brexit".
You can get the text of the element by using the getText() method and check whether the string actually contains "Brexit":
if "Brexit" in link.getText().split():
url = link["href"]
I added a check in this function. See if that does the trick for you:
def parse_links(self, html):
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a', href=True)
    for link in links:
        if 'BREXIT' in link.text.upper():  # <------ new if statement
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)
I am trying to parse https://www.drugbank.ca/drugs. The idea is to extract all the drug names and some additional information for each drug. As you can see, each webpage presents a table with drug names, and when we click a drug name we can access that drug's information.
Let's say I will keep the following code to handle the pagination:
import requests
from bs4 import BeautifulSoup


def drug_data():
    url = 'https://www.drugbank.ca/drugs/'

    while url:
        print(url)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "lxml")

        # data = soup.select('name-head a')
        # for link in data:
        #     href = 'https://www.drugbank.ca/drugs/' + link.get('href')
        #     pages_data(href)

        # next page url
        url = soup.findAll('a', {'class': 'page-link', 'rel': 'next'})
        print(url)
        if url:
            url = 'https://www.drugbank.ca' + url[0].get('href')
        else:
            break


drug_data()
The issue is that on each page, and for each drug in the table of that page, I need to capture:
Name.
Accession Number.
Structured Indications.
Generic Prescription Products.
I used the classical requests/BeautifulSoup approach but can't go any deeper.
Some help please.
Create a function with requests and BeautifulSoup to get the data from each subpage:
import requests
from bs4 import BeautifulSoup


def get_details(url):
    print('details:', url)

    # get subpage
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")

    # get data on subpage
    dts = soup.findAll('dt')
    dds = soup.findAll('dd')

    # display details
    for dt, dd in zip(dts, dds):
        print(dt.text)
        print(dd.text)
        print('---')

    print('---------------------------')


def drug_data():
    url = 'https://www.drugbank.ca/drugs/'

    while url:
        print(url)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "lxml")

        # get links to subpages
        links = soup.select('strong a')
        for link in links:
            # execute function to get subpage
            get_details('https://www.drugbank.ca' + link['href'])

        # next page url
        url = soup.findAll('a', {'class': 'page-link', 'rel': 'next'})
        print(url)
        if url:
            url = 'https://www.drugbank.ca' + url[0].get('href')
        else:
            break


drug_data()
To crawl effectively, you'll want to implement a few measures, such as maintaining a queue of urls to visit and being aware of which urls you have already visited.
Keeping in mind that links can be absolute or relative and that redirects are very likely, you also probably want to construct the urls dynamically rather than by string concatenation.
Here is a generic (we usually only want to use example.com on SO) crawling workflow...
from urllib.parse import urljoin, urlparse  # python 3
# from urlparse import urljoin, urlparse  # legacy python2

import requests
from bs4 import BeautifulSoup


def process_page(soup):
    '''data extraction process'''
    pass


def is_external(link, base='example.com'):
    '''determine if the link is external to base'''
    site = urlparse(link).netloc
    return base not in site


def resolve_link(current_location, href):
    '''resolves final location of a link including redirects'''
    req_loc = urljoin(current_location, href)
    response = requests.head(req_loc)
    resolved_location = response.url  # location after redirects
    # if you don't want to visit external links...
    if is_external(resolved_location):
        return None
    return resolved_location


url_queue = ['https://example.com']
visited = set()

while url_queue:
    url = url_queue.pop()  # removes a url from the queue and assigns it to `url`
    response = requests.get(url)
    current_location = response.url  # final location after redirects
    visited.add(url)  # note that we've visited the given url
    visited.add(current_location)  # and the final location
    soup = BeautifulSoup(response.text, 'lxml')
    process_page(soup)  # scrape the page
    link_tags = soup.find_all('a')  # gather additional links
    for anchor in link_tags:
        href = anchor.get('href')
        link_location = resolve_link(current_location, href)
        if link_location and link_location not in visited:
            url_queue.append(link_location)
Hi, I wanted to create a mini crawler without using Scrapy. I created something like this:
response = requests.get(url)
homepage_link_list = []
soup = BeautifulSoup(response.content, 'lxml')
for link in soup.findAll("a"):
    if link.get("href"):
        homepage_link_list.append(link.get("href"))

link_list = []
for item in homepage_link_list:
    response = requests.get(item)
    soup = BeautifulSoup(response.content, 'lxml')
    for link in soup.findAll("a"):
        if link.get("href"):
            link_list.append(link.get("href"))
The problem I am encountering is that it only gets the links within the links of the homepage; how can I make it get all the links within all the links of the website?
You need a recursive call flow. I have written the class-oriented code below. The main points are as follows:
This implementation is depth-first.
Keep track of already-scraped URLs so that we don't scrape them again.
Ignore targets on a page, e.g. if given http://example.com#item1, ignore item1.
If https://example.com is already crawled, ignore http://example.com.
Discard trailing slashes, e.g. if http://example.com is already crawled, ignore http://example.com/.
''' Scraper.
'''

import re
from urllib.parse import urljoin, urlsplit, SplitResult

import requests
from bs4 import BeautifulSoup


class RecursiveScraper:
    ''' Scrape URLs in a recursive manner.
    '''
    def __init__(self, url):
        ''' Constructor to initialize domain name and main URL.
        '''
        self.domain = urlsplit(url).netloc
        self.mainurl = url
        self.urls = set()

    def preprocess_url(self, referrer, url):
        ''' Clean and filter URLs before scraping.
        '''
        if not url:
            return None

        fields = urlsplit(urljoin(referrer, url))._asdict()  # convert to absolute URLs and split
        fields['path'] = re.sub(r'/$', '', fields['path'])  # remove trailing /
        fields['fragment'] = ''  # remove targets within a page
        fields = SplitResult(**fields)

        if fields.netloc == self.domain:
            # Scrape pages of current domain only
            if fields.scheme == 'http':
                httpurl = cleanurl = fields.geturl()
                httpsurl = httpurl.replace('http:', 'https:', 1)
            else:
                httpsurl = cleanurl = fields.geturl()
                httpurl = httpsurl.replace('https:', 'http:', 1)

            if httpurl not in self.urls and httpsurl not in self.urls:
                # Return URL only if it's not already in list
                return cleanurl

        return None

    def scrape(self, url=None):
        ''' Scrape the URL and its outward links in a depth-first order.
            If URL argument is None, start from main page.
        '''
        if url is None:
            url = self.mainurl

        print("Scraping {:s} ...".format(url))
        self.urls.add(url)
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'lxml')
        for link in soup.findAll("a"):
            childurl = self.preprocess_url(url, link.get("href"))
            if childurl:
                self.scrape(childurl)


if __name__ == '__main__':
    rscraper = RecursiveScraper("http://bbc.com")
    rscraper.scrape()
    print(rscraper.urls)
It could be that the links you want to scrape are not actually links; they could be images. Sorry for writing this as an answer here; actually, I don't have enough reputation to comment.
Your code is not fetching all the links of the website because it is not recursive. You are fetching the homepage links and traversing the links available in the content of those homepage links, but you are not traversing the links you find in the content of the links you just traversed. My advice is to look at some tree traversal algorithms and develop a (recursive) traversal scheme based on one of them. The nodes of the tree represent the links, with the root node being the link you passed in at the beginning.
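As a rough sketch of that idea (not the poster's exact code; it assumes requests and BeautifulSoup, with http://example.com standing in for the real root URL), a breadth-first traversal over the link tree could look like this:

from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


def crawl(root):
    root_netloc = urlparse(root).netloc
    visited = set()
    queue = deque([root])              # links waiting to be traversed
    while queue:
        url = queue.popleft()          # breadth-first: oldest link first
        if url in visited:
            continue
        visited.add(url)
        try:
            html = requests.get(url, timeout=10).text
        except requests.RequestException:
            continue
        soup = BeautifulSoup(html, "html.parser")
        for a in soup.find_all("a", href=True):
            child = urljoin(url, a["href"])
            if urlparse(child).netloc == root_netloc and child not in visited:
                queue.append(child)
    return visited


print(crawl("http://example.com"))  # placeholder root URL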
This is 95% based on #coder.in.me's answer, but let me add another piece of code here that resolves an issue I was facing.
My issue was: "If you try to scrape a url like https://www.americanexpress.com/hu-hu/, it will only keep the https://www.americanexpress.com/ part of it and scrape all the Amex sites globally, but I don't need all the non-Hungarian pages."
You just need to change the
if fields.netloc == self.domain:
code to
if fields.netloc == self.domain and (fields.path.startswith('/hu-hu') or fields.path.startswith('/en-hu')):
Here is the modified code:
import re
from urllib.parse import urljoin, urlsplit, SplitResult

import requests
from bs4 import BeautifulSoup


class RecursiveScraper:
    ''' Scrape URLs in a recursive manner.
    '''
    def __init__(self, url):
        ''' Constructor to initialize domain name and main URL.
        '''
        self.domain = urlsplit(url).netloc
        self.mainurl = url
        self.urls = set()

    def preprocess_url(self, referrer, url):
        ''' Clean and filter URLs before scraping.
        '''
        if not url:
            return None

        fields = urlsplit(urljoin(referrer, url))._asdict()  # convert to absolute URLs and split
        fields['path'] = re.sub(r'/$', '', fields['path'])  # remove trailing /
        fields['fragment'] = ''  # remove targets within a page
        fields = SplitResult(**fields)

        # if fields.netloc == self.domain:
        if fields.netloc == self.domain and (fields.path.startswith('/hu-hu') or fields.path.startswith('/en-hu')):
            # Scrape pages of current domain only
            if fields.scheme == 'http':
                httpurl = cleanurl = fields.geturl()
                httpsurl = httpurl.replace('http:', 'https:', 1)
            else:
                httpsurl = cleanurl = fields.geturl()
                httpurl = httpsurl.replace('https:', 'http:', 1)

            if httpurl not in self.urls and httpsurl not in self.urls:
                # Return URL only if it's not already in list
                return cleanurl

        return None

    def scrape(self, url=None):
        ''' Scrape the URL and its outward links in a depth-first order.
            If URL argument is None, start from main page.
        '''
        if url is None:
            url = self.mainurl

        print("Scraping {:s} ...".format(url))
        try:
            response = requests.get(url)
            self.urls.add(url)
            soup = BeautifulSoup(response.content, 'lxml')
            for link in soup.findAll("a"):
                childurl = self.preprocess_url(url, link.get("href"))
                if childurl:
                    self.scrape(childurl)
        except requests.exceptions.SSLError:
            pass
        except requests.exceptions.InvalidSchema:
            pass


if __name__ == '__main__':
    rscraper = RecursiveScraper('https://www.americanexpress.com/hu-hu/')
    rscraper.scrape()
Thanks!
I am trying to get all the unique urls of the website by calling the all_pages function recursively, but this function is not giving all the urls of the website.
All I want to do is get all the unique urls of the website using BeautifulSoup. My code looks like this:
base_url = "http://www.readings.com.pk/"
unique_urls=[]
def all_pages(base_url,unique_urls=[]):
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "html.parser")
for link in soup.find_all("a"):
url = link["href"]
absolute_url = urljoin(base_url, url)
if absolute_url not in unique_urls:
if base_url in absolute_url:
unique_urls.append(absolute_url)
print (absolute_url)
all_pages(absolute_url,unique_urls,book_urls)
all_pages(base_url,unique_urls)
Use response.text instead of response.content
Also, you need to return at some point. Instead of making unique_urls a list, make it a set and the entries will always be unique.
Additionally, your method is recursive and Python has a maximum recursion depth, so maybe you should instead do this:
base_url = "http://www.readings.com.pk/"
def all_pages(base_url):
response = requests.get(base_url)
unique_urls = {base_url}
visited_urls = set()
while len(unique_urls) > len(visited_urls)
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.find_all("a"):
try:
url = link["href"]
except:
continue
absolute_url = base_url + url
unique_urls.add(absolute_url)
unvisited_url = (unique_urls - visited_urls).pop()
visited_urls.add(unvisited_url)
response = requests.get(unvisited_url)
return unique_urls
all_pages(base_url)