Using Beautiful Soup to get the full URL in source code - python

So I was looking at some source code and I came across this bit of code
<img src="/gallery/2012-winners-finalists/HM_Watching%20birds2_Shane%20Conklin_MA_2012.jpg"
now in the source code the link is blue and when you click it, it takes you to the full URL where that picture is located, I know how to get what is shown in the source code in Python using Beautiful Soup I was wondering though how to get the full URL you get once clicking the link in the source code?
EDIT:
if I was given <a href = "/folder/big/a.jpg" how do you figure out the starting part of that url through python or beautiful soup?

<a href="/folder/big/a.jpg">
That’s an absolute address for the current host. So if the HTML file is at http://example.com/foo/bar.html, then applying the url /folder/big/a.jpg will result in this:
http://example.com/folder/big/a.jpg
I.e. take the host name and apply the new path to it.
Python has the builtin urljoin function to perform this operation for you:
>>> from urllib.parse import urljoin
>>> base = 'http://example.com/foo/bar.html'
>>> href = '/folder/big/a.jpg'
>>> urljoin(base, href)
'http://example.com/folder/big/a.jpg'
For Python 2, the function is within the urlparse module.

from bs4 import BeautifulSoup
import requests
import lxml
r = requests.get("http://example.com")
url = r.url # this is base url
data = r.content # this is content of page
soup = BeautifulSoup(data, 'lxml')
temp_url = soup.find('a')['href'] # you need to modify this selector
if temp_url[0:7] == "http://" or temp_url[0:8] == "https://" : # if url have http://
url = temp_url
else:
url = url + temp_url
print url # this is your full url

import os
current_url = 'https://example.com/b/c.html?a=1&b=2'
href = '/folder/big/a.jpg'
absolute_url = os.path.dirname(current_url) + href
print(absolute_url)

Related

Beautifulsoup extracting urls from a given website menu

Hello every one I'm new to beautifulsoup, I'm trying to write a function that will be able to extract second level urls from a given website.
For example if I have this website url : https://edition.cnn.com/ my function should be able to return
https://edition.cnn.com/world
https://edition.cnn.com/politics
https://edition.cnn.com/business
https://edition.cnn.com/health
https://edition.cnn.com/entertainment
https://edition.cnn.com/style
https://edition.cnn.com/travel
first I have tried this code to retrieve all links starting with the string of the url:
from bs4 import BeautifulSoup as bs4
import requests
import lxml
import re
def getLinks(url):
response = requests.get(url)
data = response.text
soup = bs4(data, 'lxml')
links = []
for link in soup.find_all('a', href=re.compile(str(url))):
links.append(link.get('href'))
return links
But then again the actual output is giving me all the links even links of articles which is not I'm looking for. is there a method that I can use to get what I want using regular expression or others.
The links are inside <nav> tag, so using CSS selector nav a[href] will select only links inside <nav> tag:
import requests
from bs4 import BeautifulSoup
url = 'https://edition.cnn.com'
soup = BeautifulSoup(requests.get(url).text, 'lxml')
for a in soup.select('nav a[href]'):
if a['href'].count('/') > 1 or '#' in a['href']:
continue
print(url + a['href'])
Prints:
https://edition.cnn.com/world
https://edition.cnn.com/politics
https://edition.cnn.com/business
https://edition.cnn.com/health
https://edition.cnn.com/entertainment
https://edition.cnn.com/style
https://edition.cnn.com/travel
https://edition.cnn.com/sport
https://edition.cnn.com/videos
https://edition.cnn.com/world
https://edition.cnn.com/africa
https://edition.cnn.com/americas
https://edition.cnn.com/asia
https://edition.cnn.com/australia
https://edition.cnn.com/china
https://edition.cnn.com/europe
https://edition.cnn.com/india
https://edition.cnn.com/middle-east
https://edition.cnn.com/uk
...and so on.

Need to take 'href' value from 'a' tag in following output [duplicate]

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 2 years ago.
Improve this question
How can I retrieve the links of a webpage and copy the url address of the links using Python?
Here's a short snippet using the SoupStrainer class in BeautifulSoup:
import httplib2
from bs4 import BeautifulSoup, SoupStrainer
http = httplib2.Http()
status, response = http.request('http://www.nytimes.com')
for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
if link.has_attr('href'):
print(link['href'])
The BeautifulSoup documentation is actually quite good, and covers a number of typical scenarios:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
Edit: Note that I used the SoupStrainer class because it's a bit more efficient (memory and speed wise), if you know what you're parsing in advance.
For completeness sake, the BeautifulSoup 4 version, making use of the encoding supplied by the server as well:
from bs4 import BeautifulSoup
import urllib.request
parser = 'html.parser' # or 'lxml' (preferred) or 'html5lib', if installed
resp = urllib.request.urlopen("http://www.gpsbasecamp.com/national-parks")
soup = BeautifulSoup(resp, parser, from_encoding=resp.info().get_param('charset'))
for link in soup.find_all('a', href=True):
print(link['href'])
or the Python 2 version:
from bs4 import BeautifulSoup
import urllib2
parser = 'html.parser' # or 'lxml' (preferred) or 'html5lib', if installed
resp = urllib2.urlopen("http://www.gpsbasecamp.com/national-parks")
soup = BeautifulSoup(resp, parser, from_encoding=resp.info().getparam('charset'))
for link in soup.find_all('a', href=True):
print link['href']
and a version using the requests library, which as written will work in both Python 2 and 3:
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
import requests
parser = 'html.parser' # or 'lxml' (preferred) or 'html5lib', if installed
resp = requests.get("http://www.gpsbasecamp.com/national-parks")
http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
encoding = html_encoding or http_encoding
soup = BeautifulSoup(resp.content, parser, from_encoding=encoding)
for link in soup.find_all('a', href=True):
print(link['href'])
The soup.find_all('a', href=True) call finds all <a> elements that have an href attribute; elements without the attribute are skipped.
BeautifulSoup 3 stopped development in March 2012; new projects really should use BeautifulSoup 4, always.
Note that you should leave decoding the HTML from bytes to BeautifulSoup. You can inform BeautifulSoup of the characterset found in the HTTP response headers to assist in decoding, but this can be wrong and conflicting with a <meta> header info found in the HTML itself, which is why the above uses the BeautifulSoup internal class method EncodingDetector.find_declared_encoding() to make sure that such embedded encoding hints win over a misconfigured server.
With requests, the response.encoding attribute defaults to Latin-1 if the response has a text/* mimetype, even if no characterset was returned. This is consistent with the HTTP RFCs but painful when used with HTML parsing, so you should ignore that attribute when no charset is set in the Content-Type header.
Others have recommended BeautifulSoup, but it's much better to use lxml. Despite its name, it is also for parsing and scraping HTML. It's much, much faster than BeautifulSoup, and it even handles "broken" HTML better than BeautifulSoup (their claim to fame). It has a compatibility API for BeautifulSoup too if you don't want to learn the lxml API.
Ian Blicking agrees.
There's no reason to use BeautifulSoup anymore, unless you're on Google App Engine or something where anything not purely Python isn't allowed.
lxml.html also supports CSS3 selectors so this sort of thing is trivial.
An example with lxml and xpath would look like this:
import urllib
import lxml.html
connection = urllib.urlopen('http://www.nytimes.com')
dom = lxml.html.fromstring(connection.read())
for link in dom.xpath('//a/#href'): # select the url in href for all a tags(links)
print link
import urllib2
import BeautifulSoup
request = urllib2.Request("http://www.gpsbasecamp.com/national-parks")
response = urllib2.urlopen(request)
soup = BeautifulSoup.BeautifulSoup(response)
for a in soup.findAll('a'):
if 'national-park' in a['href']:
print 'found a url with national-park in the link'
The following code is to retrieve all the links available in a webpage using urllib2 and BeautifulSoup4:
import urllib2
from bs4 import BeautifulSoup
url = urllib2.urlopen("http://www.espncricinfo.com/").read()
soup = BeautifulSoup(url)
for line in soup.find_all('a'):
print(line.get('href'))
Links can be within a variety of attributes so you could pass a list of those attributes to select.
For example, with src and href attributes (here I am using the starts with ^ operator to specify that either of these attributes values starts with http):
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('https://stackoverflow.com/')
soup = bs(r.content, 'lxml')
links = [item['href'] if item.get('href') is not None else item['src'] for item in soup.select('[href^="http"], [src^="http"]') ]
print(links)
Attribute = value selectors
[attr^=value]
Represents elements with an attribute name of attr whose value is prefixed (preceded) by value.
There are also the commonly used $ (ends with) and * (contains) operators. For a full syntax list see the link above.
Under the hood BeautifulSoup now uses lxml. Requests, lxml & list comprehensions makes a killer combo.
import requests
import lxml.html
dom = lxml.html.fromstring(requests.get('http://www.nytimes.com').content)
[x for x in dom.xpath('//a/#href') if '//' in x and 'nytimes.com' not in x]
In the list comp, the "if '//' and 'url.com' not in x" is a simple method to scrub the url list of the sites 'internal' navigation urls, etc.
just for getting the links, without B.soup and regex:
import urllib2
url="http://www.somewhere.com"
page=urllib2.urlopen(url)
data=page.read().split("</a>")
tag="<a href=\""
endtag="\">"
for item in data:
if "<a href" in item:
try:
ind = item.index(tag)
item=item[ind+len(tag):]
end=item.index(endtag)
except: pass
else:
print item[:end]
for more complex operations, of course BSoup is still preferred.
This script does what your looking for, But also resolves the relative links to absolute links.
import urllib
import lxml.html
import urlparse
def get_dom(url):
connection = urllib.urlopen(url)
return lxml.html.fromstring(connection.read())
def get_links(url):
return resolve_links((link for link in get_dom(url).xpath('//a/#href')))
def guess_root(links):
for link in links:
if link.startswith('http'):
parsed_link = urlparse.urlparse(link)
scheme = parsed_link.scheme + '://'
netloc = parsed_link.netloc
return scheme + netloc
def resolve_links(links):
root = guess_root(links)
for link in links:
if not link.startswith('http'):
link = urlparse.urljoin(root, link)
yield link
for link in get_links('http://www.google.com'):
print link
To find all the links, we will in this example use the urllib2 module together
with the re.module
*One of the most powerful function in the re module is "re.findall()".
While re.search() is used to find the first match for a pattern, re.findall() finds all
the matches and returns them as a list of strings, with each string representing one match*
import urllib2
import re
#connect to a URL
website = urllib2.urlopen(url)
#read html code
html = website.read()
#use re.findall to get all the links
links = re.findall('"((http|ftp)s?://.*?)"', html)
print links
Why not use regular expressions:
import urllib2
import re
url = "http://www.somewhere.com"
page = urllib2.urlopen(url)
page = page.read()
links = re.findall(r"<a.*?\s*href=\"(.*?)\".*?>(.*?)</a>", page)
for link in links:
print('href: %s, HTML text: %s' % (link[0], link[1]))
Here's an example using #ars accepted answer and the BeautifulSoup4, requests, and wget modules to handle the downloads.
import requests
import wget
import os
from bs4 import BeautifulSoup, SoupStrainer
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/eeg-mld/eeg_full/'
file_type = '.tar.gz'
response = requests.get(url)
for link in BeautifulSoup(response.content, 'html.parser', parse_only=SoupStrainer('a')):
if link.has_attr('href'):
if file_type in link['href']:
full_path = url + link['href']
wget.download(full_path)
I found the answer by #Blairg23 working , after the following correction (covering the scenario where it failed to work correctly):
for link in BeautifulSoup(response.content, 'html.parser', parse_only=SoupStrainer('a')):
if link.has_attr('href'):
if file_type in link['href']:
full_path =urlparse.urljoin(url , link['href']) #module urlparse need to be imported
wget.download(full_path)
For Python 3:
urllib.parse.urljoin has to be used in order to obtain the full URL instead.
BeatifulSoup's own parser can be slow. It might be more feasible to use lxml which is capable of parsing directly from a URL (with some limitations mentioned below).
import lxml.html
doc = lxml.html.parse(url)
links = doc.xpath('//a[#href]')
for link in links:
print link.attrib['href']
The code above will return the links as is, and in most cases they would be relative links or absolute from the site root. Since my use case was to only extract a certain type of links, below is a version that converts the links to full URLs and which optionally accepts a glob pattern like *.mp3. It won't handle single and double dots in the relative paths though, but so far I didn't have the need for it. If you need to parse URL fragments containing ../ or ./ then urlparse.urljoin might come in handy.
NOTE: Direct lxml url parsing doesn't handle loading from https and doesn't do redirects, so for this reason the version below is using urllib2 + lxml.
#!/usr/bin/env python
import sys
import urllib2
import urlparse
import lxml.html
import fnmatch
try:
import urltools as urltools
except ImportError:
sys.stderr.write('To normalize URLs run: `pip install urltools --user`')
urltools = None
def get_host(url):
p = urlparse.urlparse(url)
return "{}://{}".format(p.scheme, p.netloc)
if __name__ == '__main__':
url = sys.argv[1]
host = get_host(url)
glob_patt = len(sys.argv) > 2 and sys.argv[2] or '*'
doc = lxml.html.parse(urllib2.urlopen(url))
links = doc.xpath('//a[#href]')
for link in links:
href = link.attrib['href']
if fnmatch.fnmatch(href, glob_patt):
if not href.startswith(('http://', 'https://' 'ftp://')):
if href.startswith('/'):
href = host + href
else:
parent_url = url.rsplit('/', 1)[0]
href = urlparse.urljoin(parent_url, href)
if urltools:
href = urltools.normalize(href)
print href
The usage is as follows:
getlinks.py http://stackoverflow.com/a/37758066/191246
getlinks.py http://stackoverflow.com/a/37758066/191246 "*users*"
getlinks.py http://fakedomain.mu/somepage.html "*.mp3"
There can be many duplicate links together with both external and internal links. To differentiate between the two and just get unique links using sets:
# Python 3.
import urllib
from bs4 import BeautifulSoup
url = "http://www.espncricinfo.com/"
resp = urllib.request.urlopen(url)
# Get server encoding per recommendation of Martijn Pieters.
soup = BeautifulSoup(resp, from_encoding=resp.info().get_param('charset'))
external_links = set()
internal_links = set()
for line in soup.find_all('a'):
link = line.get('href')
if not link:
continue
if link.startswith('http'):
external_links.add(link)
else:
internal_links.add(link)
# Depending on usage, full internal links may be preferred.
full_internal_links = {
urllib.parse.urljoin(url, internal_link)
for internal_link in internal_links
}
# Print all unique external and full internal links.
for link in external_links.union(full_internal_links):
print(link)
import urllib2
from bs4 import BeautifulSoup
a=urllib2.urlopen('http://dir.yahoo.com')
code=a.read()
soup=BeautifulSoup(code)
links=soup.findAll("a")
#To get href part alone
print links[0].attrs['href']

Python Beautiful Soup img tag inside a div parsing wrong link showing

I have this code:
import urllib
import urllib.request
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
theurl= 'http://es.ninemanga.com/chapter/Dragon%20Ball%20Multiverse/279006.html'
req = Request(theurl + '.html', headers={'User-Agent': 'Mozilla/5.0'})
thepage = urlopen(req).read()
soup = BeautifulSoup(thepage, "html.parser")
for divs in soup.findAll('div', {"class": "pic_box"}):
temp = divs.find('img', {"id" : "manga_pic_1"})
temp1 = temp.get('src')
print(temp1 + "\n")
I want to get all div tags with a class pic_box and inside of them all the img tags and their src
I have done this correctly with soup.findAll('div', {"class": "pic_box"})
and then temp.get('src') but somehow I get:
http://a8.ninemanga.com/es_manga/43/555/279006/4c58c372ca4561627e5a01f6c841290e.jpg
instead of:
https://c5.ninemanga.com/es_manga/43/555/279006/939559ac8d7af80cf6b4ead0ada4f718.jpg
Are they somehow blocking my request or am I doing something wrong here?
repl to test it
referenced link in theurl variable from which I want to extract 'src'
Looks like they can detect scraping requests and block them. Even using a fake agent is not working(i tried). Try out something like Selenium , that can automate browser activity and download it through your browser itself.
the image has unique class attribute - 'manga_pic' get image with manga_pic class

Returning webSITE links Python Beautifulsoup

I am using Python 3.5 with beautifulsoup (bs4) and urllib. The code I will append returns all the links for ONE page.
How do I loop this so it runs across all pages in the website, using the links found on each page to dictate which pages are to be scraped next. As I don't know how many hops I need to go.
I have tried looping it of course, but it never stops as pages contain links to pages I have already scanned. I have tried creating sets of the links I have scanned putting in IF not in set ... but again it just runs forever.
import bs4
import re
import urllib.request
website = 'http://elderscrolls.wikia.com/wiki/Skyrim'
req = urllib.request.Request(website)
with urllib.request.urlopen(req) as response:
the_page = response.read()#store web page html
dSite = bs4.BeautifulSoup(the_page, "html.parser")
links = []
for link in dSite.find_all('a'):#grab all links on page
links.append(link.get('href'))
siteOnly = re.split('/', website)
validLinks = set()
for item in links:
if re.search('^/' +siteOnly[3] + '/', str(item)):#filter links to local website
newLink = 'http://' + str(siteOnly[2]) + str(item)
validLinks.add(newLink)
print(validLinks)
import bs4, requests
from urllib.parse import urljoin
base_url = 'http://elderscrolls.wikia.com/wiki/Skyrim'
response = requests.get(base_url)
soup = bs4.BeautifulSoup(response.text, 'lxml')
local_a_tags = soup.select('a[href^="/wiki/"]')
links = [a['href']for a in local_a_tags]
full_links = [urljoin(base_url, link) for link in links]
print (full_links)
out:
http://elderscrolls.wikia.com/wiki/The_Elder_Scrolls_Wiki
http://elderscrolls.wikia.com/wiki/Portal:Online
http://elderscrolls.wikia.com/wiki/Quests_(Online)
http://elderscrolls.wikia.com/wiki/Main_Quest_(Online)
http://elderscrolls.wikia.com/wiki/Aldmeri_Dominion_Quests
http://elderscrolls.wikia.com/wiki/Daggerfall_Covenant_Quests
http://elderscrolls.wikia.com/wiki/Ebonheart_Pact_Quests
http://elderscrolls.wikia.com/wiki/Category:Online:_Side_Quests
http://elderscrolls.wikia.com/wiki/Factions_(Online)
http://elderscrolls.wikia.com/wiki/Aldmeri_Dominion_(Online)
http://elderscrolls.wikia.com/wiki/Daggerfall_Covenant
http://elderscrolls.wikia.com/wiki/Ebonheart_Pact
http://elderscrolls.wikia.com/wiki/Classes_(Online)
http://elderscrolls.wikia.com/wiki/Dragonknight
http://elderscrolls.wikia.com/wiki/Sorcerer_(Online)
http://elderscrolls.wikia.com/wiki/Nightblade_(Online)
http://elderscrolls.wikia.com/wiki/Templar
http://elderscrolls.wikia.com/wiki/Races_(Online)
http://elderscrolls.wikia.com/wiki/Altmer_(Online)
http://elderscrolls.wikia.com/wiki/Argonian_(Online)
http://elderscrolls.wikia.com/wiki/Bosmer_(Online)
http://elderscrolls.wikia.com/wiki/Breton_(Online)
http://elderscrolls.wikia.com/wiki/Dunmer_(Online)
http://elderscrolls.wikia.com/wiki/Imperial_(Online)
http://elderscrolls.wikia.com/wiki/Khajiit_(Online)
http://elderscrolls.wikia.com/wiki/Nord_(Online)
http://elderscrolls.wikia.com/wiki/Orsimer_(Online)
http://elderscrolls.wikia.com/wiki/Redguard_(Online)
http://elderscrolls.wikia.com/wiki/Locations_(Online)
http://elderscrolls.wikia.com/wiki/Regions_(Online)
http://elderscrolls.wikia.com/wiki/Category:Online:_Realms
http://elderscrolls.wikia.com/wiki/Category:Online:_Cities
http://elderscrolls.wikia.com/wiki/Category:Online:_Dungeons
http://elderscrolls.wikia.com/wiki/Category:Online:_Dark_Anchors
http://elderscrolls.wikia.com/wiki/Wayshrines_(Online)
http://elderscrolls.wikia.com/wiki/Category:Online:_Unmarked_Locations
http://elderscrolls.wikia.com/wiki/Combat_(Online)
http://elderscrolls.wikia.com/wiki/Skills_(Online)
http://elderscrolls.wikia.com/wiki/Ultimate_Skills
http://elderscrolls.wikia.com/wiki/Synergy
http://elderscrolls.wikia.com/wiki/Finesse
http://elderscrolls.wikia.com/wiki/Add-ons
First, use requests instead of urllib.
Than, use Beautifulsoup CSS selector can to filter the href base on the start, you can refer the document to learn more.
Finally, use urljoin to convert relative url to absoulte url
>>> from urllib.parse import urljoin
>>> urljoin('http://www.cwi.nl/%7Eguido/Python.html', 'FAQ.html')
'http://www.cwi.nl/%7Eguido/FAQ.html'

retrieve links from web page using python and BeautifulSoup [closed]

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 2 years ago.
Improve this question
How can I retrieve the links of a webpage and copy the url address of the links using Python?
Here's a short snippet using the SoupStrainer class in BeautifulSoup:
import httplib2
from bs4 import BeautifulSoup, SoupStrainer
http = httplib2.Http()
status, response = http.request('http://www.nytimes.com')
for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
if link.has_attr('href'):
print(link['href'])
The BeautifulSoup documentation is actually quite good, and covers a number of typical scenarios:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
Edit: Note that I used the SoupStrainer class because it's a bit more efficient (memory and speed wise), if you know what you're parsing in advance.
For completeness sake, the BeautifulSoup 4 version, making use of the encoding supplied by the server as well:
from bs4 import BeautifulSoup
import urllib.request
parser = 'html.parser' # or 'lxml' (preferred) or 'html5lib', if installed
resp = urllib.request.urlopen("http://www.gpsbasecamp.com/national-parks")
soup = BeautifulSoup(resp, parser, from_encoding=resp.info().get_param('charset'))
for link in soup.find_all('a', href=True):
print(link['href'])
or the Python 2 version:
from bs4 import BeautifulSoup
import urllib2
parser = 'html.parser' # or 'lxml' (preferred) or 'html5lib', if installed
resp = urllib2.urlopen("http://www.gpsbasecamp.com/national-parks")
soup = BeautifulSoup(resp, parser, from_encoding=resp.info().getparam('charset'))
for link in soup.find_all('a', href=True):
print link['href']
and a version using the requests library, which as written will work in both Python 2 and 3:
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
import requests
parser = 'html.parser' # or 'lxml' (preferred) or 'html5lib', if installed
resp = requests.get("http://www.gpsbasecamp.com/national-parks")
http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
encoding = html_encoding or http_encoding
soup = BeautifulSoup(resp.content, parser, from_encoding=encoding)
for link in soup.find_all('a', href=True):
print(link['href'])
The soup.find_all('a', href=True) call finds all <a> elements that have an href attribute; elements without the attribute are skipped.
BeautifulSoup 3 stopped development in March 2012; new projects really should use BeautifulSoup 4, always.
Note that you should leave decoding the HTML from bytes to BeautifulSoup. You can inform BeautifulSoup of the characterset found in the HTTP response headers to assist in decoding, but this can be wrong and conflicting with a <meta> header info found in the HTML itself, which is why the above uses the BeautifulSoup internal class method EncodingDetector.find_declared_encoding() to make sure that such embedded encoding hints win over a misconfigured server.
With requests, the response.encoding attribute defaults to Latin-1 if the response has a text/* mimetype, even if no characterset was returned. This is consistent with the HTTP RFCs but painful when used with HTML parsing, so you should ignore that attribute when no charset is set in the Content-Type header.
Others have recommended BeautifulSoup, but it's much better to use lxml. Despite its name, it is also for parsing and scraping HTML. It's much, much faster than BeautifulSoup, and it even handles "broken" HTML better than BeautifulSoup (their claim to fame). It has a compatibility API for BeautifulSoup too if you don't want to learn the lxml API.
Ian Blicking agrees.
There's no reason to use BeautifulSoup anymore, unless you're on Google App Engine or something where anything not purely Python isn't allowed.
lxml.html also supports CSS3 selectors so this sort of thing is trivial.
An example with lxml and xpath would look like this:
import urllib
import lxml.html
connection = urllib.urlopen('http://www.nytimes.com')
dom = lxml.html.fromstring(connection.read())
for link in dom.xpath('//a/#href'): # select the url in href for all a tags(links)
print link
import urllib2
import BeautifulSoup
request = urllib2.Request("http://www.gpsbasecamp.com/national-parks")
response = urllib2.urlopen(request)
soup = BeautifulSoup.BeautifulSoup(response)
for a in soup.findAll('a'):
if 'national-park' in a['href']:
print 'found a url with national-park in the link'
The following code is to retrieve all the links available in a webpage using urllib2 and BeautifulSoup4:
import urllib2
from bs4 import BeautifulSoup
url = urllib2.urlopen("http://www.espncricinfo.com/").read()
soup = BeautifulSoup(url)
for line in soup.find_all('a'):
print(line.get('href'))
Links can be within a variety of attributes so you could pass a list of those attributes to select.
For example, with src and href attributes (here I am using the starts with ^ operator to specify that either of these attributes values starts with http):
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('https://stackoverflow.com/')
soup = bs(r.content, 'lxml')
links = [item['href'] if item.get('href') is not None else item['src'] for item in soup.select('[href^="http"], [src^="http"]') ]
print(links)
Attribute = value selectors
[attr^=value]
Represents elements with an attribute name of attr whose value is prefixed (preceded) by value.
There are also the commonly used $ (ends with) and * (contains) operators. For a full syntax list see the link above.
Under the hood BeautifulSoup now uses lxml. Requests, lxml & list comprehensions makes a killer combo.
import requests
import lxml.html
dom = lxml.html.fromstring(requests.get('http://www.nytimes.com').content)
[x for x in dom.xpath('//a/#href') if '//' in x and 'nytimes.com' not in x]
In the list comp, the "if '//' and 'url.com' not in x" is a simple method to scrub the url list of the sites 'internal' navigation urls, etc.
just for getting the links, without B.soup and regex:
import urllib2
url="http://www.somewhere.com"
page=urllib2.urlopen(url)
data=page.read().split("</a>")
tag="<a href=\""
endtag="\">"
for item in data:
if "<a href" in item:
try:
ind = item.index(tag)
item=item[ind+len(tag):]
end=item.index(endtag)
except: pass
else:
print item[:end]
for more complex operations, of course BSoup is still preferred.
This script does what your looking for, But also resolves the relative links to absolute links.
import urllib
import lxml.html
import urlparse
def get_dom(url):
connection = urllib.urlopen(url)
return lxml.html.fromstring(connection.read())
def get_links(url):
return resolve_links((link for link in get_dom(url).xpath('//a/#href')))
def guess_root(links):
for link in links:
if link.startswith('http'):
parsed_link = urlparse.urlparse(link)
scheme = parsed_link.scheme + '://'
netloc = parsed_link.netloc
return scheme + netloc
def resolve_links(links):
root = guess_root(links)
for link in links:
if not link.startswith('http'):
link = urlparse.urljoin(root, link)
yield link
for link in get_links('http://www.google.com'):
print link
To find all the links, we will in this example use the urllib2 module together
with the re.module
*One of the most powerful function in the re module is "re.findall()".
While re.search() is used to find the first match for a pattern, re.findall() finds all
the matches and returns them as a list of strings, with each string representing one match*
import urllib2
import re
#connect to a URL
website = urllib2.urlopen(url)
#read html code
html = website.read()
#use re.findall to get all the links
links = re.findall('"((http|ftp)s?://.*?)"', html)
print links
Why not use regular expressions:
import urllib2
import re
url = "http://www.somewhere.com"
page = urllib2.urlopen(url)
page = page.read()
links = re.findall(r"<a.*?\s*href=\"(.*?)\".*?>(.*?)</a>", page)
for link in links:
print('href: %s, HTML text: %s' % (link[0], link[1]))
Here's an example using #ars accepted answer and the BeautifulSoup4, requests, and wget modules to handle the downloads.
import requests
import wget
import os
from bs4 import BeautifulSoup, SoupStrainer
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/eeg-mld/eeg_full/'
file_type = '.tar.gz'
response = requests.get(url)
for link in BeautifulSoup(response.content, 'html.parser', parse_only=SoupStrainer('a')):
if link.has_attr('href'):
if file_type in link['href']:
full_path = url + link['href']
wget.download(full_path)
I found the answer by #Blairg23 working , after the following correction (covering the scenario where it failed to work correctly):
for link in BeautifulSoup(response.content, 'html.parser', parse_only=SoupStrainer('a')):
if link.has_attr('href'):
if file_type in link['href']:
full_path =urlparse.urljoin(url , link['href']) #module urlparse need to be imported
wget.download(full_path)
For Python 3:
urllib.parse.urljoin has to be used in order to obtain the full URL instead.
BeatifulSoup's own parser can be slow. It might be more feasible to use lxml which is capable of parsing directly from a URL (with some limitations mentioned below).
import lxml.html
doc = lxml.html.parse(url)
links = doc.xpath('//a[#href]')
for link in links:
print link.attrib['href']
The code above will return the links as is, and in most cases they would be relative links or absolute from the site root. Since my use case was to only extract a certain type of links, below is a version that converts the links to full URLs and which optionally accepts a glob pattern like *.mp3. It won't handle single and double dots in the relative paths though, but so far I didn't have the need for it. If you need to parse URL fragments containing ../ or ./ then urlparse.urljoin might come in handy.
NOTE: Direct lxml url parsing doesn't handle loading from https and doesn't do redirects, so for this reason the version below is using urllib2 + lxml.
#!/usr/bin/env python
import sys
import urllib2
import urlparse
import lxml.html
import fnmatch
try:
import urltools as urltools
except ImportError:
sys.stderr.write('To normalize URLs run: `pip install urltools --user`')
urltools = None
def get_host(url):
p = urlparse.urlparse(url)
return "{}://{}".format(p.scheme, p.netloc)
if __name__ == '__main__':
url = sys.argv[1]
host = get_host(url)
glob_patt = len(sys.argv) > 2 and sys.argv[2] or '*'
doc = lxml.html.parse(urllib2.urlopen(url))
links = doc.xpath('//a[#href]')
for link in links:
href = link.attrib['href']
if fnmatch.fnmatch(href, glob_patt):
if not href.startswith(('http://', 'https://' 'ftp://')):
if href.startswith('/'):
href = host + href
else:
parent_url = url.rsplit('/', 1)[0]
href = urlparse.urljoin(parent_url, href)
if urltools:
href = urltools.normalize(href)
print href
The usage is as follows:
getlinks.py http://stackoverflow.com/a/37758066/191246
getlinks.py http://stackoverflow.com/a/37758066/191246 "*users*"
getlinks.py http://fakedomain.mu/somepage.html "*.mp3"
There can be many duplicate links together with both external and internal links. To differentiate between the two and just get unique links using sets:
# Python 3.
import urllib
from bs4 import BeautifulSoup
url = "http://www.espncricinfo.com/"
resp = urllib.request.urlopen(url)
# Get server encoding per recommendation of Martijn Pieters.
soup = BeautifulSoup(resp, from_encoding=resp.info().get_param('charset'))
external_links = set()
internal_links = set()
for line in soup.find_all('a'):
link = line.get('href')
if not link:
continue
if link.startswith('http'):
external_links.add(link)
else:
internal_links.add(link)
# Depending on usage, full internal links may be preferred.
full_internal_links = {
urllib.parse.urljoin(url, internal_link)
for internal_link in internal_links
}
# Print all unique external and full internal links.
for link in external_links.union(full_internal_links):
print(link)
import urllib2
from bs4 import BeautifulSoup
a=urllib2.urlopen('http://dir.yahoo.com')
code=a.read()
soup=BeautifulSoup(code)
links=soup.findAll("a")
#To get href part alone
print links[0].attrs['href']

Categories