How can I get href links from HTML using Python? - python

import urllib2
website = "WEBSITE"
openwebsite = urllib2.urlopen(website)
html = getwebsite.read()
print html
So far so good.
But I want only href links from the plain text HTML. How can I solve this problem?

Try with Beautifulsoup:
from BeautifulSoup import BeautifulSoup
import urllib2
import re
html_page = urllib2.urlopen("http://www.yourwebsite.com")
soup = BeautifulSoup(html_page)
for link in soup.findAll('a'):
print link.get('href')
In case you just want links starting with http://, you should use:
soup.findAll('a', attrs={'href': re.compile("^http://")})
In Python 3 with BS4 it should be:
from bs4 import BeautifulSoup
import urllib.request
html_page = urllib.request.urlopen("http://www.yourwebsite.com")
soup = BeautifulSoup(html_page, "html.parser")
for link in soup.findAll('a'):
print(link.get('href'))

You can use the HTMLParser module.
The code would probably look something like this:
from HTMLParser import HTMLParser
class MyHTMLParser(HTMLParser):
def handle_starttag(self, tag, attrs):
# Only parse the 'anchor' tag.
if tag == "a":
# Check the list of defined attributes.
for name, value in attrs:
# If href is defined, print it.
if name == "href":
print name, "=", value
parser = MyHTMLParser()
parser.feed(your_html_string)
Note: The HTMLParser module has been renamed to html.parser in Python 3.0. The 2to3 tool will automatically adapt imports when converting your sources to 3.0.

Look at using the beautiful soup html parsing library.
http://www.crummy.com/software/BeautifulSoup/
You will do something like this:
import BeautifulSoup
soup = BeautifulSoup.BeautifulSoup(html)
for link in soup.findAll("a"):
print link.get("href")

Using BS4 for this specific task seems overkill.
Try instead:
website = urllib2.urlopen('http://10.123.123.5/foo_images/Repo/')
html = website.read()
files = re.findall('href="(.*tgz|.*tar.gz)"', html)
print sorted(x for x in (files))
I found this nifty piece of code on http://www.pythonforbeginners.com/code/regular-expression-re-findall and works for me quite well.
I tested it only on my scenario of extracting a list of files from a web folder that exposes the files\folder in it, e.g.:
and I got a sorted list of the files\folders under the URL

My answer probably sucks compared to the real gurus out there, but using some simple math, string slicing, find and urllib, this little script will create a list containing link elements. I test google and my output seems right. Hope it helps!
import urllib
test = urllib.urlopen("http://www.google.com").read()
sane = 0
needlestack = []
while sane == 0:
curpos = test.find("href")
if curpos >= 0:
testlen = len(test)
test = test[curpos:testlen]
curpos = test.find('"')
testlen = len(test)
test = test[curpos+1:testlen]
curpos = test.find('"')
needle = test[0:curpos]
if needle.startswith("http" or "www"):
needlestack.append(needle)
else:
sane = 1
for item in needlestack:
print item

Using requests with BeautifulSoup and Python 3:
import requests
from bs4 import BeautifulSoup
page = requests.get('http://www.website.com')
bs = BeautifulSoup(page.content, features='lxml')
for link in bs.findAll('a'):
print(link.get('href'))

This is way late to answer but it will work for latest python users:
from bs4 import BeautifulSoup
import requests
html_page = requests.get('http://www.example.com').text
soup = BeautifulSoup(html_page, "lxml")
for link in soup.findAll('a'):
print(link.get('href'))
Don't forget to install "requests" and "BeautifulSoup" package and also "lxml". Use .text along with get otherwise it will throw an exception.
"lxml" is used to remove that warning of which parser to be used. You can also use "html.parser" whichever fits your case.

Here's a lazy version of #stephen's answer
import html.parser
import itertools
import urllib.request
class LinkParser(html.parser.HTMLParser):
def reset(self):
super().reset()
self.links = iter([])
def handle_starttag(self, tag, attrs):
if tag == 'a':
for (name, value) in attrs:
if name == 'href':
self.links = itertools.chain(self.links, [value])
def gen_links(stream, parser):
encoding = stream.headers.get_content_charset() or 'UTF-8'
for line in stream:
parser.feed(line.decode(encoding))
yield from parser.links
Use it like so:
>>> parser = LinkParser()
>>> stream = urllib.request.urlopen('http://stackoverflow.com/questions/3075550')
>>> links = gen_links(stream, parser)
>>> next(links)
'//stackoverflow.com'

This answer is similar to others with requests and BeautifulSoup, but using list comprehension.
Because find_all() is the most popular method in the Beautiful Soup search API, you can use soup("a") as a shortcut of soup.findAll("a") and using list comprehension:
import requests
from bs4 import BeautifulSoup
URL = "http://www.yourwebsite.com"
page = requests.get(URL)
soup = BeautifulSoup(page.content, features='lxml')
# Find links
all_links = [link.get("href") for link in soup("a")]
# Only external links
ext_links = [link.get("href") for link in soup("a") if "http" in link.get("href")]
https://www.crummy.com/software/BeautifulSoup/bs4/doc/#calling-a-tag-is-like-calling-find-all

Simplest way for me:
from urlextract import URLExtract
from requests import get
url = "sample.com/samplepage/"
req = requests.get(url)
text = req.text
# or if you already have the html source:
# text = "This is html for ex <a href='http://google.com/'>Google</a> <a href='http://yahoo.com/'>Yahoo</a>"
text = text.replace(' ', '').replace('=','')
extractor = URLExtract()
print(extractor.find_urls(text))
output:
['http://google.com/', 'http://yahoo.com/']

Related

How to select all links of apps from app store and extract its href?

from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen
url = f'https://www.apple.com/kr/search/youtube?src=globalnav'
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, 'html.parser')
links = soup.select(".rf-serp-productname-list")
print(links)
I want to crawl through all links of shown apps. When I searched for a keyword, I thought links = soup.select(".rf-serp-productname-list") would work, but links list is empty.
What should I do?
Just check this code, I think is what you want:
import re
import requests
from bs4 import BeautifulSoup
pages = set()
def get_links(page_url):
global pages
pattern = re.compile("^(/)")
html = requests.get(f"your_URL{page_url}").text # fstrings require Python 3.6+
soup = BeautifulSoup(html, "html.parser")
for link in soup.find_all("a", href=pattern):
if "href" in link.attrs:
if link.attrs["href"] not in pages:
new_page = link.attrs["href"]
print(new_page)
pages.add(new_page)
get_links(new_page)
get_links("")
Source:
https://gist.github.com/AO8/f721b6736c8a4805e99e377e72d3edbf
You can change the part:
for link in soup.find_all("a", href=pattern):
#do something
To check for a keyword I think
You are cooking a soup so first at all taste it and check if everything you expect contains in it.
ResultSet of your selection is empty cause structure in response differs a bit from your expected one from the developer tools.
To get the list of links select more specific:
links = [a.get('href') for a in soup.select('a.icon')]
Output:
['https://apps.apple.com/kr/app/youtube/id544007664', 'https://apps.apple.com/kr/app/%EC%BF%A0%ED%8C%A1%ED%94%8C%EB%A0%88%EC%9D%B4/id1536885649', 'https://apps.apple.com/kr/app/youtube-music/id1017492454', 'https://apps.apple.com/kr/app/instagram/id389801252', 'https://apps.apple.com/kr/app/youtube-kids/id936971630', 'https://apps.apple.com/kr/app/youtube-studio/id888530356', 'https://apps.apple.com/kr/app/google-chrome/id535886823', 'https://apps.apple.com/kr/app/tiktok-%ED%8B%B1%ED%86%A1/id1235601864', 'https://apps.apple.com/kr/app/google/id284815942']

Need to take 'href' value from 'a' tag in following output [duplicate]

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 2 years ago.
Improve this question
How can I retrieve the links of a webpage and copy the url address of the links using Python?
Here's a short snippet using the SoupStrainer class in BeautifulSoup:
import httplib2
from bs4 import BeautifulSoup, SoupStrainer
http = httplib2.Http()
status, response = http.request('http://www.nytimes.com')
for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
if link.has_attr('href'):
print(link['href'])
The BeautifulSoup documentation is actually quite good, and covers a number of typical scenarios:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
Edit: Note that I used the SoupStrainer class because it's a bit more efficient (memory and speed wise), if you know what you're parsing in advance.
For completeness sake, the BeautifulSoup 4 version, making use of the encoding supplied by the server as well:
from bs4 import BeautifulSoup
import urllib.request
parser = 'html.parser' # or 'lxml' (preferred) or 'html5lib', if installed
resp = urllib.request.urlopen("http://www.gpsbasecamp.com/national-parks")
soup = BeautifulSoup(resp, parser, from_encoding=resp.info().get_param('charset'))
for link in soup.find_all('a', href=True):
print(link['href'])
or the Python 2 version:
from bs4 import BeautifulSoup
import urllib2
parser = 'html.parser' # or 'lxml' (preferred) or 'html5lib', if installed
resp = urllib2.urlopen("http://www.gpsbasecamp.com/national-parks")
soup = BeautifulSoup(resp, parser, from_encoding=resp.info().getparam('charset'))
for link in soup.find_all('a', href=True):
print link['href']
and a version using the requests library, which as written will work in both Python 2 and 3:
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
import requests
parser = 'html.parser' # or 'lxml' (preferred) or 'html5lib', if installed
resp = requests.get("http://www.gpsbasecamp.com/national-parks")
http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
encoding = html_encoding or http_encoding
soup = BeautifulSoup(resp.content, parser, from_encoding=encoding)
for link in soup.find_all('a', href=True):
print(link['href'])
The soup.find_all('a', href=True) call finds all <a> elements that have an href attribute; elements without the attribute are skipped.
BeautifulSoup 3 stopped development in March 2012; new projects really should use BeautifulSoup 4, always.
Note that you should leave decoding the HTML from bytes to BeautifulSoup. You can inform BeautifulSoup of the characterset found in the HTTP response headers to assist in decoding, but this can be wrong and conflicting with a <meta> header info found in the HTML itself, which is why the above uses the BeautifulSoup internal class method EncodingDetector.find_declared_encoding() to make sure that such embedded encoding hints win over a misconfigured server.
With requests, the response.encoding attribute defaults to Latin-1 if the response has a text/* mimetype, even if no characterset was returned. This is consistent with the HTTP RFCs but painful when used with HTML parsing, so you should ignore that attribute when no charset is set in the Content-Type header.
Others have recommended BeautifulSoup, but it's much better to use lxml. Despite its name, it is also for parsing and scraping HTML. It's much, much faster than BeautifulSoup, and it even handles "broken" HTML better than BeautifulSoup (their claim to fame). It has a compatibility API for BeautifulSoup too if you don't want to learn the lxml API.
Ian Blicking agrees.
There's no reason to use BeautifulSoup anymore, unless you're on Google App Engine or something where anything not purely Python isn't allowed.
lxml.html also supports CSS3 selectors so this sort of thing is trivial.
An example with lxml and xpath would look like this:
import urllib
import lxml.html
connection = urllib.urlopen('http://www.nytimes.com')
dom = lxml.html.fromstring(connection.read())
for link in dom.xpath('//a/#href'): # select the url in href for all a tags(links)
print link
import urllib2
import BeautifulSoup
request = urllib2.Request("http://www.gpsbasecamp.com/national-parks")
response = urllib2.urlopen(request)
soup = BeautifulSoup.BeautifulSoup(response)
for a in soup.findAll('a'):
if 'national-park' in a['href']:
print 'found a url with national-park in the link'
The following code is to retrieve all the links available in a webpage using urllib2 and BeautifulSoup4:
import urllib2
from bs4 import BeautifulSoup
url = urllib2.urlopen("http://www.espncricinfo.com/").read()
soup = BeautifulSoup(url)
for line in soup.find_all('a'):
print(line.get('href'))
Links can be within a variety of attributes so you could pass a list of those attributes to select.
For example, with src and href attributes (here I am using the starts with ^ operator to specify that either of these attributes values starts with http):
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('https://stackoverflow.com/')
soup = bs(r.content, 'lxml')
links = [item['href'] if item.get('href') is not None else item['src'] for item in soup.select('[href^="http"], [src^="http"]') ]
print(links)
Attribute = value selectors
[attr^=value]
Represents elements with an attribute name of attr whose value is prefixed (preceded) by value.
There are also the commonly used $ (ends with) and * (contains) operators. For a full syntax list see the link above.
Under the hood BeautifulSoup now uses lxml. Requests, lxml & list comprehensions makes a killer combo.
import requests
import lxml.html
dom = lxml.html.fromstring(requests.get('http://www.nytimes.com').content)
[x for x in dom.xpath('//a/#href') if '//' in x and 'nytimes.com' not in x]
In the list comp, the "if '//' and 'url.com' not in x" is a simple method to scrub the url list of the sites 'internal' navigation urls, etc.
just for getting the links, without B.soup and regex:
import urllib2
url="http://www.somewhere.com"
page=urllib2.urlopen(url)
data=page.read().split("</a>")
tag="<a href=\""
endtag="\">"
for item in data:
if "<a href" in item:
try:
ind = item.index(tag)
item=item[ind+len(tag):]
end=item.index(endtag)
except: pass
else:
print item[:end]
for more complex operations, of course BSoup is still preferred.
This script does what your looking for, But also resolves the relative links to absolute links.
import urllib
import lxml.html
import urlparse
def get_dom(url):
connection = urllib.urlopen(url)
return lxml.html.fromstring(connection.read())
def get_links(url):
return resolve_links((link for link in get_dom(url).xpath('//a/#href')))
def guess_root(links):
for link in links:
if link.startswith('http'):
parsed_link = urlparse.urlparse(link)
scheme = parsed_link.scheme + '://'
netloc = parsed_link.netloc
return scheme + netloc
def resolve_links(links):
root = guess_root(links)
for link in links:
if not link.startswith('http'):
link = urlparse.urljoin(root, link)
yield link
for link in get_links('http://www.google.com'):
print link
To find all the links, we will in this example use the urllib2 module together
with the re.module
*One of the most powerful function in the re module is "re.findall()".
While re.search() is used to find the first match for a pattern, re.findall() finds all
the matches and returns them as a list of strings, with each string representing one match*
import urllib2
import re
#connect to a URL
website = urllib2.urlopen(url)
#read html code
html = website.read()
#use re.findall to get all the links
links = re.findall('"((http|ftp)s?://.*?)"', html)
print links
Why not use regular expressions:
import urllib2
import re
url = "http://www.somewhere.com"
page = urllib2.urlopen(url)
page = page.read()
links = re.findall(r"<a.*?\s*href=\"(.*?)\".*?>(.*?)</a>", page)
for link in links:
print('href: %s, HTML text: %s' % (link[0], link[1]))
Here's an example using #ars accepted answer and the BeautifulSoup4, requests, and wget modules to handle the downloads.
import requests
import wget
import os
from bs4 import BeautifulSoup, SoupStrainer
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/eeg-mld/eeg_full/'
file_type = '.tar.gz'
response = requests.get(url)
for link in BeautifulSoup(response.content, 'html.parser', parse_only=SoupStrainer('a')):
if link.has_attr('href'):
if file_type in link['href']:
full_path = url + link['href']
wget.download(full_path)
I found the answer by #Blairg23 working , after the following correction (covering the scenario where it failed to work correctly):
for link in BeautifulSoup(response.content, 'html.parser', parse_only=SoupStrainer('a')):
if link.has_attr('href'):
if file_type in link['href']:
full_path =urlparse.urljoin(url , link['href']) #module urlparse need to be imported
wget.download(full_path)
For Python 3:
urllib.parse.urljoin has to be used in order to obtain the full URL instead.
BeatifulSoup's own parser can be slow. It might be more feasible to use lxml which is capable of parsing directly from a URL (with some limitations mentioned below).
import lxml.html
doc = lxml.html.parse(url)
links = doc.xpath('//a[#href]')
for link in links:
print link.attrib['href']
The code above will return the links as is, and in most cases they would be relative links or absolute from the site root. Since my use case was to only extract a certain type of links, below is a version that converts the links to full URLs and which optionally accepts a glob pattern like *.mp3. It won't handle single and double dots in the relative paths though, but so far I didn't have the need for it. If you need to parse URL fragments containing ../ or ./ then urlparse.urljoin might come in handy.
NOTE: Direct lxml url parsing doesn't handle loading from https and doesn't do redirects, so for this reason the version below is using urllib2 + lxml.
#!/usr/bin/env python
import sys
import urllib2
import urlparse
import lxml.html
import fnmatch
try:
import urltools as urltools
except ImportError:
sys.stderr.write('To normalize URLs run: `pip install urltools --user`')
urltools = None
def get_host(url):
p = urlparse.urlparse(url)
return "{}://{}".format(p.scheme, p.netloc)
if __name__ == '__main__':
url = sys.argv[1]
host = get_host(url)
glob_patt = len(sys.argv) > 2 and sys.argv[2] or '*'
doc = lxml.html.parse(urllib2.urlopen(url))
links = doc.xpath('//a[#href]')
for link in links:
href = link.attrib['href']
if fnmatch.fnmatch(href, glob_patt):
if not href.startswith(('http://', 'https://' 'ftp://')):
if href.startswith('/'):
href = host + href
else:
parent_url = url.rsplit('/', 1)[0]
href = urlparse.urljoin(parent_url, href)
if urltools:
href = urltools.normalize(href)
print href
The usage is as follows:
getlinks.py http://stackoverflow.com/a/37758066/191246
getlinks.py http://stackoverflow.com/a/37758066/191246 "*users*"
getlinks.py http://fakedomain.mu/somepage.html "*.mp3"
There can be many duplicate links together with both external and internal links. To differentiate between the two and just get unique links using sets:
# Python 3.
import urllib
from bs4 import BeautifulSoup
url = "http://www.espncricinfo.com/"
resp = urllib.request.urlopen(url)
# Get server encoding per recommendation of Martijn Pieters.
soup = BeautifulSoup(resp, from_encoding=resp.info().get_param('charset'))
external_links = set()
internal_links = set()
for line in soup.find_all('a'):
link = line.get('href')
if not link:
continue
if link.startswith('http'):
external_links.add(link)
else:
internal_links.add(link)
# Depending on usage, full internal links may be preferred.
full_internal_links = {
urllib.parse.urljoin(url, internal_link)
for internal_link in internal_links
}
# Print all unique external and full internal links.
for link in external_links.union(full_internal_links):
print(link)
import urllib2
from bs4 import BeautifulSoup
a=urllib2.urlopen('http://dir.yahoo.com')
code=a.read()
soup=BeautifulSoup(code)
links=soup.findAll("a")
#To get href part alone
print links[0].attrs['href']

Parsing a script tag with dicts in BeautifulSoup

Working on a partial answer to this question, I came across a bs4.element.Tag that is a mess of nested dicts and lists (s, below).
Is there a way to return a list of urls contained in s without using re.find_all? Other comments regarding the structure of this tag are helpful too.
from bs4 import BeautifulSoup
import requests
link = 'https://stackoverflow.com/jobs?med=site-ui&ref=jobs-tab&sort=p'
r = requests.get(link)
soup = BeautifulSoup(r.text, 'html.parser')
s = soup.find('script', type='application/ld+json')
## the first bit of s:
# s
# Out[116]:
# <script type="application/ld+json">
# {"#context":"http://schema.org","#type":"ItemList","numberOfItems":50,
What I've tried:
randomly perusing through methods with tab completion on s.
picking through the docs.
My problem is that s only has 1 attribute (type) and doesn't seem to have any child tags.
You can use s.text to get the content of the script. It's JSON, so you can then just parse it with json.loads. From there, it's simple dictionary access:
import json
from bs4 import BeautifulSoup
import requests
link = 'https://stackoverflow.com/jobs?med=site-ui&ref=jobs-tab&sort=p'
r = requests.get(link)
soup = BeautifulSoup(r.text, 'html.parser')
s = soup.find('script', type='application/ld+json')
urls = [el['url'] for el in json.loads(s.text)['itemListElement']]
print(urls)
More easy:
from bs4 import BeautifulSoup
import requests
link = 'https://stackoverflow.com/jobs?med=site-ui&ref=jobs-tab&sort=p'
r = requests.get(link)
soup = BeautifulSoup(r.text, 'html.parser')
s = soup.find('script', type='application/ld+json')
# JUST THIS
json = json.loads(s.string)

Beautiful Soup not returning everything in HTML file?

HTML noob here, so I could be misunderstanding something about the HTML document, so bear with me.
I'm using Beautiful Soup to parse web data in Python. Here is my code:
import urllib
import BeautifulSoup
url = "http://www.nba.com/gameline/20160323/"
page = urllib.urlopen(url).read()
soup = BeautifulSoup.BeautifulSoup(page)
indicateGameDone = str(soup.find("div", {"class": "nbaModTopStatus"}))
print indicateGameDone
now, if you look at the website, the HTML code has the line <p class="nbaLiveStatTxSm"> FINAL </p>, (inspect the 'Final' text on the left side of the container on the first ATL-WAS game on the page to see it for youself.) But when I run the code above, my code doesn't return the 'FINAL' that is seen on the webpage, and instead the nbaLiveStatTxSm class is empty.
On my machine, this is the output when I print indicateGameDone:
<div class="nbaModTopStatus"><p class="nbaLiveStatTx">Live</p><p class="nbaLiveStatTxSm"></p><p class="nbaFnlStatTx">Final</p><p class="nbaFnlStatTxSm"></p></div>
Does anyone know why this is happening?
EDIT: clarification: the problem isn't retrieving the text within the tag, the problem is that when I take the html code from the website and print it out in python, something that I saw when I inspected the element on the web is not there in the print statement in Python.
You can use this logic to extract any text.
This code allows you to extract any data between any tags.
Output - FINAL
import urllib
from bs4 import BeautifulSoup
url = "http://www.nba.com/gameline/20160323/"
page = urllib.urlopen(url)
soup = BeautifulSoup(page)
indicateGameDone = soup.find("div", {"class": "nbaFnlStatTx"})
for p in indicateGameDone:
p_text = soup.find("p", {"class": "nbaFnlStatTxSm"})
print(p_text.getText())
break;
It looks like your problem is not with BeautifulSoup but instead with urllib.
Try running the following commands
>>> import urllib
>>> url = "http://www.nba.com/gameline/20160323/"
>>> page = urllib.urlopen(url).read()
>>> page.find('<div class="nbaModTopStatus">')
44230
Which is no surprise considering that Beautiful Soup was able to find the div itself. However when we look a little deeper into what urllib is actually collecting we can see that the <p class="nbaFnlStatTxSm"> is empty by running
>>> page[44230:45000]
'<div class="nbaModTopStatus"><p class="nbaLiveStatTx">Live</p><p class="nbaLiveStatTxSm"></p><p class="nbaFnlStatTx">Final</p><p class="nbaFnlStatTxSm"></p></div><div id="nbaGLBroadcast"><img src="/.element/img/3.0/sect/gameline/broadcasters/lp.png"></div><div class="nbaTeamsRow"><div class="nbaModTopTeamScr nbaModTopTeamAw"><h5 class="nbaModTopTeamName awayteam">ATL</h5><img src="http://i.cdn.turner.com/nba/nba/.element/img/2.0/sect/gameline/teams/ATL.gif" width="34" height="22" title="Atlanta Hawks"><h4 class="nbaModTopTeamNum win"></h4></div><div class="nbaModTopTeamScr nbaModTopTeamHm"><h5 class="nbaModTopTeamName hometeam">WAS</h5><img src="http://i.cdn.turner.com/nba/nba/.element/img/2.0/sect/gameline/teams/WAS.gif" width="34" '
You can see that the tag is empty, so your problem is the data that's being passed to Beautiful Soup, not the package itself.
changed the import of beautifulsoup to the proper syntax for the current version of BeautifulSoup
corrected the way you were constructing the BeautifulSoup object
fixed your find statement, then used the .text command to get the string representation of the text in the HTML you're after.
With some minor modifications to your code as listed above, your code runs for me.
import urllib
from bs4 import BeautifulSoup
url = "http://www.nba.com/gameline/20160323/"
page = urllib.urlopen(url).read()
soup = BeautifulSoup(page)
indicateGameDone = soup.find("div", {"class": "nbaModTopStatus"})
print indicateGameDone.text ## "LiveFinal "
to address comments:
import urllib
from bs4 import BeautifulSoup
url = "http://www.nba.com/gameline/20160323/"
page = urllib.urlopen(url).read()
soup = BeautifulSoup(page)
indicateGameDone = soup.find("p", {"class": "nbaFnlStatTx"})
print indicateGameDone.text

retrieve links from web page using python and BeautifulSoup [closed]

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 2 years ago.
Improve this question
How can I retrieve the links of a webpage and copy the url address of the links using Python?
Here's a short snippet using the SoupStrainer class in BeautifulSoup:
import httplib2
from bs4 import BeautifulSoup, SoupStrainer
http = httplib2.Http()
status, response = http.request('http://www.nytimes.com')
for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
if link.has_attr('href'):
print(link['href'])
The BeautifulSoup documentation is actually quite good, and covers a number of typical scenarios:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
Edit: Note that I used the SoupStrainer class because it's a bit more efficient (memory and speed wise), if you know what you're parsing in advance.
For completeness sake, the BeautifulSoup 4 version, making use of the encoding supplied by the server as well:
from bs4 import BeautifulSoup
import urllib.request
parser = 'html.parser' # or 'lxml' (preferred) or 'html5lib', if installed
resp = urllib.request.urlopen("http://www.gpsbasecamp.com/national-parks")
soup = BeautifulSoup(resp, parser, from_encoding=resp.info().get_param('charset'))
for link in soup.find_all('a', href=True):
print(link['href'])
or the Python 2 version:
from bs4 import BeautifulSoup
import urllib2
parser = 'html.parser' # or 'lxml' (preferred) or 'html5lib', if installed
resp = urllib2.urlopen("http://www.gpsbasecamp.com/national-parks")
soup = BeautifulSoup(resp, parser, from_encoding=resp.info().getparam('charset'))
for link in soup.find_all('a', href=True):
print link['href']
and a version using the requests library, which as written will work in both Python 2 and 3:
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
import requests
parser = 'html.parser' # or 'lxml' (preferred) or 'html5lib', if installed
resp = requests.get("http://www.gpsbasecamp.com/national-parks")
http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
encoding = html_encoding or http_encoding
soup = BeautifulSoup(resp.content, parser, from_encoding=encoding)
for link in soup.find_all('a', href=True):
print(link['href'])
The soup.find_all('a', href=True) call finds all <a> elements that have an href attribute; elements without the attribute are skipped.
BeautifulSoup 3 stopped development in March 2012; new projects really should use BeautifulSoup 4, always.
Note that you should leave decoding the HTML from bytes to BeautifulSoup. You can inform BeautifulSoup of the characterset found in the HTTP response headers to assist in decoding, but this can be wrong and conflicting with a <meta> header info found in the HTML itself, which is why the above uses the BeautifulSoup internal class method EncodingDetector.find_declared_encoding() to make sure that such embedded encoding hints win over a misconfigured server.
With requests, the response.encoding attribute defaults to Latin-1 if the response has a text/* mimetype, even if no characterset was returned. This is consistent with the HTTP RFCs but painful when used with HTML parsing, so you should ignore that attribute when no charset is set in the Content-Type header.
Others have recommended BeautifulSoup, but it's much better to use lxml. Despite its name, it is also for parsing and scraping HTML. It's much, much faster than BeautifulSoup, and it even handles "broken" HTML better than BeautifulSoup (their claim to fame). It has a compatibility API for BeautifulSoup too if you don't want to learn the lxml API.
Ian Blicking agrees.
There's no reason to use BeautifulSoup anymore, unless you're on Google App Engine or something where anything not purely Python isn't allowed.
lxml.html also supports CSS3 selectors so this sort of thing is trivial.
An example with lxml and xpath would look like this:
import urllib
import lxml.html
connection = urllib.urlopen('http://www.nytimes.com')
dom = lxml.html.fromstring(connection.read())
for link in dom.xpath('//a/#href'): # select the url in href for all a tags(links)
print link
import urllib2
import BeautifulSoup
request = urllib2.Request("http://www.gpsbasecamp.com/national-parks")
response = urllib2.urlopen(request)
soup = BeautifulSoup.BeautifulSoup(response)
for a in soup.findAll('a'):
if 'national-park' in a['href']:
print 'found a url with national-park in the link'
The following code is to retrieve all the links available in a webpage using urllib2 and BeautifulSoup4:
import urllib2
from bs4 import BeautifulSoup
url = urllib2.urlopen("http://www.espncricinfo.com/").read()
soup = BeautifulSoup(url)
for line in soup.find_all('a'):
print(line.get('href'))
Links can be within a variety of attributes so you could pass a list of those attributes to select.
For example, with src and href attributes (here I am using the starts with ^ operator to specify that either of these attributes values starts with http):
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('https://stackoverflow.com/')
soup = bs(r.content, 'lxml')
links = [item['href'] if item.get('href') is not None else item['src'] for item in soup.select('[href^="http"], [src^="http"]') ]
print(links)
Attribute = value selectors
[attr^=value]
Represents elements with an attribute name of attr whose value is prefixed (preceded) by value.
There are also the commonly used $ (ends with) and * (contains) operators. For a full syntax list see the link above.
Under the hood BeautifulSoup now uses lxml. Requests, lxml & list comprehensions makes a killer combo.
import requests
import lxml.html
dom = lxml.html.fromstring(requests.get('http://www.nytimes.com').content)
[x for x in dom.xpath('//a/#href') if '//' in x and 'nytimes.com' not in x]
In the list comp, the "if '//' and 'url.com' not in x" is a simple method to scrub the url list of the sites 'internal' navigation urls, etc.
just for getting the links, without B.soup and regex:
import urllib2
url="http://www.somewhere.com"
page=urllib2.urlopen(url)
data=page.read().split("</a>")
tag="<a href=\""
endtag="\">"
for item in data:
if "<a href" in item:
try:
ind = item.index(tag)
item=item[ind+len(tag):]
end=item.index(endtag)
except: pass
else:
print item[:end]
for more complex operations, of course BSoup is still preferred.
This script does what your looking for, But also resolves the relative links to absolute links.
import urllib
import lxml.html
import urlparse
def get_dom(url):
connection = urllib.urlopen(url)
return lxml.html.fromstring(connection.read())
def get_links(url):
return resolve_links((link for link in get_dom(url).xpath('//a/#href')))
def guess_root(links):
for link in links:
if link.startswith('http'):
parsed_link = urlparse.urlparse(link)
scheme = parsed_link.scheme + '://'
netloc = parsed_link.netloc
return scheme + netloc
def resolve_links(links):
root = guess_root(links)
for link in links:
if not link.startswith('http'):
link = urlparse.urljoin(root, link)
yield link
for link in get_links('http://www.google.com'):
print link
To find all the links, we will in this example use the urllib2 module together
with the re.module
*One of the most powerful function in the re module is "re.findall()".
While re.search() is used to find the first match for a pattern, re.findall() finds all
the matches and returns them as a list of strings, with each string representing one match*
import urllib2
import re
#connect to a URL
website = urllib2.urlopen(url)
#read html code
html = website.read()
#use re.findall to get all the links
links = re.findall('"((http|ftp)s?://.*?)"', html)
print links
Why not use regular expressions:
import urllib2
import re
url = "http://www.somewhere.com"
page = urllib2.urlopen(url)
page = page.read()
links = re.findall(r"<a.*?\s*href=\"(.*?)\".*?>(.*?)</a>", page)
for link in links:
print('href: %s, HTML text: %s' % (link[0], link[1]))
Here's an example using #ars accepted answer and the BeautifulSoup4, requests, and wget modules to handle the downloads.
import requests
import wget
import os
from bs4 import BeautifulSoup, SoupStrainer
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/eeg-mld/eeg_full/'
file_type = '.tar.gz'
response = requests.get(url)
for link in BeautifulSoup(response.content, 'html.parser', parse_only=SoupStrainer('a')):
if link.has_attr('href'):
if file_type in link['href']:
full_path = url + link['href']
wget.download(full_path)
I found the answer by #Blairg23 working , after the following correction (covering the scenario where it failed to work correctly):
for link in BeautifulSoup(response.content, 'html.parser', parse_only=SoupStrainer('a')):
if link.has_attr('href'):
if file_type in link['href']:
full_path =urlparse.urljoin(url , link['href']) #module urlparse need to be imported
wget.download(full_path)
For Python 3:
urllib.parse.urljoin has to be used in order to obtain the full URL instead.
BeatifulSoup's own parser can be slow. It might be more feasible to use lxml which is capable of parsing directly from a URL (with some limitations mentioned below).
import lxml.html
doc = lxml.html.parse(url)
links = doc.xpath('//a[#href]')
for link in links:
print link.attrib['href']
The code above will return the links as is, and in most cases they would be relative links or absolute from the site root. Since my use case was to only extract a certain type of links, below is a version that converts the links to full URLs and which optionally accepts a glob pattern like *.mp3. It won't handle single and double dots in the relative paths though, but so far I didn't have the need for it. If you need to parse URL fragments containing ../ or ./ then urlparse.urljoin might come in handy.
NOTE: Direct lxml url parsing doesn't handle loading from https and doesn't do redirects, so for this reason the version below is using urllib2 + lxml.
#!/usr/bin/env python
import sys
import urllib2
import urlparse
import lxml.html
import fnmatch
try:
import urltools as urltools
except ImportError:
sys.stderr.write('To normalize URLs run: `pip install urltools --user`')
urltools = None
def get_host(url):
p = urlparse.urlparse(url)
return "{}://{}".format(p.scheme, p.netloc)
if __name__ == '__main__':
url = sys.argv[1]
host = get_host(url)
glob_patt = len(sys.argv) > 2 and sys.argv[2] or '*'
doc = lxml.html.parse(urllib2.urlopen(url))
links = doc.xpath('//a[#href]')
for link in links:
href = link.attrib['href']
if fnmatch.fnmatch(href, glob_patt):
if not href.startswith(('http://', 'https://' 'ftp://')):
if href.startswith('/'):
href = host + href
else:
parent_url = url.rsplit('/', 1)[0]
href = urlparse.urljoin(parent_url, href)
if urltools:
href = urltools.normalize(href)
print href
The usage is as follows:
getlinks.py http://stackoverflow.com/a/37758066/191246
getlinks.py http://stackoverflow.com/a/37758066/191246 "*users*"
getlinks.py http://fakedomain.mu/somepage.html "*.mp3"
There can be many duplicate links together with both external and internal links. To differentiate between the two and just get unique links using sets:
# Python 3.
import urllib
from bs4 import BeautifulSoup
url = "http://www.espncricinfo.com/"
resp = urllib.request.urlopen(url)
# Get server encoding per recommendation of Martijn Pieters.
soup = BeautifulSoup(resp, from_encoding=resp.info().get_param('charset'))
external_links = set()
internal_links = set()
for line in soup.find_all('a'):
link = line.get('href')
if not link:
continue
if link.startswith('http'):
external_links.add(link)
else:
internal_links.add(link)
# Depending on usage, full internal links may be preferred.
full_internal_links = {
urllib.parse.urljoin(url, internal_link)
for internal_link in internal_links
}
# Print all unique external and full internal links.
for link in external_links.union(full_internal_links):
print(link)
import urllib2
from bs4 import BeautifulSoup
a=urllib2.urlopen('http://dir.yahoo.com')
code=a.read()
soup=BeautifulSoup(code)
links=soup.findAll("a")
#To get href part alone
print links[0].attrs['href']

Categories