Is there any way I can preserve HTML entities in the source when parsing it with BeautifulSoup?
from bs4 import BeautifulSoup
soup = BeautifulSoup('<p class="test">&quot;Hello World!&quot; I said')
print(soup.string)
# Outputs: '"Hello World!" I said'
# Wanted/Expected: '&quot;Hello World!&quot; I said'
Also, when writing those preserved HTML entities back to a file, will f.write(str(soup)) do? The following code is meant to produce an identical copy of the original, which it currently doesn't:
from bs4 import BeautifulSoup
from pathlib import Path
# The original contains tons of HTML entities
original = Path("original.html")
output = Path("duplicate.html")
with open(original, "rt", encoding="utf8") as f:
    soup = BeautifulSoup(f, "lxml")

with open(output, "wt", encoding="utf8") as f:
    f.write(str(soup))
You have to create a custom formatter:
from bs4 import BeautifulSoup
def formatQuot(string):
    return string.replace('"', '&quot;')

soup = BeautifulSoup('<p class="test">&quot;Hello World!&quot; I said ', 'html.parser')
print(soup.decode(formatter=formatQuot))
# <p class="test">&quot;Hello World!&quot; I said </p>

text = formatQuot(soup.text)
print(text)
# &quot;Hello World!&quot; I said
Thanks @uingtea for the custom formatter suggestion. As I also need to preserve the tag attribute order, I've subclassed HTMLFormatter as per the BeautifulSoup docs:
from pathlib import Path
from bs4 import BeautifulSoup
from bs4.formatter import HTMLFormatter
def my_formatter(string):
    string = string.replace('&', '&amp;')
    # string = string.replace('…', '&hellip;')
    string = string.replace('"', '&quot;').replace("'", '&#39;')
    string = string.replace('<', '&lt;').replace('>', '&gt;')
    return string
class customFormat(HTMLFormatter):
    def attributes(self, tag):
        for k, v in tag.attrs.items():
            yield k, v
cform = customFormat(my_formatter)
original = Path("original.html")
output = Path("output.html")
with open(original, "rt", encoding="utf8") as f:
    soup = BeautifulSoup(f, "lxml")

with open(output, "wt", encoding="utf8", newline="\n") as f:
    f.write(soup.decode(formatter=cform))
Is there a cleaner way to write the custom formatter, i.e. without defining a free function and then passing it to the constructor of the subclassed formatter? The docs are pretty scant on how to write a custom/subclassed formatter.
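One cleaner option is to fold the escaping into the subclass itself by overriding substitute(), so no free function is needed. This is a sketch based on the substitute() hook visible in the bs4 source rather than on documented API, so verify it against your bs4 version (the class name here is my own):

from bs4.formatter import HTMLFormatter

class EntityPreservingFormatter(HTMLFormatter):
    # hypothetical name; re-escapes what the parser unescaped
    def substitute(self, string):
        string = string.replace('&', '&amp;')
        string = string.replace('"', '&quot;').replace("'", '&#39;')
        return string.replace('<', '&lt;').replace('>', '&gt;')

    def attributes(self, tag):
        # preserve attribute order as parsed
        for k, v in tag.attrs.items():
            yield k, v

cform = EntityPreservingFormatter()

soup.decode(formatter=cform) then works exactly as before.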
I need to find certain words in an HTML file and replace them with links. The result should be that the file (displayed by a browser) lets you click the links as usual.
Beautiful Soup automatically escapes the tag. How can I avoid that behaviour?
Minimal Example
#!/usr/bin/env python3
from bs4 import BeautifulSoup
import re
html = \
'''
Identify
'''
soup = BeautifulSoup(html, features="html.parser")
for txt in soup.findAll(text=True):
    if re.search('identi', txt, re.I) and txt.parent.name != 'a':
        newtext = re.sub('identify', '<a href="test.html"> test </a>', txt.lower())
        txt.replace_with(newtext)
print(soup)
Result:
&lt;a href="test.html"&gt; test &lt;/a&gt;
Intended result:
<a href="test.html"> test </a>
You can pass new soup with markup as the parameter to .replace_with(), for example:
import re
from bs4 import BeautifulSoup
html = '''
Other Identify Other
'''
soup = BeautifulSoup(html, features="html.parser")
for txt in soup.findAll(text=True):
    if re.search('identi', txt, re.I) and txt.parent.name != 'a':
        new_txt = re.sub(r'identi[^\s]*', '<a href="test.html">test</a>', txt, flags=re.I)
        txt.replace_with(BeautifulSoup(new_txt, 'html.parser'))
print(soup)
Prints:
Other <a href="test.html">test</a> Other
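If you prefer building the replacement tag explicitly instead of parsing a markup string, soup.new_tag() also works. A sketch (assumes bs4 4.10 or newer, where .replace_with() accepts several replacements, so the text around the match is kept):

import re
from bs4 import BeautifulSoup

html = '''
Other Identify Other
'''
soup = BeautifulSoup(html, features="html.parser")
for txt in soup.findAll(text=True):
    if re.search('identi', txt, re.I) and txt.parent.name != 'a':
        # split once around the matched word, keeping both sides
        before, _, after = re.split(r'(identi\S*)', txt, maxsplit=1, flags=re.I)
        link = soup.new_tag('a', href='test.html')
        link.string = 'test'
        txt.replace_with(before, link, after)
print(soup)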
You can use w3lib; its replace_entities() function replaces the HTML entities in a string with the characters they represent.
To install: pip install w3lib
from bs4 import BeautifulSoup
import re
from w3lib.html import replace_entities
html = \
'''
Identify
'''
soup = BeautifulSoup(html, features="html.parser")
for txt in soup.findAll(text=True):
    if re.search('identi', txt, re.I) and txt.parent.name != 'a':
        newtext = re.sub('identify', r'<a href="test.html"> test </a>', txt.lower())
        txt.replace_with(newtext)
print(replace_entities(str(soup)))  # str(soup) because soup is a BeautifulSoup object, not str
# Output
>>> <a href="test.html"> test </a>
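For reference, replace_entities() on its own simply turns entities back into the characters they name:

from w3lib.html import replace_entities

print(replace_entities('&lt;a href=&quot;test.html&quot;&gt; test &lt;/a&gt;'))
# <a href="test.html"> test </a>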
I'm looking to get all the text in the <article> tag.
It gives me the text in the console, but it doesn't put it in the .txt file.
It works with body.text, but not with article.text. I don't know what to do.
import bs4 as bs
import urllib.request
#import re
sauce = urllib.request.urlopen('http://www.bodoniparavia.it/index.php/it/amministrazione-trasparente/bandi-di-gara-e-contratti.html')
soup = bs.BeautifulSoup(sauce,'lxml')
body = soup.body
article = body.find('article')
article1 = article.text
print(article1)
x = open('file.txt','w')
x.write(article1)
x.close
It seems to be working fine for me, but try adding encoding='utf-8' to the open() call (note also that x.close needs parentheses, x.close(), to actually run and flush the file). The code would then look like this:
import bs4 as bs
import urllib.request
#import re
sauce = urllib.request.urlopen('http://www.bodoniparavia.it/index.php/it/amministrazione-trasparente/bandi-di-gara-e-contratti.html')
soup = bs.BeautifulSoup(sauce,'lxml')
body = soup.body
article = body.find('article')
article1 = article.text
print(article1)
x = open('file.txt', 'w', encoding='utf-8')
x.write(article1)
x.close()
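A with-block is the safer pattern here, since it flushes and closes the file even when an error occurs (and sidesteps the forgotten parentheses on x.close entirely):

# same write, but the context manager closes the file automatically
with open('file.txt', 'w', encoding='utf-8') as x:
    x.write(article1)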
I want to write the list that comes out of the print function to a CSV file.
import re
import requests
from bs4 import BeautifulSoup
for i in range(146):
    r = requests.get("http://www.yellowpages.com/atlanta-ga/trends/{}".format(i))
    soup = BeautifulSoup(r.content, "html.parser")
    for link in soup.find_all("a", href=re.compile('/atlanta-ga/')):
        if 'trends' not in link.get('href'):
            link = link.get('href')
            results = "http://www.yellowpages.com{}?page=".format(link)
            import csv
            with open('Catagories', 'w') as myfile:
                wr = csv.writer(myfile)
                wr.writerow([results])
            print(results)
The purpose of this should be very apparent
@tdelaney is right: every time you open the file with "w", you're overwriting the previous contents.
The fix is to use "a" (append) instead:
with open('Catagories', 'a') as myfile:
...
Check out the docs: https://docs.python.org/3/library/functions.html#open
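Better still, open the file once, before the loop, so each run starts fresh but every row still lands in the file. A sketch (it keeps the question's 'Catagories' filename; newline='' follows the csv module docs for Python 3):

import csv
import re

import requests
from bs4 import BeautifulSoup

with open('Catagories', 'w', newline='') as myfile:
    wr = csv.writer(myfile)
    for i in range(146):
        r = requests.get("http://www.yellowpages.com/atlanta-ga/trends/{}".format(i))
        soup = BeautifulSoup(r.content, "html.parser")
        for link in soup.find_all("a", href=re.compile('/atlanta-ga/')):
            if 'trends' not in link.get('href'):
                results = "http://www.yellowpages.com{}?page=".format(link.get('href'))
                wr.writerow([results])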
I have thousands of HTML files stored in a remote directory. All these files have the same HTML structure. Right now I am scraping every file manually with the following script:
from string import punctuation, whitespace
import urllib2
import datetime
import re
from bs4 import BeautifulSoup as Soup
import csv
today = datetime.date.today()
html = urllib2.urlopen("http://hostname/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html").read()
soup = Soup(html)
for li in soup.findAll('li', attrs={'class': 'g'}):
    sLink = li.find('a')
    print sLink['href']
    sSpan = li.find('span', attrs={'class': 'st'})
    print sSpan
So the above script handles one URL. Likewise, I want to scrape all the HTML files under that directory, irrespective of their file names. I have not found this question asked before.
Update: Code
import urllib2
import BeautifulSoup
import re
Newlines = re.compile(r'[\r\n]\s+')

def getPageText(url):
    # given a url, get page content
    data = urllib2.urlopen(url).read()
    # parse as html structured document
    bs = BeautifulSoup.BeautifulSoup(data, convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES)
    for li in bs.findAll('li', attrs={'class': 'g'}):
        sLink = li.find('a')
        print sLink['href']
        sSpan = li.find('span', attrs={'class': 'st'})
        print sSpan

def main():
    urls = [
        'http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html',
        'http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html.html'
    ]
    txt = [getPageText(url) for url in urls]

if __name__ == "__main__":
    main()
Use a loop:
...
for url in url_list:
    html = urllib2.urlopen(url).read()
    soup = Soup(html)
    for li in soup.findAll('li', attrs={'class': 'g'}):
        sLink = li.find('a')
        print sLink['href']
        sSpan = li.find('span', attrs={'class': 'st'})
        print sSpan
If you don't know the URL list in advance, you have to parse the listing page.
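A sketch of harvesting that list from an Apache-style directory index (it assumes the server at 192.168.1.200 serves such an index page; adjust the base URL and the filter to your setup):

import urllib2
import BeautifulSoup

base = 'http://192.168.1.200/coimbatore/'
listing = urllib2.urlopen(base).read()
bs = BeautifulSoup.BeautifulSoup(listing)
# collect every .html link on the index page
url_list = [base + a['href'] for a in bs.findAll('a', href=True)
            if a['href'].endswith('.html')]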
import csv
import urllib2
import BeautifulSoup
def getPageText(url, filename):
    data = urllib2.urlopen(url).read()
    bs = BeautifulSoup.BeautifulSoup(data, convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES)
    with open(filename, 'w') as f:
        writer = csv.writer(f)
        for li in bs.findAll('li', attrs={'class': 'g'}):
            sLink = li.find('a')
            sSpan = li.find('span', attrs={'class': 'st'})
            writer.writerow([sLink['href'], sSpan])

def main():
    urls = [
        'http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html',
        'http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html.html',
    ]
    for i, url in enumerate(urls, 1):
        getPageText(url, '{}.csv'.format(i))

if __name__ == "__main__":
    main()
I am trying to write an HTML parser in Python that takes as its input a URL or list of URLs and outputs specific data about each of those URLs in the format:
URL: data1: data2
The data points can be found at the exact same HTML node in each of the URLs; they are consistently between the same starting and ending tags. If anyone out there would like to help an amateur Python programmer get the job done, it would be greatly appreciated. Extra points if you can come up with a way to output the information so it can be easily copied and pasted into an Excel document for subsequent data analysis!
For example, let's say I would like to output the view count for a particular YouTube video. For the URL http://www.youtube.com/watch?v=QOdW1OuZ1U0, the view count is around 3.6 million. For all YouTube videos, this number is found in the following format within the page's source:
<span class="watch-view-count ">
3,595,057
</span>
Fortunately, these exact tags are found only once on a particular YouTube video's page. These starting and ending tags can be inputted into the program or built-in and modified when necessary. The output of the program would be:
http://www.youtube.com/watch?v=QOdW1OuZ1U0: 3,595,057 (or 3595057).
import urllib2
from bs4 import BeautifulSoup
url = 'http://www.youtube.com/watch?v=QOdW1OuZ1U0'
f = urllib2.urlopen(url)
data = f.read()
soup = BeautifulSoup(data)
span = soup.find('span', attrs={'class':'watch-view-count'})
print '{}:{}'.format(url, span.text)
If you do not want to use BeautifulSoup, you can use re:
import urllib2
import re
url = 'http://www.youtube.com/watch?v=QOdW1OuZ1U0'
f = urllib2.urlopen(url)
data = f.read()
pattern = re.compile('<span class="watch-view-count.*?([\d,]+).*?</span>', re.DOTALL)
r = pattern.search(data)
print '{}:{}'.format(url, r.group(1))
As for the outputs, I think you can store them in a csv file.
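For example, appending one row per URL (a sketch; views.csv is a made-up filename, and the 'ab' mode keeps the Python 2 csv module happy):

import csv

# url and r come from the regex example above
with open('views.csv', 'ab') as f:
    writer = csv.writer(f)
    writer.writerow([url, r.group(1)])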
I prefer HTMLParser over re for this type of task. However, HTMLParser can be a bit tricky. I use module-level lists to store data... I'm sure this is the wrong way of doing it, but it's worked for me on several projects in the past.
import urllib2
from HTMLParser import HTMLParser
import csv
position = []
results = [""]

class hp(HTMLParser):
    def handle_starttag(self, tag, attrs):
        if tag == 'span' and ('class', 'watch-view-count ') in attrs:
            position.append('bingo')

    def handle_endtag(self, tag):
        if tag == 'span' and 'bingo' in position:
            position.remove('bingo')

    def handle_data(self, data):
        if 'bingo' in position:
            results[0] += " " + data.strip() + " "

my_pages = ["http://www.youtube.com/watch?v=QOdW1OuZ1U0"]
data = []
for url in my_pages:
    response = urllib2.urlopen(url)
    page = str(response.read())
    parser = hp()
    parser.feed(page)
    data.append(results[0])
    # reinitialize the module-level containers
    position = []
    results = [""]

index = 0
with open('/path/to/test.csv', 'wb') as f:
    writer = csv.writer(f)
    header = ['url', 'output']
    writer.writerow(header)
    for d in data:
        row = [my_pages[index], data[index]]
        writer.writerow(row)
        index += 1
Then just open /path/to/test.csv in Excel