Can't decode output from BeautifulSoup in Python

I've been attempting to write a little scraper in Python using BeautifulSoup.
Everything goes smoothly until I attempt to print (or write to a file) the strings contained inside the various HTML elements. The site I'm scraping is http://www.yellowpages.ca/search/si/1/Boots/Montreal+QC, which contains various French characters. For some reason, when I attempt to print the content in the terminal or into a file, instead of decoding the string like it's supposed to, I'm getting the raw escaped output.
Here's the script:
from BeautifulSoup import BeautifulSoup as bs
import urllib as ul
##import re

base_url = 'http://www.yellowpages.ca'
data_file = open('yellow_file.txt', 'a')

data = ul.urlopen(base_url + '/locations/Quebec/Montreal/90014002.html').readlines()
bt = bs(str(data))
result = bt.findAll('div', 'ypgCategory')

bt = bs(str(result))
result = bt.findAll('a')

for tag in result:
    link = base_url + tag['href']
    ##print str(link)
    data = ul.urlopen(link).readlines()
    #data = str(data).decode('latin-1')
    bt = bs(str(data), convertEntities=bs.HTML_ENTITIES, fromEncoding='latin-1')

    titles = bt.findAll('span', 'listingTitle')
    phones = bt.findAll('a', 'phoneNumber')
    entries = zip(titles, phones)

    for title, phone in entries:
        #print title.prettify(encoding='latin-1')
        #data_file.write(title.text.decode('utf-8') + " " + phone.text.decode('utf-8') + "\n")
        print title.text

data_file.close()
And the output of this is: Projets Autochtones Du Qu\xc3\xa9bec
As you can see, the accented e that's supposed to appear in Québec isn't displaying. I've tried everything mentioned on SO: calling unicode(), passing fromEncoding to soup, .decode('latin-1'), but I'm getting nothing.
Any ideas?

This should be something like what you want:
from BeautifulSoup import BeautifulSoup as bs
import urllib as ul

base_url = 'http://www.yellowpages.ca'
data_file = open('yellow_file.txt', 'a')

bt = bs(ul.urlopen(base_url + '/locations/Quebec/Montreal/90014002.html'))
for div in bt.findAll('div', 'ypgCategory'):
    for a in div.findAll('a'):
        link = base_url + a['href']
        bt = bs(ul.urlopen(link), convertEntities=bs.HTML_ENTITIES)
        titles = bt.findAll('span', 'listingTitle')
        phones = bt.findAll('a', 'phoneNumber')
        for title, phone in zip(titles, phones):
            line = '%s %s\n' % (title.text, phone.text)
            data_file.write(line.encode('utf-8'))
            print line.rstrip()

data_file.close()

Who told you to use latin-1 to decode something that is UTF-8? (It's clearly specified in the page's meta tag.)
If you are on Windows you may have problems outputting Unicode to the console; it's better to test by writing to text files first.
If you open a file in text mode, do not write binary to it. Either of these works:

import codecs
codecs.open(..., "w", "utf-8").write(unicode_str)
open(..., "wb").write(unicode_str.encode("utf_8"))

Related

Extracting particular string from a text file in Python

Hi, I have a copy of HTML code in a text file and I need to extract some information from that code. I managed to find the lines of interest like this, but I'm not getting any specific pattern to extract the text itself.
Position : 27
Position : 28
Position : 29
I want to extract the URL after "href" and the name of the product after the text "aria-label". How can I do that in Python?
Currently I'm using the script below to find the lines of interest:
import psycopg2

try:
    filePath = '/Users/lins/Downloads/pladiv.txt'
    with open(filePath, 'r') as file:
        print('entered loop')
        cnt = 1
        for line in file:
            if 'pla-unit-single-clickable-target clickable-card" rel="noopener noreferrer" target="_blank" aria-label="' in line:
                print('Position : ' + str(cnt))
            cnt = cnt + 1
            if 'href="' in line:
                print(line)
                fields = line.split(";")
                #print(fields[0] + ' as URL')
except (Exception, psycopg2.Error) as error:
    quit()
Note: I was inserting the results into my PostgreSQL DB; that code has been removed from the sample above.
You can either use a regex, like this:

import re
url = '<p>Hello World</p><a href="http://example.com">More Examples</a><a href="http://example2.com">Even More Examples</a>'
urls = re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', url)
>>> print urls
['http://example.com', 'http://example2.com']
Or you can parse the file as HTML
>>> from bs4 import BeautifulSoup as Soup
>>> html = Soup(s, 'html.parser') # Soup(s, 'lxml') if lxml is installed
>>> [a['href'] for a in html.find_all('a')]
['http://example.com', 'http://example2.com']
Either way is fine.
EDIT: to get the entire value of href, you can use this:
url = """"""
findall = re.findall("(https?://[^\s]+)", url)
print(findall)
['http://www.fliegende-pillen.de/product/doppelherz-folsaeure-800-b-vitamine-tabletten.230477.html?p=466453&noCS=1&adword=google/PLA&pk_campaign=google/PLA"']

Creating multiple text files with unique file names from scraped data

I took an introductory course in Python this semester and am now trying to do a project. However, I don't really know what code I should write to create multiple .txt files, each with a unique title.
I scraped all the terms and definitions from the website http://www.hogwartsishere.com/library/book/99/. The title of each .txt file should, for example, be 'Aconite.txt', and the content of the file should be the term and its definition. Every term with its definition can be found in a separate p-tag, and the term itself is a b-tag within the p-tag. Can I use this to write my code?
I suppose I will need to use a for-loop for this, but I don't really know where to start. I searched StackOverflow and found several solutions, but all of them contain code I am not familiar with and/or relate to another issue.
This is what I have so far:
#!/usr/bin/env python
import os
import requests
import bs4

def download(url):
    r = requests.get(url)
    html = r.text
    soup = bs4.BeautifulSoup(html, 'html.parser')
    terms_definition = []

    #for item in soup.find_all('p'): # define better
    items = soup.find_all("div", {"class": "font-size-16 roboto"})
    for item in items:
        terms = item.find_all("p")
        for term in terms:
            #print(term)
            if term.text is not 'None':
                #print(term.text)
                #print("\n")
                term_split = term.text.split()
                print(term_split)
            if term.text != None and len(term.text) > 1:
                if '-' in term.text.split():
                    print(term.text)
                    print('\n')
        if item.find('p'):
            terms_definition.append(item['p'])
    print(terms_definition)
    return terms_definition

def create_url(start, end):
    list_url = []
    base_url = 'http://www.hogwartsishere.com/library/book/99/chapter/'
    for x in range(start, end):
        list_url.append(base_url + str(x))
    return list_url

def search_all_url(list_url):
    for url in list_url:
        download(url)

#write data into separate text files. Word in front of the dash should be title of the document, term and definition should be content of the text file
#all terms and definitions are in separate p-tags, title is a b-tag within the p-tag
def name_term():
    pass  # placeholder: not implemented yet

def text_files(name_term):
    path_write = os.path.join('data', name_term + '.txt')  # 'name_term' should be the scraped term
    with open(path_write, 'w') as f:
        f.write()
    #for loop? in front of dash = title / everything before and after dash = text (file content) / empty line = new file

if __name__ == '__main__':
    download('http://www.hogwartsishere.com/library/book/99/chapter/1')
    #list_url = create_url(1, 27)
    #search_all_url(list_url)
Thanks in advance!
You can iterate over all pages (1-27) to get their content, then parse each page with bs4 and save the results to files:
import requests
import bs4
import re

for i in range(1, 28):  # chapters 1 through 27
    r = requests.get('http://www.hogwartsishere.com/library/book/99/chapter/{}/'.format(i)).text
    soup = bs4.BeautifulSoup(r, 'html.parser')
    items = soup.find_all("div", {"class": "font-size-16 roboto"})
    for item in items:
        terms = item.find_all("p")
        for term in terms:
            title = re.match('^(.*) -', term.text).group(1).replace('/', '-')
            with open(title + '.txt', 'w', encoding='utf-8') as f:
                f.write(term.text)
Output files: one .txt file per term, named after the term.
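
Since the question notes that each term sits in a b-tag within its p-tag, you could also take the title straight from that tag instead of the regex. A sketch of the inner loop above, under that assumption:

for term in terms:
    b = term.find('b')
    if b is None:
        continue  # assumed: skip paragraphs without a <b> term
    title = b.get_text(strip=True).replace('/', '-')
    with open(title + '.txt', 'w', encoding='utf-8') as f:
        f.write(term.text)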

Python 3 remove duplicate weblinks with extra character rstrip

Using Python 3. I am trying to pull all the unique links from a website and seem to have the code working except for a few links that have a / at the end.
For example: My program will include http://www.google.com & http://www.google.com/
I'd like to make sure my program removes that last character so that no duplicates are returned. I have researched rstrip() but I can't seem to get it to work. Here is my code:
import bs4 as bs
import urllib.request
import urllib.parse
source = urllib.request.urlopen('https://www.census.gov/data/tables/2016/demo/popest/state-total.html').read()
soup = bs.BeautifulSoup(source,'lxml')
filename = "UniqueWebLinks.csv"
f = open(filename, "w")
headers = "WebLinks\n"
f.write(headers)
all_links = soup.find_all('a')
url_set = set()
for link in all_links:
    web_links = link.get("href")
    ab_url = urllib.parse.urljoin('https://www.census.gov/data/tables/2016/demo/popest/state-total.html', web_links)
    print(ab_url)
    if ab_url and ab_url not in url_set:
        f.write(str(ab_url) + "\n")
        url_set.add(ab_url)
I'd keep it simple and be very explicit about how you're cleaning URLs. For example, strip the last character if it's a slash (/) or a hash (#) (if a URL ends with a hash, it's the same as it not ending with a hash). After glancing at the data, I'd also remove any blank URLs because that's probably not what you're looking for.
BASE_URL = 'https://www.census.gov/data/tables/2016/demo/popest/state-total.html'
all_links = soup.find_all('a')
def clean_links(tags, base_url):
    cleaned_links = set()
    for tag in tags:
        link = tag.get('href')
        if link is None:
            continue
        if link.endswith('/') or link.endswith('#'):
            link = link[:-1]  # drop the trailing slash/hash (note [:-1], not [-1])
        full_url = urllib.parse.urljoin(base_url, link)
        cleaned_links.add(full_url)
    return cleaned_links

cleaned_links = clean_links(all_links, BASE_URL)
for link in cleaned_links:
    f.write(str(link) + '\n')
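Because clean_links returns a set, the de-duplication happens automatically, so the manual url_set bookkeeping from the question is no longer needed.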

How can I use Python to extract information from a HTML document?

I need python to extract some data from a HTML file.
The code I am using at the moment is below:
import urllib
recent = urllib.urlopen('http://gamebattles.majorleaguegaming.com/ps4/call-of-duty-ghosts/team/TeamCrYpToNGamingEU/match?id=46057240')
recentsource = recent.read()
I now need this to print a list of the gamertags in the table on that webpage for the other team.
How can I do this?
Thanks
Look at the Beautiful Soup module, which is a wonderful text parser.
If you do not want to or can't install it, you can ship the source code with your program: download and extract the code from the website, and copy the "bs4" directory into the same folder as your Python script.
Then, put this in the beginning of your code:
from bs4 import BeautifulSoup
# or
from bs4 import BeautifulSoup as bs
# To type bs instead of BeautifulSoup every single time you use it
You can learn how to use it from other stackoverflow questions or look at the documentation
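For this particular task, a minimal sketch with BeautifulSoup might look like the following. The table and cell handling here are assumptions; you'd need to inspect the page source to match the real markup:

import urllib
from bs4 import BeautifulSoup

recent = urllib.urlopen('http://gamebattles.majorleaguegaming.com/ps4/call-of-duty-ghosts/team/TeamCrYpToNGamingEU/match?id=46057240')
soup = BeautifulSoup(recent.read(), 'html.parser')

# Hypothetical: assumes each roster row keeps the gamertag in its first <td>;
# adjust the find_all arguments to the page's actual structure.
for table in soup.find_all('table'):
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if cells:
            print cells[0].get_text(strip=True)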
You can use html2text for this job, or you can use nltk.
Sample code:
import nltk
from urllib import urlopen
url = "http://any-url"
html = urlopen(url).read()
raw = nltk.clean_html(html)
print(raw)
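
Note that nltk.clean_html was removed in NLTK 3; current versions raise NotImplementedError and recommend BeautifulSoup instead. The rough equivalent there (a sketch) is:

from bs4 import BeautifulSoup
raw = BeautifulSoup(html, 'html.parser').get_text()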
pyparsing has some helpful constructs for pulling data from HTML pages, and the results tend to be self-structuring and self-naming (if you set up the parser/scanner correctly). Here is a pyparsing solution for this particular web page:
from pyparsing import *
# for stripping HTML tags
anyTag,anyClose = makeHTMLTags(Word(alphas,alphanums+":_"))
commonHTMLEntity.setParseAction(replaceHTMLEntity)
stripHTML = lambda tokens: (commonHTMLEntity | Suppress(anyTag | anyClose) ).transformString(''.join(tokens))
# make pyparsing expressions for HTML opening and closing tags
# (suppress all from results, as there is no interesting content in the tags or their attributes)
h3,h3End = map(Suppress,makeHTMLTags("h3"))
table,tableEnd = map(Suppress,makeHTMLTags("table"))
tr,trEnd = map(Suppress,makeHTMLTags("tr"))
th,thEnd = map(Suppress,makeHTMLTags("th"))
td,tdEnd = map(Suppress,makeHTMLTags("td"))
# nothing interesting in column headings - parse them, but suppress the results
colHeading = Suppress(th + SkipTo(thEnd) + thEnd)
# simple routine for defining data cells, with optional results name
colData = lambda name='' : td + SkipTo(tdEnd)(name) + tdEnd
playerListing = Group(tr + colData() + colData() +
                      colData("username") +
                      colData().setParseAction(stripHTML)("role") +
                      colData("networkID") +
                      trEnd)

teamListing = (h3 + ungroup(SkipTo("Match Players" + h3End, failOn=h3))("name") + "Match Players" + h3End +
               table + tr + colHeading*5 + trEnd +
               Group(OneOrMore(playerListing))("players"))

for team in teamListing.searchString(recentsource):
    # use this to print out names and structures of results
    #print team.dump()
    print "Team:", team.name
    for player in team.players:
        print "- %s: %s (%s)" % (player.role, player.username, player.networkID)
        # or like this
        # print "- %(role)s: %(username)s (%(networkID)s)" % player
    print
Prints:
Team: Team CrYpToN Gaming EU
- Leader: CrYpToN_Crossy (CrYpToN_Crossy)
- Captain: Juddanorty (CrYpToN_Judd)
- Member: BLaZe_Elfy (CrYpToN_Elfy)
Team: eXCeL™
- Leader: Caaahil (Caaahil)
- Member: eSportsmanship (eSportsmanship)
- Member: KillBoy-NL (iClown-x)

Python HTML parsing script that takes array of URLs and outputs specific data about each of the URLs

I am trying to write an HTML parser in Python that takes as its input a URL or list of URLs and outputs specific data about each of those URLs in the format:
URL: data1: data2
The data points can be found at the exact same HTML node in each of the URLs. They are consistently between the same starting and ending tags. If anyone out there would like to help an amateur Python programmer get the job done, it would be greatly appreciated. Extra points if you can come up with a way to output the information so that it can be easily copied and pasted into an Excel document for subsequent data analysis!
For example, let's say I would like to output the view count for a particular YouTube video. For the URL http://www.youtube.com/watch?v=QOdW1OuZ1U0, the view count is around 3.6 million. For all YouTube videos, this number is found in the following format within the page's source:
<span class="watch-view-count ">
3,595,057
</span>
Fortunately, these exact tags are found only once on a particular YouTube video's page. These starting and ending tags can be inputted into the program or built-in and modified when necessary. The output of the program would be:
http://www.youtube.com/watch?v=QOdW1OuZ1U0: 3,595,057 (or 3595057).
import urllib2
from bs4 import BeautifulSoup
url = 'http://www.youtube.com/watch?v=QOdW1OuZ1U0'
f = urllib2.urlopen(url)
data = f.read()
soup = BeautifulSoup(data)
span = soup.find('span', attrs={'class':'watch-view-count'})
print '{}:{}'.format(url, span.text)
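If you want the count as a plain number for a spreadsheet, stripping the commas and whitespace works (a small sketch on top of the answer above):

count = int(span.text.strip().replace(',', ''))
print '{}:{}'.format(url, count)  # http://www.youtube.com/watch?v=QOdW1OuZ1U0:3595057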
If you do not want to use BeautifulSoup, you can use re:
import urllib2
import re
url = 'http://www.youtube.com/watch?v=QOdW1OuZ1U0'
f = urllib2.urlopen(url)
data = f.read()
pattern = re.compile('<span class="watch-view-count.*?([\d,]+).*?</span>', re.DOTALL)
r = pattern.search(data)
print '{}:{}'.format(url, r.group(1))
As for the output, I think you can store it in a CSV file.
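For example, a small sketch using the standard csv module (Python 2, hence the 'wb' mode), assuming you collect (url, count) pairs while scraping:

import csv

rows = [(url, r.group(1))]  # assumed: one (url, view-count) pair per page scraped
with open('counts.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerow(['url', 'views'])
    writer.writerows(rows)

Opening counts.csv in Excel then gives one row per URL.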
I prefer HTMLParser over re for this type of task. However, HTMLParser can be a bit tricky. I use shared module-level lists to store data... I'm sure this is the wrong way of doing it, but it's worked for me on several projects in the past.
import urllib2
from HTMLParser import HTMLParser
import csv

position = []
results = [""]

class hp(HTMLParser):
    def handle_starttag(self, tag, attrs):
        if tag == 'span' and ('class', 'watch-view-count ') in attrs:
            position.append('bingo')
    def handle_endtag(self, tag):
        if tag == 'span' and 'bingo' in position:
            position.remove('bingo')
    def handle_data(self, data):
        if 'bingo' in position:
            results[0] += " " + data.strip() + " "

my_pages = ["http://www.youtube.com/watch?v=QOdW1OuZ1U0"]
data = []

for url in my_pages:
    response = urllib2.urlopen(url)
    page = str(response.read())
    parser = hp()
    parser.feed(page)
    data.append(results[0])
    # reinitialize the shared lists
    position = []
    results = [""]

index = 0
with open('/path/to/test.csv', 'wb') as f:
    writer = csv.writer(f)
    header = ['url', 'output']
    writer.writerow(header)
    for d in data:
        row = [my_pages[index], data[index]]
        writer.writerow(row)
        index += 1
Then just open /path/to/test.csv in Excel
