urllib is mangling my URLs - Python

I'm writing a little scraper. Here's the code so far.
from urllib import urlopen
from BeautifulSoup import BeautifulSoup

# Fetch the product family page and parse it.
soup = BeautifulSoup(
    urlopen('http://www.high-rely.com/HR3/includes/ProductFamily.php').read()
)
# Grab every <a class="visible_link"> and build the sub-page URLs.
links = soup.findAll('a', 'visible_link')
hrefs = ['www.high-rely.com' + relative for relative in [x['href'] for x in links]]
subpages = map(BeautifulSoup, [urlopen(x).read() for x in hrefs])
When I run it though, I get the following error.
Traceback (most recent call last):
File "C:/Users/josh.SCL/Desktop/Scraper.py", line 13, in <module>
subpages = map(BeautifulSoup, [urlopen(x).read() for x in hrefs])
File "C:\Python27\lib\urllib.py", line 84, in urlopen
return opener.open(url)
File "C:\Python27\lib\urllib.py", line 205, in open
return getattr(self, name)(url)
File "C:\Python27\lib\urllib.py", line 461, in open_file
return self.open_local_file(url)
File "C:\Python27\lib\urllib.py", line 475, in open_local_file
raise IOError(e.errno, e.strerror, e.filename)
IOError: [Errno 2] The system cannot find the path specified: 'www.high-rely.com\\HR3\\includes\\products\\5MinOverview.php'
If I loop through hrefs, I get this.
www.high-rely.com/HR3/includes/products/5MinOverview.php
www.high-rely.com/HR3/includes/products/10MinOverview.php
www.high-rely.com/HR3/includes/products/30MinOverview.php
www.high-rely.com/HR3/includes/HighRely/HighRely.php
www.high-rely.com/HR3/includes/HighRely/HighRely.php
www.high-rely.com/HR3/includes/RAIDFrame/RAIDFrame.php
www.high-rely.com/HR3/includes/RAIDFrame/RAIDFrame.php
www.high-rely.com/HR3/includes/MPac/MPac.php
www.high-rely.com/HR3/includes/MPac/MPac.php
www.high-rely.com/HR3/includes/BNAS/BNAS-HRS201.php
www.high-rely.com/HR3/includes/announcements.php
These look correct to me, so what's going on here?

You forgot the http:// prefix. Without a scheme, urllib falls back to its local-file handler and treats the string as a Windows path, which is why the traceback goes through open_local_file and shows backslashes:
hrefs = ['http://www.high-rely.com' + relative for relative in [x['href'] for x in links]]
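More generally, you can let the standard library resolve relative links against the page they came from instead of pasting strings together; a minimal sketch with urlparse.urljoin (Python 2, reusing the question's links):
from urlparse import urljoin

base = 'http://www.high-rely.com/HR3/includes/ProductFamily.php'
# urljoin resolves each href against the page it was scraped from,
# yielding absolute URLs complete with the http:// scheme.
hrefs = [urljoin(base, x['href']) for x in links]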

Related

MemoryError when parsing XML file

I am trying to find a specific tag in an XML file, and I used BeautifulSoup to read it. It produces the following error:
soup = BeautifulSoup(XML, 'xml')
Traceback (most recent call last):
File "<ipython-input-5-f431fabb5903>", line 1, in <module>
soup = BeautifulSoup(XML, 'xml')
File "D:\software\Anaconda3\envs\py37\lib\site-packages\bs4\__init__.py", line 362, in __init__
self._feed()
File "D:\software\Anaconda3\envs\py37\lib\site-packages\bs4\__init__.py", line 448, in _feed
self.builder.feed(self.markup)
File "D:\software\Anaconda3\envs\py37\lib\site-packages\bs4\builder\_lxml.py", line 203, in feed
markup = StringIO(markup)
MemoryError
The file is 353 MB, but the same code has parsed an even larger file without producing this error. Do you know what the problem is?
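One possible workaround: BeautifulSoup has to hold the whole document (plus a StringIO copy of it, per the traceback) in memory at once, so a file this size can fail even when a larger one happened to fit. A streaming parser keeps memory flat; a minimal sketch with lxml.etree.iterparse, using placeholder file and tag names:
from lxml import etree

# Stream the file element by element instead of building the whole tree.
# 'big.xml' and 'record' are placeholders for your file and target tag.
for event, elem in etree.iterparse('big.xml', tag='record'):
    print(elem.text)
    elem.clear()  # release each element after use to keep memory flat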

How do we convert HTML to PDF using Python? Is there any code you can share?

I have tried the library called pdftotree, but I didn't get anywhere.
This is the code:
import pdftotree
file = open('C:/Users/chaitanya.naidu/Downloads/test.pdf', 'rb')
f = pdftotree.parse(file)
I am getting this error
Traceback (most recent call last):
File "<ipython-input-4-4a9a6b72801d>", line 1, in <module>
f = pdftotree.parse(file)
File "C:\Users\chaitanya.naidu\AppData\Local\Continuum\Anaconda3\lib\site-packages\pdftotree\core.py", line 63, in parse
if not extractor.is_scanned():
File "C:\Users\chaitanya.naidu\AppData\Local\Continuum\Anaconda3\lib\site-packages\pdftotree\TreeExtract.py", line 121, in is_scanned
self.parse()
File "C:\Users\chaitanya.naidu\AppData\Local\Continuum\Anaconda3\lib\site-packages\pdftotree\TreeExtract.py", line 91, in parse
for page_num, layout in enumerate(analyze_pages(self.pdf_file)):
File "C:\Users\chaitanya.naidu\AppData\Local\Continuum\Anaconda3\lib\site-packages\pdftotree\utils\pdf\pdf_utils.py", line 117, in analyze_pages
with open(os.path.realpath(file_name), "rb") as fp:
File "C:\Users\chaitanya.naidu\AppData\Local\Continuum\Anaconda3\lib\ntpath.py", line 542, in abspath
path = os.fspath(path)
TypeError: expected str, bytes or os.PathLike object, not _io.BufferedReader
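The TypeError at the bottom of the traceback is the immediate failure: judging by the open(os.path.realpath(file_name), "rb") call, pdftotree.parse expects a file path rather than an already-opened file object, so passing the path directly should get past this error:
f = pdftotree.parse('C:/Users/chaitanya.naidu/Downloads/test.pdf')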
You can use pdfkit; for example:
import pdfkit
pdfkit.from_url('http://google.com', 'out.pdf')
pdfkit.from_file('test.html', 'out.pdf')
pdfkit.from_string('Hello!', 'out.pdf')
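Note that pdfkit is a wrapper around the wkhtmltopdf command-line tool, which must be installed separately for these calls to work.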

Python - BeautifulSoup error while scraping

UPDATE: Using lxml instead of html.parser helped solve the problem, as Freddier suggested in the answer below!
I am trying to webscrape some information off of this website: https://www.ticketmonster.co.kr/deal/952393926.
I get an error when I run soup(thispage, 'html.parser'), but this error only happens for this specific page. Does anyone know why this is happening?
The code I have so far is very simple:
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup

url = 'https://www.ticketmonster.co.kr/deal/952393926'
openU = urlopen(url)
thispage = openU.read()
openU.close()
pageS = soup(thispage, 'html.parser')
The error I get is:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Users\Kathy\AppData\Local\Programs\Python\Python36\lib\site-packages\bs4\__init__.py", line 228, in __init__
self._feed()
File "C:\Users\Kathy\AppData\Local\Programs\Python\Python36\lib\site- packages\bs4\__init__.py", line 289, in _feed
self.builder.feed(self.markup)
File "C:\Users\Kathy\AppData\Local\Programs\Python\Python36\lib\site-packages\bs4\builder\_htmlparser.py", line 215, in feed
parser.feed(markup)
File "C:\Users\Kathy\AppData\Local\Programs\Python\Python36\lib\html\parser.py", line 111, in feed
self.goahead(0)
File "C:\Users\Kathy\AppData\Local\Programs\Python\Python36\lib\html\parser.py", line 179, in goahead
k = self.parse_html_declaration(i)
File "C:\Users\Kathy\AppData\Local\Programs\Python\Python36\lib\html\parser.py", line 264, in parse_html_declaration
return self.parse_marked_section(i)
File "C:\Users\Kathy\AppData\Local\Programs\Python\Python36\lib\_markupbase.py", line 149, in parse_marked_section
sectName, j = self._scan_name( i+3, i )
File "C:\Users\Kathy\AppData\Local\Programs\Python\Python36\lib\_markupbase.py", line 391, in _scan_name
% rawdata[declstartpos:declstartpos+20])
File "C:\Users\Kathy\AppData\Local\Programs\Python\Python36\lib\_markupbase.py", line 34, in error
"subclasses of ParserBase must override error()")
NotImplementedError: subclasses of ParserBase must override error()
Please help!
Try using
pageS = soup(thispage, 'lxml')
instead of
pageS = soup(thispage, 'html.parser')
It looks like it may be a problem with character encoding when using "html.parser".
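Note that lxml is a third-party package; install it with pip install lxml before passing 'lxml' to BeautifulSoup.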

Finding specific text using BeautifulSoup

I'm trying to grab all the winner categories from this page:
http://www.chicagoreader.com/chicago/BestOf?category=4053660&year=2013
I've written this in Sublime:
import urllib2
from bs4 import BeautifulSoup
url = "http://www.chicagoreader.com/chicago/BestOf?category=4053660&year=2013"
page = urllib2.urlopen(url)
soup_package = BeautifulSoup(page)
page.close()
#find everything in the div class="bestOfItem". This works.
all_categories = soup_package.findAll("div",class_="bestOfItem")
# print(all_categories)
#this part breaks it:
soup = BeautifulSoup(all_categories)
winner = soup.a.string
print(winner)
When I run this in terminal, I get the following error:
Traceback (most recent call last):
File "winners.py", line 12, in <module>
soup = BeautifulSoup(all_categories)
File "build/bdist.macosx-10.9-intel/egg/bs4/__init__.py", line 193, in __init__
File "build/bdist.macosx-10.9-intel/egg/bs4/builder/_lxml.py", line 99, in prepare_markup
File "build/bdist.macosx-10.9-intel/egg/bs4/dammit.py", line 249, in encodings
File "build/bdist.macosx-10.9-intel/egg/bs4/dammit.py", line 304, in find_declared_encoding
TypeError: expected string or buffer
Anyone know what's happening there?
You are trying to create a new BeautifulSoup object from a list of elements.
soup = BeautifulSoup(all_categories)
There is absolutely no need to do this here; just loop over each match instead:
for match in all_categories:
    winner = match.a.string
    print(winner)
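If you want them all at once, the same loop collapses to a list comprehension:
winners = [match.a.string for match in all_categories]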

Python lib execute error

I made this Python lib, and it has this function that uses urllib and urllib2, but when I execute the lib's functions from the Python shell I get this error:
>>> from sabermanlib import geturl
>>> geturl("roblox.com","ggg.html")
Traceback (most recent call last):
File "<pyshell#11>", line 1, in <module>
geturl("roblox.com","ggg.html")
File "sabermanlib.py", line 21, in geturl
urllib.urlretrieve(Address,File)
File "C:\Users\Andres\Desktop\ddd\Portable Python 2.7.5.1\App\lib\urllib.py", line 94, in urlretrieve
return _urlopener.retrieve(url, filename, reporthook, data)
File "C:\Users\Andres\Desktop\ddd\Portable Python 2.7.5.1\App\lib\urllib.py", line 240, in retrieve
fp = self.open(url, data)
File "C:\Users\Andres\Desktop\ddd\Portable Python 2.7.5.1\App\lib\urllib.py", line 208, in open
return getattr(self, name)(url)
File "C:\Users\Andres\Desktop\ddd\Portable Python 2.7.5.1\App\lib\urllib.py", line 463, in open_file
return self.open_local_file(url)
File "C:\Users\Andres\Desktop\ddd\Portable Python 2.7.5.1\App\lib\urllib.py", line 477, in open_local_file
raise IOError(e.errno, e.strerror, e.filename)
IOError: [Errno 2] The system cannot find the file specified: 'roblox.com'
>>>
and here's the code for the lib I made:
import urllib
import urllib2

def geturl(Address, File):
    urllib.urlretrieve(Address, File)
EDIT 2
I can't understand why I get this error in the Python shell when executing:
geturl(Address, File)
The root problem is the missing scheme: without http://, urllib treats 'roblox.com' as a local file path (note the open_local_file call in the traceback). Also note that urllib.urlopen returns a file-like object you can read from:
>>> help(urllib.urlopen)
urlopen(url, data=None, proxies=None)
    Create a file-like object for the specified URL to read from.
Additionally, if you want to download and save a document, you'll need a more robust geturl function:
def geturl(Address, FileName):
    html_data = urllib.urlopen(Address).read()  # fetch the document from the URL
    with open(FileName, 'wb') as f:  # open the local file for writing
        f.write(html_data)  # write the downloaded data to it

geturl(u'http://roblox.com', 'ggg.html')  # URLs must contain the full URI, including http://
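The original urllib.urlretrieve(Address, File) call also works once the scheme is included: urllib.urlretrieve('http://roblox.com', 'ggg.html') downloads the page straight to the file.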
