I am a new Python user. I'm getting the following error when trying to run my code on PythonAnywhere, despite it working fine on my local PC.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/zachfeatherstone/imputations.py", line 7, in <module>
html = urlopen(url).read()
File "/usr/local/lib/python3.9/urllib/request.py", line 214, in urlopen
return opener.open(url, data, timeout)
File "/usr/local/lib/python3.9/urllib/request.py", line 517, in open
response = self._open(req, data)
File "/usr/local/lib/python3.9/urllib/request.py", line 534, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "/usr/local/lib/python3.9/urllib/request.py", line 494, in _call_chain
result = func(*args)
File "/usr/local/lib/python3.9/urllib/request.py", line 1389, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "/usr/local/lib/python3.9/urllib/request.py", line 1349, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error Tunnel connection failed: 403 Forbidden>
It's similar to: urllib.request.urlopen: ValueError: unknown url type.
CODE
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import os.path
url = input("Enter the URL you want to analyse: ")
html = urlopen(url).read()
soup = BeautifulSoup(html, features="html.parser")
# kill all script and style elements
for script in soup(["script", "style"]):
    script.extract()    # rip it out
# get text
text = soup.get_text()
# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
# drop blank lines
text = '\n'.join(chunk for chunk in chunks if chunk)
wordList = text.split()
#imputations
imputations = []
if "unprofessional" in wordList:
    unprofessional_imputation = "That our Client is unprofessional."
    imputations.append(unprofessional_imputation)
print(imputations)
#print(wordList)
#print(text)
#saving file
save_path = 'C:/Users/team/downloads'
name_of_file = input("What do you want to save the file as? ")
completeName = os.path.join(save_path, name_of_file+".txt")
f = open(completeName, "w")
# traverse paragraphs from soup
for words in imputations:
    f.write(words)
    f.write("\n")
My apologies if this has been answered before. How do I manage to run this in PythonAnywhere so that I can deploy it over the web?
It COULD be helpful to send header info along with your request. You can build a Request object that carries headers and pass it to urlopen like this:
import urllib.request
from bs4 import BeautifulSoup

url = input("Enter the URL you want to analyse: ")
header = {'User-Agent': 'Gandalf'}
req = urllib.request.Request(url, None, header)
html = urllib.request.urlopen(req)
html = html.read()
soup = BeautifulSoup(html, features="html.parser")
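If the site still refuses the request after adding a header, it helps to surface exactly what came back. A minimal sketch (the browser-like User-Agent string and the exit message are illustrative, not part of the original answer):

import urllib.request
import urllib.error

url = input("Enter the URL you want to analyse: ")
# Illustrative browser-like User-Agent; many sites reject the default "Python-urllib/3.x"
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
try:
    html = urllib.request.urlopen(req).read()
except urllib.error.URLError as e:
    # e.g. "Tunnel connection failed: 403 Forbidden", as in the traceback above
    raise SystemExit("Could not fetch {}: {}".format(url, e))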
Related
I am attempting to make a program that downloads a series of product pictures from a site using Python. The site stores its images under a certain URL format, https://www.sitename.com/XYZabcde, where XYZ are three letters that represent the brand of the product and abcde is a series of numbers between 00000 and 30000.
Here is my code:
import urllib.request
def down(i, inp):
    full_path = 'images/image-{}.jpg'.format(i)
    url = "https://www.sitename.com/{}{}.jpg".format(inp,i)
    urllib.request.urlretrieve(url, full_path)
    print("saved")
    return None
inp = input("brand :" )
i = 20100
while i <= 20105:
    x = str(i)
    y = x.zfill(5)
    z = "https://www.sitename.com/{}{}.jpg".format(inp,y)
    print(z)
    down(y, inp)
    i += 1
With the code I have written I can successfully download a series of pictures that I know exist; for example, brand RVL from 20100 to 20105 will successfully download those six pictures.
However, when I broaden the while loop to include links that I don't know will give me an image, I get this error:
Traceback (most recent call last):
File "c:/Users/euan/Desktop/university/programming/Python/parser/test - Copy.py", line 20, in <module>
down(y, inp)
File "c:/Users/euan/Desktop/university/programming/Python/parser/test - Copy.py", line 6, in down
urllib.request.urlretrieve(url, full_path)
File "C:\Users\euan\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 247, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "C:\Users\euan\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\euan\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 531, in open
response = meth(req, response)
File "C:\Users\euan\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 640, in http_response
response = self.parent.error(
File "C:\Users\euan\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 569, in error
return self._call_chain(*args)
File "C:\Users\euan\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 502, in _call_chain
result = func(*args)
File "C:\Users\euan\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
What can I do to check for and avoid any URL that would yield this result?
You cannot as such know in advance which URLs you don't have access to, but you can surround the download with a try-except:
import urllib.request, urllib.error
...
def down(i, inp):
    full_path = 'images/image-{}.jpg'.format(i)
    url = "https://www.sitename.com/{}{}.jpg".format(inp,i)
    try:
        urllib.request.urlretrieve(url, full_path)
        print("saved")
    except urllib.error.HTTPError as e:
        print("failed:", e)
    return None
In that case it will just print e.g. "failed: HTTP Error 403: Forbidden" whenever a URL cannot be fetched, and the program will continue.
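For completeness, a rough sketch of how the question's existing while loop could drive that version of down(), so that missing images are simply skipped:

inp = input("brand :")

i = 20100
while i <= 30000:
    y = str(i).zfill(5)
    down(y, inp)   # prints "saved" or "failed: HTTP Error 403: Forbidden" and carries on
    i += 1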
So I have been following along with some videos to learn Python, but can't get rid of this error. I have experience in other languages, so I'm usually fine fixing errors, but no matter what I do, I either get the same error or something different.
I've tried switching the argument from 'xml' to 'lxml', but this only changes the error that I get.
from bs4 import BeautifulSoup
import urllib.request
req = urllib.request.urlopen('http://pythonprogramming.net/')
xml = BeautifulSoup(req, 'xml')
for item in xml.findAll('link'):
    url = item.text
    news = urllib.request.urlopen(url).read()
    print(news)
Ideally, this would print out some of the text within link tags, but instead, I get the following errors -
Error while using xml -
File "/Users/rodrigo/Desktop/ALL/Programming/Python/Python Web Programming/Working with HTML/scrapingParagraphData.py", line 13, in <module>
news = urllib.request.urlopen(url).read()
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 525, in open
response = self._open(req, data)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 548, in _open
'unknown_open', req)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 503, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 1387, in unknown_open
raise URLError('unknown url type: %s' % type)
urllib.error.URLError: <urlopen error unknown url type: #media (min-width>
Error while using lxml -
File "/Users/rodrigo/Desktop/ALL/Programming/Python/Python Web Programming/Working with HTML/scrapingParagraphData.py", line 13, in <module>
news = urllib.request.urlopen(url).read()
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 510, in open
req = Request(fullurl, data)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 328, in __init__
self.full_url = url
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 354, in full_url
self._parse()
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 383, in _parse
raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: ''
Your current code targets link elements and extracts their text rather than the href attribute, so there is no known protocol to work with.
Even if you extracted the href, most of them are relative, so you would still have a problem with an unknown protocol.
item['href'] would give:
/static/favicon.ico
/static/css/materialize.min.css
https://fonts.googleapis.com/icon?family=Material+Icons
/static/css/bootstrap.css
I don't think you are after these types of links. If you are after the tutorial links, then you need something that targets those elements, e.g.
tutorial_links = ['https://pythonprogramming.net' + i['href'] for i in xml.select('.waves-light.btn')]
I would also rename the variable holding the BeautifulSoup(req, 'lxml') result, e.g.:
from bs4 import BeautifulSoup
import urllib.request
req = urllib.request.urlopen('http://pythonprogramming.net/')
soup = BeautifulSoup(req, 'lxml')
tutorial_links = ['https://pythonprogramming.net' + i['href'] for i in soup.select('.waves-light.btn')]
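As a follow-up sketch continuing the snippet above (and assuming the '.waves-light.btn' selector really does match the tutorial buttons), urllib.parse.urljoin is another way to turn the relative hrefs into absolute URLs before fetching them:

from urllib.parse import urljoin

base = 'http://pythonprogramming.net/'
tutorial_links = [urljoin(base, i['href']) for i in soup.select('.waves-light.btn')]
for link in tutorial_links:
    news = urllib.request.urlopen(link).read()
    print(news[:200])  # first 200 bytes, just to confirm each fetch worked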
I am trying to do the following:
open a CSV file containing a list with URLs (GET-Requests)
read the CSV file and write the entries to a list
open every single URL and read the answer
write the answers back to a new CSV file
I get the following error:
Traceback (most recent call last):
File "C:\Users\l.buinui\Desktop\request2.py", line 16, in <module>
req = urllib2.urlopen(url)
File "C:\Python27\lib\urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 404, in open
response = self._open(req, data)
File "C:\Python27\lib\urllib2.py", line 427, in _open
'unknown_open', req)
File "C:\Python27\lib\urllib2.py", line 382, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 1247, in unknown_open
raise URLError('unknown url type: %s' % type)
URLError: <urlopen error unknown url type: ['http>
Here is the code I am using:
import urllib2
import urllib
import csv
# Open and read the source file and write entries to a list called link_list
source_file=open("source_new.csv", "rb")
source = csv.reader(source_file, delimiter=";")
link_list = [row for row in source]
source_file.close()
# Create an output file which contains the answer of the GET-Request
out=open("output.csv", "wb")
output = csv.writer(out, delimiter=";")
for row in link_list:
    url = str(row)
    req = urllib2.urlopen(url)
    output.writerow(req.read())
out.close()
What is going wrong there?
Thanks in advance for any hints.
Cheers
Using the row variable passes a whole list (which contains only one element, the URL) to urlopen, whereas passing row[0] passes the string containing the URL.
The csv.reader returns a list for each row it reads, no matter how many items are in the row.
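To see why the error message ends in ['http>, it may help to compare what the two expressions actually hand to urlopen (a tiny sketch with a made-up URL):

row = ['http://example.com/page']   # what csv.reader yields for a one-column row
print str(row)    # ['http://example.com/page']  -- a stringified list, not a URL
print row[0]      # http://example.com/page      -- the URL itself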
It's working now. If I directly reference row[0] in the loop there are no problems.
import urllib2
import urllib
import csv
# Open and read the source file and write entries to a list called link_list
source_file=open("source.csv", "rb")
source = csv.reader(source_file, delimiter=";")
link_list = [row for row in source]
source_file.close()
# Create an output file which contains the answer of the GET-Request
out=open("output.csv", "wb")
output = csv.writer(out)
for row in link_list:
    req = urllib2.urlopen(row[0])
    answer = req.read()
    output.writerow([answer])
out.close()
I am creating a web scraping script and divided it into four pieces. Separately they all work perfectly, however when I put them all together I get the following error: urlopen error [Errno 111] Connection refused. I have looked at similar questions to mine and have tried to catch the error with try-except, but even that doesn't work. My all-in-one code is:
from selenium import webdriver
import re
import urllib2
site = ""
def phone():
    global site
    site = "https://www." + site
    if "spokeo" in site:
        browser = webdriver.Firefox()
        browser.get(site)
        content = browser.page_source
        browser.quit()
        m_obj = re.search(r"(\(\d{3}\)\s\d{3}-\*{4})", content)
        if m_obj:
            print m_obj.group(0)
    elif "addresses" in site:
        usock = urllib2.urlopen(site)
        data = usock.read()
        usock.close()
        m_obj = re.search(r"(\(\d{3}\)\s\d{3}-\d{4})", data)
        if m_obj:
            print m_obj.group(0)
    else:
        usock = urllib2.urlopen(site)
        data = usock.read()
        usock.close()
        m_obj = re.search(r"(\d{3}-\s\d{3}-\d{4})", data)
        if m_obj:
            print m_obj.group(0)

def pipl():
    global site
    url = "https://pipl.com/search/?q=tom+jones&l=Phoenix%2C+AZ%2C+US&sloc=US|AZ|Phoenix&in=6"
    usock = urllib2.urlopen(url)
    data = usock.read()
    usock.close()
    r_list = [#re.compile("spokeo.com/[^\s]+"),
              re.compile("addresses.com/[^\s]+"),
              re.compile("10digits.us/[^\s]+")]
    for r in r_list:
        match = re.findall(r, data)
        for site in match:
            site = site[:-6]
            print site
            phone()
pipl()
Here is my traceback:
Traceback (most recent call last):
File "/home/lazarov/.spyder2/.temp.py", line 48, in <module>
pipl()
File "/home/lazarov/.spyder2/.temp.py", line 46, in pipl
phone()
File "/home/lazarov/.spyder2/.temp.py", line 25, in phone
usock = urllib2.urlopen(site)
File "/usr/lib/python2.7/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 400, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 418, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 378, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1215, in https_open
return self.do_open(httplib.HTTPSConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1177, in do_open
raise URLError(err)
urllib2.URLError: <urlopen error [Errno 111] Connection refused>
After manually debugging the code I found that the error comes from the function phone(), so I tried to run just that piece:
import re
import urllib2
url = 'http://www.10digits.us/n/Tom_Jones/Phoenix_AZ/1fe293a0b7'
usock = urllib2.urlopen(url)
data = usock.read()
usock.close()
m_obj = re.search(r"(\d{3}-\d{3}-\d{4})", data)
if m_obj:
    print m_obj.group(0)
And it worked. Which, I believe, shows it's not that the firewall is actively denying the connection, or that the respective service is not started on the other site or is overloaded. Any help would be appreciated.
Usually the devil is in the detail.
According to your traceback...
File "/usr/lib/python2.7/urllib2.py", line 1215, in https_open
return self.do_open(httplib.HTTPSConnection, req)
and your source code...
site = "https://www." + site
...I may suppose that in your code you are trying to access https://www.10digits.us/n/Tom_Jones/Phoenix_AZ/1fe293a0b7 whereas in your test you are connecting to http://www.10digits.us/n/Tom_Jones/Phoenix_AZ/1fe293a0b7.
Try replacing the https with http (at least for www.10digits.us): probably the website you are trying to scrape does not respond on port 443 but only on port 80 (you can check this with your browser as well).
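One way to follow that advice without hard-coding which hosts need which scheme is to try https first and fall back to http. A rough sketch in the same Python 2 / urllib2 style as the question (the helper name is mine):

import urllib2

def fetch(bare_site):
    # bare_site is e.g. "10digits.us/n/Tom_Jones/Phoenix_AZ/1fe293a0b7"
    last_error = None
    for prefix in ("https://www.", "http://www."):
        try:
            return urllib2.urlopen(prefix + bare_site).read()
        except urllib2.URLError, e:   # includes "Connection refused"
            last_error = e
    raise last_error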
I've got a list of 100 websites in CSV format. All of the sites have the same general format, including a large table with 7 columns. I wrote this script to extract the data from the 7th column of each of the websites and then write this data to file. The script below partially works; however, opening the output file after running the script shows that something is being skipped, because it contains only 98 rows (clearly the script also hits a number of exceptions). Guidance on how to implement exception handling in this context would be much appreciated. Thank you!
import csv, urllib2, re
def replace(variab): return variab.replace(",", " ")
urls = csv.reader(open('input100.txt', 'rb')) #access list of 100 URLs
for url in urls:
    html = urllib2.urlopen(url[0]).read() #get HTML starting with the first URL
    col7 = re.findall('td7.*?td', html) #use regex to get data from column 7
    string = str(col7) #stringify data
    neat = re.findall('div3.*?div', string) #use regex to get target text
    result = map(replace, neat) #apply function to remove ','s from elements
    string2 = ", ".join(result) #separate list elements with ', ' for export to csv
    output = open('output.csv', 'ab') #open file for writing
    output.write(string2 + '\n') #append output to file and create new line
    output.close()
Return:
Traceback (most recent call last):
File "C:\Python26\supertest3.py", line 6, in <module>
html = urllib2.urlopen(url[0]).read()
File "C:\Python26\lib\urllib2.py", line 124, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python26\lib\urllib2.py", line 383, in open
response = self._open(req, data)
File "C:\Python26\lib\urllib2.py", line 401, in _open
'_open', req)
File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
result = func(*args)
File "C:\Python26\lib\urllib2.py", line 1130, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "C:\Python26\lib\urllib2.py", line 1103, in do_open
r = h.getresponse()
File "C:\Python26\lib\httplib.py", line 950, in getresponse
response.begin()
File "C:\Python26\lib\httplib.py", line 390, in begin
version, status, reason = self._read_status()
File "C:\Python26\lib\httplib.py", line 354, in _read_status
raise BadStatusLine(line)
BadStatusLine
Make the body of your for loop into:
# (this needs "import sys" at the top of the script)
for url in urls:
    try:
        ...the body you have now...
    except Exception, e:
        print >> sys.stderr, "Url %r not processed: error (%s)" % (url, e)
(Or, use logging.error instead of the goofy print>>, if you're already using the logging module of the standard library [and you should;-)]).
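A minimal sketch of that logging variant, wrapped in a small helper (the file name and helper name are just illustrative):

import logging
import urllib2

logging.basicConfig(filename='scrape_errors.log', level=logging.ERROR)

def fetch(url):
    # Fetch a URL, logging (rather than printing) any failure and returning None.
    try:
        return urllib2.urlopen(url).read()
    except Exception, e:
        logging.error("Url %r not processed: error (%s)", url, e)
        return None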
I'd recommend reading the Errors and Exceptions Python documentation, especially section 8.3 -- Handling Exceptions.