I have a list of more than 1,000 URLs (links for downloading reports) saved in a .csv file.
Some of the URLs return a 404 error, and I want to find a way to remove them from the list.
I managed to write code (for Python 3) that identifies which URLs are invalid, shown below. However, I don't know how to remove those URLs from the list automatically, given that there are so many. Thank you!
from urllib.request import urlopen
from urllib.error import HTTPError

try:
    urlopen(url)  # url is a single URL string from the list
except HTTPError as err:
    if err.code == 404:
        print('invalid')
    else:
        raise
You can use another collection to save the 404 URLs (assuming there are fewer 404 URLs than valid ones), then take the set difference:
from urllib.request import urlopen
from urllib.error import HTTPError

exclude_urls = set()
for url in all_urls:
    try:
        urlopen(url)
    except HTTPError as err:
        if err.code == 404:
            exclude_urls.add(url)
valid_urls = set(all_urls) - exclude_urls
Consider that list A has all the URLs:
A.remove("invalid_url")
Note that list.remove() mutates the list in place and returns None, so writing A = A.remove(...) would replace your list with None.
You can do something like this:
from urllib.request import urlopen
from urllib.error import HTTPError

def load_data(csv_name):
    ...

def save_data(data, csv_name):
    ...

links = load_data(csv_name)
new_links = set()
for i in links:
    try:
        urlopen(i)  # check the actual link, not the literal string "url"
    except HTTPError as err:
        if err.code == 404:
            print('invalid')
    else:
        new_links.add(i)
save_data(list(new_links), csv_name)
Creating a loop and writing the valid URLs to a new CSV file from the else clause of the try (finally would also run for the failed ones) would be the easiest solution.
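A minimal sketch of that approach (the file names and the one-URL-per-row layout are assumptions):

import csv
from urllib.request import urlopen
from urllib.error import HTTPError

# Read the URLs, assuming one per row in the first column.
with open("urls.csv", newline="") as f:
    urls = [row[0] for row in csv.reader(f) if row]

valid = []
for url in urls:
    try:
        urlopen(url)
    except HTTPError as err:
        if err.code == 404:
            print("invalid:", url)
        else:
            raise
    else:
        valid.append(url)

# Write the surviving URLs to a new file.
with open("valid_urls.csv", "w", newline="") as f:
    csv.writer(f).writerows([u] for u in valid)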
I want to access mouser.com using urllib. When I try to fetch the data from the URL, it hangs indefinitely.
Here is the code:
import urllib.error
import urllib.request

try:
    htmls = urllib.request.urlopen("https://www.mouser.com/")
except urllib.error.HTTPError as e:
    print("HTTP ERROR")
except urllib.error.URLError as e:
    print("URL ERROR")
else:
    print(htmls.read().decode("utf-8"))
This piece of code works fine for most URLs, but for some, like Mouser or element14, it doesn't.
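Not from the thread, but a common cause: some sites reject or simply stall connections that present urllib's default Python-urllib User-Agent. A sketch that sends a browser-like header and a timeout (both values below are assumptions) often gets a response instead of a hang:

import urllib.error
import urllib.request

req = urllib.request.Request(
    "https://www.mouser.com/",
    headers={"User-Agent": "Mozilla/5.0"},  # assumed browser-like value
)
try:
    # the timeout makes the call fail loudly instead of hanging forever
    with urllib.request.urlopen(req, timeout=10) as htmls:
        print(htmls.read().decode("utf-8"))
except urllib.error.HTTPError as e:
    print("HTTP ERROR", e.code)
except urllib.error.URLError as e:
    print("URL ERROR", e.reason)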
I am trying to define a function for a list of URLs (job_title_links); the function is intended to print an error when a link, or its server, is not found:
This is what I've got so far:
from urllib.error import URLError
from urllib.error import HTTPError
from urllib.request import urlopen
job_title_links = ['https://www.salario.com.br/profissao/abacaxicultor-cbo-612510/',
                   'https://www.salario.com.br/profissao/abade-cbo-263105/',
                   'https://www.salario.com.br/profissao/abanador-na-agricultura-cbo-622020/']
def try_url_exist(links):
    for link in links:  # iterate over the parameter, not the global list
        try:
            html = urlopen(link)
        except HTTPError as e:
            print(e)  # URL not found
        except URLError as e:
            print(e)  # server not found

try_url_exist(job_title_links)
However, the function prints HTTP Error 403 even when the URLs exist.
Console output:
HTTP Error 403: Forbidden
HTTP Error 403: Forbidden
HTTP Error 403: Forbidden
The expected output is for the function to do nothing when a URL exists, and to print either the HTTPError or the URLError along with the URL when it does not.
How could I accomplish this task?
By changing urlopen() to requests.get() from the requests library and collecting the working links in a list, the code worked.
import requests

def try_url_exist(links):
    functional_links = []
    for link in links:
        try:
            html = requests.get(link)
            html.raise_for_status()  # turn 4xx/5xx responses into exceptions
        except requests.exceptions.RequestException as e:
            print(e)
        else:
            functional_links.append(link)
            print(link)
    return functional_links

functional_links = try_url_exist(job_title_links)
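For what it's worth, requests.get() never raises urllib's HTTPError or URLError: network failures raise requests' own exceptions, and HTTP error statuses such as 403 only become exceptions if you call raise_for_status(). requests also sends a different default User-Agent than urllib, which is likely why the 403 disappeared here.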
I have a list of image links that I'm downloading. After about a minute it hangs: it downloads no more links, shows no errors, and doesn't get interrupted.
This is my code; is there something wrong with it?
import requests
import os
from tqdm import tqdm
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin, urlparse
from random import randint
import urllib.request
from urllib.error import HTTPError
from urllib.error import URLError

def random_with_N_digits(n):
    range_start = 10**(n-1)
    range_end = (10**n)-1
    return randint(range_start, range_end)

def download2(url, pathname):
    # prefix the file name with a random 10-digit number to avoid collisions
    filename = os.path.join(pathname, str(random_with_N_digits(10)) + '-' + url.split("/")[-1])
    urllib.request.urlretrieve(url, filename)

done = []
failed = []
for i in urls:
    if i not in done:
        if i not in failed:
            try:
                download2(i, 'C:/Users/USER/Desktop/downloaded/')
                print(i)
                done.append(i)
            except (HTTPError, URLError) as e:
                print('exception at', i)
                print(e)
                failed.append(i)
UPDATE:
Based on the comments, I printed each URL before downloading it; each run stops at a different URL:
First trial:
for i in urls:
    print("for i in urls", i)
    if i not in done:
        if i not in failed:
            try:
                download2(i, 'C:/Users/abdelrahman.fathy/Desktop/downloaded/')
                print(i)
                done.append(i)
            except (HTTPError, URLError, OSError) as e:
                print('exception at', i)
                print(e)
                failed.append(i)
for i in urls http://amphenol.co.il/wp-content/uploads/2017/06/iso-9001-1.jpg
exception at http://amphenol.co.il/wp-content/uploads/2017/06/iso-9001-1.jpg
HTTP Error 410: Gone
for i in urls http://apscopower.com/about_us/iso/files/stacks-image-25baec8-606x800.png
Second trial's output:
http://image.hanrun.com/image/94db9804-fb66-41b2-a6c3-5a38c8b9eb69.jpg
for i in urls http://images02.cdn86.net/kps01/M00/2A/9C/wKiAiVRlaxrc7XGAAAFnIr8s0z4434.jpg
http://images02.cdn86.net/kps01/M00/2A/9C/wKiAiVRlaxrc7XGAAAFnIr8s0z4434.jpg
for i in urls http://images02.cdn86.net/kps01/M00/31/EB/wKiAiVR7y_CMgBErAAFz2tipXbA040.jpg
UPDATE 2:
urls = []
for i in df.imgsrc:
    if i.strip() not in urls:
        urls.append(i.strip())

len(urls)
2466
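Not an answer from the thread, but one plausible cause of the silent hang: urlretrieve() takes no timeout argument, so a server that accepts the connection and then stops responding can block the loop forever. It does honor the global socket default timeout, so a sketch along these lines (the 15-second value and the placeholder names are assumptions) turns a stall into a catchable error:

import socket
import urllib.request
from urllib.error import URLError

# urlretrieve() honors the global socket default timeout; without one,
# a silent server stalls the download loop indefinitely.
socket.setdefaulttimeout(15)  # assumed value; tune as needed

url = "http://example.com/image.jpg"  # placeholder
filename = "image.jpg"                # placeholder

try:
    urllib.request.urlretrieve(url, filename)
except (socket.timeout, URLError) as e:  # timeouts may arrive wrapped in URLError
    print("timed out or failed at", url, e)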
I'm following the book Web Scraping with Python. I'm in a virtual environment with Python 3.4.3 on OS X, and the BeautifulSoup library is installed. When I run this:
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib.error import HTTPError

html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
except urllib.error.HTTPError as e:
    print(e.code)
if html is none:
    print("url is not found")
else:
    bsObj = BeautifulSoup(html.read());
    print(bsObj)
When I run it, I get the following error:
(scrapingEnv)Macintosh:scrapingenv nicolas$ python3 scrapetest.py
  File "scrapetest.py", line 6
    except urllib.error.HTTPError as e:
    ^
SyntaxError: invalid syntax
I also tried "except urllib.HTTPError" on line 6, without any success.
What am I doing wrong ?
You are missing your try statement
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib.error import HTTPError

html = None  # so the name exists even if urlopen() raises
try:
    html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
except HTTPError as e:  # HTTPError was imported directly, so no urllib.error prefix
    print(e.code)

if html is None:
    print("url is not found")
Edit: you should also change none to None
I used Python's urllib2.urlopen and got a 500 error from the server. How do I find the text of the error? I'm hoping that it has useful information.
from urllib2 import urlopen, HTTPError

try:
    f = urlopen(url)
except HTTPError, e:   # Python 2 syntax, matching urllib2
    print(e.read())    # the body of the error response often explains the failure
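For reference, urllib2 is Python 2 only; in Python 3 it was split into urllib.request and urllib.error, so a sketch of the equivalent (the url value is a placeholder) looks like:

from urllib.request import urlopen
from urllib.error import HTTPError

url = "http://example.com/report"  # placeholder URL

try:
    f = urlopen(url)
except HTTPError as e:
    # HTTPError doubles as a response object: read() returns the body
    # the server sent with the error status.
    print(e.read())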