I have a list of image links that I'm downloading. After about a minute the script hangs: it stops downloading further links, shows no errors, and is never interrupted.
This is my code. Is there something wrong with it?
import requests
import os
from tqdm import tqdm
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin, urlparse
from random import randint
import urllib.request
from urllib.error import HTTPError
from urllib.error import URLError

def random_with_N_digits(n):
    range_start = 10**(n-1)
    range_end = (10**n)-1
    return randint(range_start, range_end)

def download2(url, pathname):
    filename = os.path.join(pathname, str(random_with_N_digits(10)) + '-' + url.split("/")[-1])
    urllib.request.urlretrieve(url, filename)

done = []
failed = []
for i in urls:
    if i not in done:
        if i not in failed:
            try:
                download2(i, 'C:/Users/USER/Desktop/downloaded/')
                print(i)
                done.append(i)
            except (HTTPError, URLError) as e:
                print('exception at', i)
                print(e)
                failed.append(i)
                pass
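A likely cause of the hang is that urllib.request.urlretrieve() has no timeout parameter, so a stalled connection blocks forever without raising anything the except clause could catch. A minimal sketch, assuming the hang really is a stalled socket: set a global default socket timeout so a stuck download fails with an error instead of hanging.

import socket
import urllib.request
from urllib.error import HTTPError, URLError

# Every new socket, including the ones urlretrieve opens internally,
# now gives up after 30 seconds instead of blocking indefinitely.
socket.setdefaulttimeout(30)

def download_with_timeout(url, filename):
    try:
        urllib.request.urlretrieve(url, filename)
        return True
    except (HTTPError, URLError, OSError) as e:  # socket.timeout is an OSError subclass
        print('failed:', url, e)
        return False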
UPDATE:
Based on the comments, I print the URL right before downloading it. Each run stops at a different URL:
First trial:
for i in urls:
    print("for i in urls", i)
    if i not in done:
        if i not in failed:
            try:
                download2(i, 'C:/Users/abdelrahman.fathy/Desktop/downloaded/')
                print(i)
                done.append(i)
            except (HTTPError, URLError, OSError) as e:
                print('exception at', i)
                print(e)
                failed.append(i)
for i in urls http://amphenol.co.il/wp-content/uploads/2017/06/iso-9001-1.jpg
exception at http://amphenol.co.il/wp-content/uploads/2017/06/iso-9001-1.jpg
HTTP Error 410: Gone
for i in urls http://apscopower.com/about_us/iso/files/stacks-image-25baec8-606x800.png
Second trial's output:
http://image.hanrun.com/image/94db9804-fb66-41b2-a6c3-5a38c8b9eb69.jpg
for i in urls http://images02.cdn86.net/kps01/M00/2A/9C/wKiAiVRlaxrc7XGAAAFnIr8s0z4434.jpg
http://images02.cdn86.net/kps01/M00/2A/9C/wKiAiVRlaxrc7XGAAAFnIr8s0z4434.jpg
for i in urls http://images02.cdn86.net/kps01/M00/31/EB/wKiAiVR7y_CMgBErAAFz2tipXbA040.jpg
UPDATE 2:
urls = []
for i in df.imgsrc:
    if i.strip() not in urls:
        urls.append(i.strip())
len(urls)
2466
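As an aside, the de-duplication in UPDATE 2 can be written in one pass; a small sketch, assuming df.imgsrc is a pandas Series of URL strings:

# dict.fromkeys drops duplicates while keeping first-seen order
urls = list(dict.fromkeys(s.strip() for s in df.imgsrc))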
Related
I want to access mouser.com using urllib. When I try to fetch the data from the URL, it hangs indefinitely.
Here is the code:

import urllib.error
import urllib.request

try:
    htmls = urllib.request.urlopen("https://www.mouser.com/")
except urllib.error.HTTPError as e:
    print("HTTP ERROR")
except urllib.error.URLError as e:
    print("URL ERROR")
else:
    print(htmls.read().decode("utf-8"))
This piece of code works fine for most URLs, but for some, like Mouser or element14, it doesn't.
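Sites like these often stall or reject requests arriving with the default Python-urllib User-Agent. A sketch of a workaround, under the assumption that bot filtering is the cause (the header string is just an example): send a browser-like User-Agent and a timeout so a blocked request fails quickly instead of hanging.

import urllib.request
import urllib.error

url = "https://www.mouser.com/"
# A browser-like User-Agent; the exact string is only an example.
req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
try:
    with urllib.request.urlopen(req, timeout=15) as resp:
        html = resp.read().decode("utf-8")
except (urllib.error.HTTPError, urllib.error.URLError) as e:
    print("request failed:", e)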
Update
I am trying to define a function for a list of URLs. The function is intended to print an error when a link, or its server, from the original list (job_title_links) is not found:
This is what I've got so far:
from urllib.error import URLError
from urllib.error import HTTPError
from urllib.request import urlopen

job_title_links = ['https://www.salario.com.br/profissao/abacaxicultor-cbo-612510/',
                   'https://www.salario.com.br/profissao/abade-cbo-263105/',
                   'https://www.salario.com.br/profissao/abanador-na-agricultura-cbo-622020/']

def try_url_exist(links):
    for link in job_title_links:
        try:
            html = urlopen(link)
        except HTTPError as e:
            print(e)  # not found url
        except URLError as e:
            print(e)  # server not found

try_url_exist(job_title_links)
However, the function prints HTTP Error 403 for every link, even though the URLs exist.
Console output:
HTTP Error 403: Forbidden
HTTP Error 403: Forbidden
HTTP Error 403: Forbidden
The expected behaviour: the function should do nothing when a URL exists, and should return either the HTTPError or the URLError together with the URL name when it does not.
How could I accomplish this?
By changing urlopen() to requests.get() from the requests library and collecting the working links in an (initially empty) list, the code worked:
import requests

def try_url_exist(links):
    functional_links = []
    for link in links:
        try:
            requests.get(link)
        except requests.exceptions.RequestException as e:
            # connection errors, timeouts, invalid URLs, ...
            print(e)
        else:
            print(link)
            functional_links.append(link)
    return functional_links

functional_links = try_url_exist(job_title_links)
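For what it's worth, requests likely succeeds here because it sends its own User-Agent header, while some sites answer the default Python-urllib agent with 403. Note also that requests.get() does not raise for HTTP error codes by itself; a small sketch of a stricter check that treats non-2xx responses (and timeouts) as failures:

import requests

def url_is_ok(link):
    try:
        response = requests.get(link, timeout=10)
        response.raise_for_status()  # raises HTTPError for 4xx/5xx responses
    except requests.exceptions.RequestException as e:
        print(link, "failed:", e)
        return False
    return True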
I have a list of more than 1000 URLs (used to download reports) saved in a .csv file.
Some of the URLs return a 404 error and I want to remove them from the list.
I managed to write code (for Python 3) to identify which URLs are invalid, shown below. However, I don't know how to remove those URLs from the list automatically, given that there are so many. Thank you!
from urllib.request import urlopen
from urllib.error import HTTPError

try:
    urlopen("url")
except HTTPError as err:
    if err.code == 404:
        print('invalid')
    else:
        raise
You can use another collection to save the 404 URLs (assuming there are fewer 404 URLs than valid ones), then take the set difference:
from urllib.request import urlopen
from urllib.error import HTTPError

exclude_urls = set()
for url in all_urls:
    try:
        urlopen(url)
    except HTTPError as err:
        if err.code == 404:
            exclude_urls.add(url)

valid_urls = set(all_urls) - exclude_urls
Consider that list A has all the URLs. Note that list.remove() modifies the list in place and returns None, so call it without reassigning:
A.remove("invalid_url")
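With more than 1000 URLs, calling .remove() repeatedly gets slow, since each call rescans the list; a short sketch that filters in one pass, assuming invalid_urls is the set of URLs that returned 404:

# Keep only the URLs that are not known to be invalid
valid_urls = [u for u in A if u not in invalid_urls]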
You can do something like this:
from urllib.request import urlopen
from urllib.error import HTTPError

def load_data(csv_name):
    ...

def save_data(data, csv_name):
    ...

links = load_data(csv_name)
new_links = set()
for i in links:
    try:
        urlopen(i)
    except HTTPError as err:
        if err.code == 404:
            print('invalid')
    else:
        new_links.add(i)

save_data(list(new_links), csv_name)
Looping over the links and writing the valid URLs (collected in the else branch) to a new CSV file would be the easiest solution.
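The load_data and save_data stubs above are left open; a minimal sketch of both, assuming a one-column CSV with one URL per row:

import csv

def load_data(csv_name):
    # Read one URL per row from the first column
    with open(csv_name, newline='') as f:
        return [row[0].strip() for row in csv.reader(f) if row]

def save_data(data, csv_name):
    # Write one URL per row
    with open(csv_name, 'w', newline='') as f:
        writer = csv.writer(f)
        for url in data:
            writer.writerow([url])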
This is the code I wrote in Python for opening a URL.
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import time
import requests
from random import randint
import urllib.parse

class AmazonReviews():
    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    def open_url(self, url):
        values = {}
        data = urllib.parse.urlencode(values).encode('utf-8')
        req = urllib.request.Request(url, data, self.headers)
        response = urllib.request.urlopen(req)
        html = response.read()
        return html

    def fetch_reviews(self, all_reviews_link):
        try:
            url = "https://www.amazon.in" + all_reviews_link
            print(url)
            html = self.open_url(url)
        except HTTPError as e:
            print(e)

review = AmazonReviews()
review.fetch_reviews('/gp/profile/amzn1.account.AFBWOEM2CWLC7ZRQ7WK6FQYXH6AA/ref=cm_cr_arp_d_gw_btm?ie=UTF8')
I am passing the URL like this because in the main project the URL is scraped from an href attribute, which gives the relative path.
If there is any method to get the absolute URL, please suggest it.
Output -
https://www.amazon.in/gp/profile/amzn1.account.AFBWOEM2CWLC7ZRQ7WK6FQYXH6AA/ref=cm_cr_arp_d_gw_btm?ie=UTF8
HTTP Error 404: NotFound
Link of the code
https://onlinegdb.com/SyFPXzWVI
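As for the side question of building an absolute URL from a scraped relative href, urllib.parse.urljoin is the standard tool; a small sketch (the base page URL below is just a placeholder):

from urllib.parse import urljoin

base = "https://www.amazon.in/some/page"  # URL of the page the href was scraped from
href = "/gp/profile/amzn1.account.AFBWOEM2CWLC7ZRQ7WK6FQYXH6AA/ref=cm_cr_arp_d_gw_btm?ie=UTF8"
absolute = urljoin(base, href)
print(absolute)  # https://www.amazon.in/gp/profile/...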
Use Selenium instead:
from selenium import webdriver
import os
browser = webdriver.Chrome(executable_path=os.path.abspath(os.getcwd()) + "/chromedriver")
link = "https://www.amazon.in/gp/profile/amzn1.account.AFBWOEM2CWLC7ZRQ7WK6FQYXH6AA/ref=cm_cr_arp_d_gw_btm?ie=UTF8"
browser.get(link)
name = browser.find_element_by_xpath('//*[@id="customer-profile-name-header"]/div[2]/span').text
Output:
Dheeraj Malhotra
I am trying to get the HTML source by using urllib.request.urlopen.
I don't know why my code does not stop.
from urllib.request import urlopen

url = "https://google.com"
try:
    page = urlopen(url)
    f = open("google.html", "wb")
    f.write(page.read())
except:
    print("Error")
But with a timeout, it receives the source:
from urllib.request import urlopen

url = "https://google.com"
try:
    page = urlopen(url, timeout=0.1)
    f = open("google.html", "wb")
    f.write(page.read())
except:
    print("Error")