I am working on a project where I need to scrape images off the web. To do this, I write the image links to a file and then download each of them to a folder with requests. At first I used Google as the scrape site, but due to several reasons I have decided that Wikipedia is a much better alternative. However, after my first attempt many of the images couldn't be opened, so I tried again, this time downloading each image to a name whose ending matched the ending of its link. More images could be accessed like this, but many still couldn't be opened. When I tested downloading the images myself (individually, outside of the function), they downloaded perfectly, and when I then used my function to download them, they kept downloading correctly (i.e. I could access them). I am not sure if it is important, but the image endings I generally come across are .svg.png and .png. I want to know why this is occurring and what I may be able to do to prevent it. I have left some of my code below. Thank you.
Function:
def download_images(file):
    object = file[0:file.index("IMAGELINKS") - 1]
    folder_name = object + "_images"
    dir = os.path.join("math_obj_images/original_images/", folder_name)
    if not os.path.exists(dir):
        os.mkdir(dir)
    with open("math_obj_image_links/" + file, "r") as f:
        count = 1
        for line in f:
            try:
                if line[len(line) - 1] == "\n":
                    line = line[:len(line) - 1]
                if line[0] != "/":
                    last_chunk = line.split("/")[len(line.split("/")) - 1]
                    endings = last_chunk.split(".")[1:]
                    image_ending = ""
                    for ending in endings:
                        image_ending += "." + ending
                    if image_ending == "":
                        continue
                    with open("math_obj_images/original_images/" + folder_name + "/" + object + str(count) + image_ending, "wb") as f:
                        f.write(requests.get(line).content)
                    file = object + "_IMAGEENDINGS.txt"
                    path = "math_obj_image_endings/" + file
                    with open(path, "a") as f:
                        f.write(image_ending + "\n")
                    count += 1
            except:
                continue
    f.close()
Doing this outside of the function worked:
with open("test" + image_ending, "wb") as f:
    f.write(requests.get(line).content)
Example of image link file:
https://upload.wikimedia.org/wikipedia/commons/thumb/6/63/Triangle.TrigArea.svg/120px-Triangle.TrigArea.svg.png
https://upload.wikimedia.org/wikipedia/commons/thumb/c/c9/Square_%28geometry%29.svg/120px-Square_%28geometry%29.svg.png
https://upload.wikimedia.org/wikipedia/commons/thumb/3/33/Hexahedron.png/120px-Hexahedron.png
https://upload.wikimedia.org/wikipedia/commons/thumb/2/22/Hypercube.svg/110px-Hypercube.svg.png
https://wikimedia.org/api/rest_v1/media/math/render/svg/5f8ab564115bf2f7f7d12a9f873d9c6c7a50190e
https://en.wikipedia.org/wiki/Special:CentralAutoLogin/start?type=1x1
https:/static/images/footer/wikimedia-button.png
https:/static/images/footer/poweredby_mediawiki_88x31.png
If all the files are indeed in PNG format and the suffix is always .png, you could try something like this:
import requests
from pathlib import Path
u1 = "https://upload.wikimedia.org/wikipedia/commons/thumb/6/63/Triangle.TrigArea.svg/120px-Triangle.TrigArea.svg.png"
r = requests.get(u1)
Path('u1.png').write_bytes(r.content)
My previous answer works for PNGs only.
For SVG files you need to check whether the file contents start with the string "<svg" and create a file with the .svg suffix.
The code below saves the downloaded files in the "downloads" subdirectory.
import requests
from pathlib import Path

# urls are stored in a file 'urls.txt'.
Path('downloads').mkdir(exist_ok=True)   # make sure the "downloads" subdirectory exists
with open('urls.txt') as f:
    for i, url in enumerate(f.readlines()):
        url = url.strip()   # MUST strip the line-ending char(s)!
        try:
            content = requests.get(url).content
        except:
            print('Cannot download url:', url)
            continue
        # Check if this is an SVG file
        # Note that content is bytes hence the b in b'<svg'
        if content.startswith(b'<svg'):
            ext = 'svg'
        elif url.endswith('.png'):
            ext = 'png'
        else:
            print('Cannot process contents of url:', url)
            continue   # skip urls that are neither SVG nor PNG
        Path('downloads', f'url{i}.{ext}').write_bytes(content)   # reuse the bytes already fetched
Contents of the urls.txt file:
(the last url is an svg)
https://upload.wikimedia.org/wikipedia/commons/thumb/6/63/Triangle.TrigArea.svg/120px-Triangle.TrigArea.svg.png
https://upload.wikimedia.org/wikipedia/commons/thumb/c/c9/Square_%28geometry%29.svg/120px-Square_%28geometry%29.svg.png
https://upload.wikimedia.org/wikipedia/commons/thumb/3/33/Hexahedron.png/120px-Hexahedron.png
https://upload.wikimedia.org/wikipedia/commons/thumb/2/22/Hypercube.svg/110px-Hypercube.svg.png
https://wikimedia.org/api/rest_v1/media/math/render/svg/5f8ab564115bf2f7f7d12a9f873d9c6c7a50190e
My code is working correctly: it scours a directory of PDFs, downloads the weblinks embedded within those PDFs, and sequentially names them with the appropriate file extension.
That being said, I am getting a few random files that download but DON'T have an extension associated with them. In doing quality checks, I have all the attachments that matter; these extra files are truly garbage.
Is there a way to avoid downloading them, or to build a check into the code, so that I don't end up with these phantom files?
#!/usr/bin/env python3

import os
import glob
import pdfx
import wget
import urllib.parse
import requests

## Accessing and Creating Six Digit File Code
pdf_dir = "./"
pdf_files = glob.glob("%s/*.pdf" % pdf_dir)

for file in pdf_files:
    ## Identify File Name and Limit to Digits
    filename = os.path.basename(file)
    newname = filename[0:6]

    ## Run PDFX to identify and download links
    pdf = pdfx.PDFx(filename)
    url_list = pdf.get_references_as_dict()
    attachment_counter = 1

    for x in url_list["url"]:
        if x[0:4] == "http":
            parsed_url = urllib.parse.quote(x)
            extension = os.path.splitext(x)[1]
            r = requests.get(x)
            with open('temporary', 'wb') as f:
                f.write(r.content)
            ## Concatenate File Name Once Downloaded
            os.rename('./temporary', str(newname) + '_attach' + str(attachment_counter) + str(extension))
            ## Increase Attachment Count
            attachment_counter += 1

    for x in url_list["pdf"]:
        parsed_url = urllib.parse.quote(x)
        extension = os.path.splitext(x)[1]
        r = requests.get(x)
        with open('temporary', 'wb') as f:
            f.write(r.content)
        ## Concatenate File Name Once Downloaded
        os.rename('./temporary', str(newname) + '_attach' + str(attachment_counter) + str(extension))
        ## Increase Attachment Count
        attachment_counter += 1
It's not clear which part of your code produces these "phantom" files, but anywhere you want to avoid downloading a file that doesn't have an extension, you can make the download conditional: if the component after the last slash doesn't contain a dot, do nothing.
if '.' in x.split('/')[-1]:
    ...  # download(x) etc.
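A slightly fuller sketch of how that check might be folded into your download step, assuming the phantom files come from links whose last path component has no dot; the helper name download_attachment is just for illustration, while x, newname, attachment_counter, and the 'temporary' file come from your code:
import os
import requests

def download_attachment(x, newname, attachment_counter):
    # Skip links whose last path component has no dot, i.e. no file extension.
    if '.' not in x.split('/')[-1]:
        return False
    extension = os.path.splitext(x)[1]
    r = requests.get(x)
    with open('temporary', 'wb') as f:
        f.write(r.content)
    os.rename('./temporary', str(newname) + '_attach' + str(attachment_counter) + str(extension))
    return True
In each of your loops you would then call download_attachment(x, newname, attachment_counter) and only increment attachment_counter when it returns True.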
I have a list of URLs, which direct to filings from the SEC (e.g., https://www.sec.gov/Archives/edgar/data/18651/000119312509042636/d10k.htm)
My goal is to write a for loop that opens the URLs, requests each document, and saves it to a folder.
However, I need to be able to identify the documents later. That's why I wanted to use the filing-specific number in "https://www.sec.gov/Archives/edgar/data/18651/000119312509042636/d10k.htm" as the document name.
directory = r"\Desktop\10ks"

for url in url_list:
    response = requests.get(url).content
    path = (directory + str(url)[40:-5] + ".txt")
    with open(path, "w") as f:
        f.write(response)
        f.close()
But every time, I get the following error message: FileNotFoundError: [Errno 2] No such file or directory
I really hope you can help me out!!
Thanks
import requests
import os

url_list = ["https://www.sec.gov/Archives/edgar/data/18651/000119312509042636/d10k.htm"]

# Create the path Desktop/10ks/
directory = os.path.expanduser("~/Desktop") + "\\10ks"

for url in url_list:
    # Get the content as string instead of getting it as bytes
    response = requests.get(url).text
    # Replace slash in filename with underscore
    filename = str(url)[40:-5].replace("/", "_")
    # print filename to check if it is correct
    print(filename)
    path = (directory + "\\" + filename + ".txt")
    with open(path, "w") as f:
        f.write(response)
        f.close()
See comments.
I guess backslashes in filenames are not allowed, since
filename = str(url)[40:-5].replace("/", "\\")
gives me
FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\user/Desktop\\10ks\\18651\\000119312509042636\\d10.txt'
See also:
https://docs.python.org/3/library/os.path.html#os.path.expanduser
Get request python as a string
https://docs.python.org/3/library/stdtypes.html#str.replace
This works
for url in url_list:
    response = requests.get(url).content.decode('utf-8')
    path = (directory + str(url)[40:-5] + ".txt").replace('/', '\\')
    with open(path, "w+") as f:
        f.write(response)
        f.close()
The path that you were building was something like \\Desktop\\10ks18651/000119312509042636/d10.txt. I suppose you are working on Windows, judging by those backslashes; anyway, you just need to replace the slashes coming in the URL with backslashes.
Another thing: write receives a string, so you need to decode your response, which comes as bytes, into a string.
I hope this helps you!
I have a long list of .json files that I need to download to my computer. I want to download them as .json files (so no parsing or anything like that at this point).
I have some code that works for small files, but it is pretty buggy. Also it doesn't handle multiple links well.
Appreciate any advice to fix up this code:
import os
filename = 'test.json'
path = "C:/Users//Master"
fullpath = os.path.join(path, filename)
import urllib2
url = 'https://www.premierlife.com/secure/index.json'
response = urllib2.urlopen(url)
webContent = response.read()
f = open(fullpath, 'w')
f.write(webContent)
f.close
It's creating a blank file because the f.close at the end should be f.close().
I took your code, made it into a little function, and then called it in a loop that goes through a .txt file called "list_of_urls.txt" containing the list of URLs, one URL per line (you can change the delimiter in the split function if you want to format it differently).
def save_json(url):
    import os
    filename = url.replace('/', '').replace(':', '')
    # this replaces / and : in urls
    path = "C:/Users/Master"
    fullpath = os.path.join(path, filename)
    import urllib2
    response = urllib2.urlopen(url)
    webContent = response.read()
    f = open(fullpath, 'w')
    f.write(webContent)
    f.close()
And then the loop:
f = open('list_of_urls.txt')
p = f.read()
url_list = p.split('\n')   # here's where \n is the line break delimiter that can be changed

for url in url_list:
    save_json(url)
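Note that urllib2 exists only on Python 2; on Python 3 roughly the same function can be written with urllib.request. A minimal, untested sketch (same C:/Users/Master path as above):
import os
import urllib.request

def save_json(url):
    # Strip characters that are not allowed in Windows filenames.
    filename = url.replace('/', '').replace(':', '')
    path = "C:/Users/Master"
    fullpath = os.path.join(path, filename)
    with urllib.request.urlopen(url) as response:
        web_content = response.read()
    # Write bytes so no decoding/encoding guesswork is needed.
    with open(fullpath, 'wb') as f:
        f.write(web_content)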
I'm trying to create a Python function that does the same thing as this wget command:
wget -c --read-timeout=5 --tries=0 "$URL"
-c - Continue from where you left off if the download is interrupted.
--read-timeout=5 - If there is no new data coming in for over 5 seconds, give up and try again. Given -c, this means it will try again from where it left off.
--tries=0 - Retry forever.
Those three arguments used in tandem result in a download that cannot fail.
I want to duplicate those features in my Python script, but I don't know where to begin...
There is also a nice Python module named wget that is pretty easy to use. Keep in mind that the package has not been updated since 2015 and has not implemented a number of important features, so it may be better to use other methods. It depends entirely on your use case. For simple downloading, this module is the ticket. If you need to do more, there are other solutions out there.
>>> import wget
>>> url = 'http://www.futurecrew.com/skaven/song_files/mp3/razorback.mp3'
>>> filename = wget.download(url)
100% [................................................] 3841532 / 3841532
>>> filename
'razorback.mp3'
Enjoy.
However, if wget doesn't work (I've had trouble with certain PDF files), try this solution.
Edit: You can also use the out parameter to use a custom output directory instead of the current working directory.
>>> output_directory = <directory_name>
>>> filename = wget.download(url, out=output_directory)
>>> filename
'razorback.mp3'
urllib.request should work.
Just set it up in a while (not done) loop: check whether a local file already exists, and if it does, send a GET with a Range header specifying how far you got in downloading it.
Be sure to use read() to append to the localfile until an error occurs.
This is also potentially a duplicate of Python urllib2 resume download doesn't work when network reconnects
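A rough, untested sketch of that idea with urllib.request, assuming the server honors Range requests (the function name resume_download is made up here):
import os
import urllib.error
import urllib.request

def resume_download(url, localfile):
    # Keep re-requesting the remainder of the file until one pass finishes without an error.
    while True:
        start = os.path.getsize(localfile) if os.path.exists(localfile) else 0
        req = urllib.request.Request(url, headers={'Range': 'bytes=%d-' % start})
        try:
            with urllib.request.urlopen(req, timeout=5) as response, open(localfile, 'ab') as f:
                while True:
                    chunk = response.read(64 * 1024)
                    if not chunk:          # empty read -> the server has finished sending
                        return localfile
                    f.write(chunk)
        except urllib.error.HTTPError as e:
            if e.code == 416:              # requested range not satisfiable -> file already complete
                return localfile
            continue
        except (urllib.error.URLError, OSError):
            continue                       # timeout / dropped connection: loop and resume from current size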
I had to do something like this on a version of linux that didn't have the right options compiled into wget. This example is for downloading the memory analysis tool 'guppy'. I'm not sure if it's important or not, but I kept the target file's name the same as the url target name...
Here's what I came up with:
python -c "import requests; r = requests.get('https://pypi.python.org/packages/source/g/guppy/guppy-0.1.10.tar.gz') ; open('guppy-0.1.10.tar.gz' , 'wb').write(r.content)"
That's the one-liner; here it is a little more readable:
import requests
fname = 'guppy-0.1.10.tar.gz'
url = 'https://pypi.python.org/packages/source/g/guppy/' + fname
r = requests.get(url)
open(fname , 'wb').write(r.content)
This worked for downloading a tarball. I was able to extract and use the package after downloading it.
EDIT:
To address a question, here is an implementation with a progress bar printed to STDOUT. There is probably a more portable way to do this without the clint package, but this was tested on my machine and works fine:
#!/usr/bin/env python
from clint.textui import progress
import requests
fname = 'guppy-0.1.10.tar.gz'
url = 'https://pypi.python.org/packages/source/g/guppy/' + fname
r = requests.get(url, stream=True)
with open(fname, 'wb') as f:
    total_length = int(r.headers.get('content-length'))
    for chunk in progress.bar(r.iter_content(chunk_size=1024), expected_size=(total_length/1024) + 1):
        if chunk:
            f.write(chunk)
            f.flush()
A solution that I often find simpler and more robust is to simply execute a terminal command within python. In your case:
import os
url = 'https://www.someurl.com'
os.system(f'wget -c --read-timeout=5 --tries=0 "{url}"')
import urllib2
import time

max_attempts = 80
attempts = 0
sleeptime = 10   # in seconds, no reason to continuously try if network is down

#while True:   # Possibly Dangerous
while attempts < max_attempts:
    time.sleep(sleeptime)
    try:
        response = urllib2.urlopen("http://example.com", timeout=5)
        content = response.read()
        f = open("local/index.html", 'w')
        f.write(content)
        f.close()
        break
    except urllib2.URLError as e:
        attempts += 1
        print(type(e))
For Windows and Python 3.x, my two cents contribution about renaming the file on download :
Install wget module : pip install wget
Use wget :
import wget
wget.download('Url', 'C:\\PathToMyDownloadFolder\\NewFileName.extension')
Truly working command line example:
python -c "import wget; wget.download(""https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.17.2.tar.xz"", ""C:\\Users\\TestName.TestExtension"")"
Note: 'C:\\PathToMyDownloadFolder\\NewFileName.extension' is not mandatory. By default, the file is not renamed and is downloaded to your current working directory.
Here's the code adapted from the torchvision library:
import os
import urllib.request
import urllib.error


def download_url(url, root, filename=None):
    """Download a file from a url and place it in root.

    Args:
        url (str): URL to download file from
        root (str): Directory to place downloaded file in
        filename (str, optional): Name to save the file under. If None, use the basename of the URL
    """
    root = os.path.expanduser(root)
    if not filename:
        filename = os.path.basename(url)
    fpath = os.path.join(root, filename)

    os.makedirs(root, exist_ok=True)

    try:
        print('Downloading ' + url + ' to ' + fpath)
        urllib.request.urlretrieve(url, fpath)
    except (urllib.error.URLError, IOError) as e:
        if url[:5] == 'https':
            url = url.replace('https:', 'http:')
            print('Failed download. Trying https -> http instead.'
                  ' Downloading ' + url + ' to ' + fpath)
            urllib.request.urlretrieve(url, fpath)
If you are OK with taking a dependency on the torchvision library, you can also simply do:
from torchvision.datasets.utils import download_url
download_url('http://something.com/file.zip', '~/my_folder')
Let me improve the example with threads, in case you want to download many files.
import math
import random
import threading

import requests
from clint.textui import progress

# You must define a proxy list
# I suggest https://free-proxy-list.net/
proxies = {
    0: {'http': 'http://34.208.47.183:80'},
    1: {'http': 'http://40.69.191.149:3128'},
    2: {'http': 'http://104.154.205.214:1080'},
    3: {'http': 'http://52.11.190.64:3128'}
}

# You must define the list of files you want to download
videos = [
    "https://i.stack.imgur.com/g2BHi.jpg",
    "https://i.stack.imgur.com/NURaP.jpg"
]

downloaderses = list()

def downloaders(video, selected_proxy):
    print("Downloading file named {} by proxy {}...".format(video, selected_proxy))
    r = requests.get(video, stream=True, proxies=selected_proxy)
    nombre_video = video.split("/")[3]
    with open(nombre_video, 'wb') as f:
        total_length = int(r.headers.get('content-length'))
        for chunk in progress.bar(r.iter_content(chunk_size=1024), expected_size=(total_length / 1024) + 1):
            if chunk:
                f.write(chunk)
                f.flush()

for video in videos:
    selected_proxy = proxies[math.floor(random.random() * len(proxies))]
    t = threading.Thread(target=downloaders, args=(video, selected_proxy))
    downloaderses.append(t)

for _downloaders in downloaderses:
    _downloaders.start()
easy as py:
class Downloder():
    def download_manager(self, url, destination='Files/DownloderApp/', try_number="10", time_out="60"):
        #threading.Thread(target=self._wget_dl, args=(url, destination, try_number, time_out)).start()
        if self._wget_dl(url, destination, try_number, time_out) == 0:
            return True
        else:
            return False

    def _wget_dl(self, url, destination, try_number, time_out):
        import subprocess
        command = ["wget", "-c", "-P", destination, "-t", try_number, "-T", time_out, url]
        try:
            download_state = subprocess.call(command)
        except Exception as e:
            print(e)
        # if download_state == 0 => successful download
        return download_state
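A hypothetical call could look like this; it relies on the wget binary being available on your PATH, since the class simply shells out to it:
downloader = Downloder()
ok = downloader.download_manager("https://example.com/file.zip")   # saved under Files/DownloderApp/ by default
print("Download succeeded" if ok else "Download failed")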
TensorFlow makes life easier. The returned file path gives us the location of the downloaded file.
import tensorflow as tf

file_path = tf.keras.utils.get_file(origin='https://storage.googleapis.com/tf-datasets/titanic/train.csv',
                                    fname='train.csv',
                                    untar=False, extract=False)