I'm writing a downloader that splits a URL into parts and downloads them with threading. I will probably not use join, because join means no streaming (the file cannot be written while the threads are still running).
The problem is that f.seek and write produce a really weird output file: it is full of NUL characters (viewed in Notepad++), and the text in it is only about 1/3 of the whole file.
Hi everybody, and thank you all for helping me. Here is version 2.0 of my code. Thanks to Padraic Cunningham for his suggestion and explanation; I fixed my code almost exactly as he suggested:
So please help me check the code; I also think I need your help to convert it to an http.server file-streaming method:
import os, requests
import threading
import urllib3
import urllib.request, urllib.error, urllib.parse
import time
import re

pool = urllib3.PoolManager(maxsize=10)
URL = "https://raw.githubusercontent.com/langpavel/tampermonkey/master/src/emulation.js"
fileName = "1.js"
countsize = 0

def defwrite(filename, data, offset):
    f = open(filename, 'wb')
    f.seek(offset)
    f.write(data)
    f.close()

def buildRange(url, numsplits):
    global pool
    value = int(requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None))
    print("Fullsize: ", value)
    print("Try divide by 3:", value / 3)
    lst = []
    for i in range(numsplits):
        if i == range(numsplits):
            lst.append('%s-%s' % (i * value//numsplits + 1, i * value//numsplits + 1 + (value - (i * value//numsplits + 1))))
        if i == 0:
            lst.append('%s-%s' % (0, value//numsplits))
        else:
            lst.append('%s-%s' % (i * value//numsplits + 1, (i + 1) * value//numsplits))
    return lst

def main(url=None, splitBy=3):
    global fileName, pool, countsize
    start_time = time.time()
    if not url:
        print("Please Enter some url to begin download.")
        return
    dataDict = {}
    f = open(fileName, 'wb')
    # split total num bytes into ranges
    ranges = buildRange(url, splitBy)
    print(ranges)

    def downloadChunk(idx, irange):
        print(idx)
        headers = urllib3._collections.HTTPHeaderDict()
        headers.add('Range', 'bytes=' + str(irange))
        data = pool.urlopen('GET', URL, headers=headers).data
        offset = int(re.sub("(^.*?)-(.*?)$", "\\1", irange))
        print(offset)
        f.seek(offset, 0)
        f.write(data)
        countsize = countsize + offset

    # create one downloading thread per chunk
    downloaders = [
        threading.Thread(
            target=downloadChunk,
            args=(idx, irange),
        )
        for idx, irange in enumerate(ranges)
    ]
    # start threads, let run in parallel
    for th in downloaders:
        th.start()
    # join is deliberately not used, so the main thread does not wait:
    #for th in downloaders:
    #    th.join()
    print(countsize)

if __name__ == '__main__':
    if os.path.exists(fileName):
        os.remove(fileName)
    main(URL, splitBy=16)
Here is my original code; please help me fix it. This is version 1.0; ignore it, version 2.0 is above:
import os, requests
import threading
import urllib3
import urllib.request, urllib.error, urllib.parse
import time
import re

pool = urllib3.PoolManager(maxsize=10)
URL = "https://raw.githubusercontent.com/langpavel/tampermonkey/master/src/emulation.js"
fileName = "1.js"

def defwrite(filename, data, offset):
    f = open(filename, 'wb')
    f.seek(offset)
    f.write(data)
    f.close()

def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        if i == range(numsplits):
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0), 0)), int(value - round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
        if i == 0:
            lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
        else:
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0), 0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
    return lst

def main(url=None, splitBy=3):
    global fileName, pool
    start_time = time.time()
    if not url:
        print("Please Enter some url to begin download.")
        return
    sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
    print("%s bytes to download." % sizeInBytes)
    if not sizeInBytes:
        print("Size cannot be determined.")
        return
    dataDict = {}
    # split total num bytes into ranges
    ranges = buildRange(int(sizeInBytes), splitBy)

    def downloadChunk(idx, irange):
        print(idx)
        headers = urllib3._collections.HTTPHeaderDict()
        headers.add('Range', 'bytes=' + str(irange))
        data = pool.urlopen('GET', URL, headers=headers).data
        print(data)
        print("finish: " + str(irange))
        offset = int(re.sub("(^.*?)-(.*?)$", "\\1", irange))
        f = open(fileName, 'wb')
        f.seek(offset)
        f.write(data)

    # create one downloading thread per chunk
    downloaders = [
        threading.Thread(
            target=downloadChunk,
            args=(idx, irange),
        )
        for idx, irange in enumerate(ranges)
    ]
    # start threads, let run in parallel
    for th in downloaders:
        th.start()
    #for th in downloaders:
    #    th.join()

if __name__ == '__main__':
    main(URL, splitBy=3)
You use three threads whose target function is downloadChunk, and you open the file three times using wb, which truncates it each time, so you end up with only 1/3 of the content. You also call seek for no apparent reason. If you wanted to append to a file you would open it using a each time, or just open the file once outside the function.
You are seeking into an empty file and then writing, so that is where the null bytes come from.
If you want to open a file for reading and writing, so you can seek, with line buffering:
with open("whatever.file", "r+b", buffering=1) as f
Then use that file to write to; don't keep opening it in the function and overwriting it. The file must also exist.
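Here is a minimal sketch of that suggestion, assuming Python 3 and a server that honours Range requests; download_ranged, the lock, and the four-way split are my own illustration, not the answer's code:
import threading
import urllib.request

def download_ranged(url, dest, num_parts=4):
    # HEAD request to learn the total size, so the file can be pre-allocated
    head = urllib.request.Request(url, method="HEAD")
    size = int(urllib.request.urlopen(head).headers["Content-Length"])

    # create the file up front and pad it to full size: "r+b" requires
    # an existing file, and pre-allocating avoids surprise NUL gaps
    with open(dest, "wb") as f:
        f.truncate(size)

    lock = threading.Lock()
    with open(dest, "r+b") as f:
        def fetch(start, end):
            req = urllib.request.Request(
                url, headers={"Range": "bytes=%d-%d" % (start, end)})
            data = urllib.request.urlopen(req).read()
            with lock:  # seek+write must be atomic per chunk
                f.seek(start)
                f.write(data)

        part = size // num_parts
        threads = [
            threading.Thread(
                target=fetch,
                args=(i * part,
                      size - 1 if i == num_parts - 1 else (i + 1) * part - 1))
            for i in range(num_parts)
        ]
        for t in threads:
            t.start()
        for t in threads:  # only so the file is not closed mid-write
            t.join()
The join at the end exists only so the with block does not close the file while threads are still writing; a fully streaming design would need a different completion signal.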
Related
I am assembling a script to fingerprint up to 8 TB of directories with over 1 million files (including some files of ~50 GB), and export the result into a .csv, such as "md5","LastWriteTime","filesize","fullpath\file.ext":
"md5","YYYYMMDDHHMMSS","12345","A:\aaa\bb\c\file1.ext"
I am stuck with the coding, and the output .csv comes out empty:
def md5(fname):
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(2 ** 20), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

def getSize(filename):
    st = os.stat(filename)
    return st.st_size()

with open('md5_filelist.csv', 'w') as md5_filelist:
    file.write('hash_md5.hexdigest','timestamp','st.st_size','os.path.abspath')
What are the bits I am doing wrong? (I am new to Python.) Thank you.
Try This Again:
import hashlib
import os
import time

your_target_folder = "."

def get_size(filename):
    st = os.stat(filename)
    return str(st.st_size)

def get_last_write_time(filename):
    st = os.stat(filename)
    convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
    return convert_time_to_human_readable

def get_md5(fname):
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(2 ** 20), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

for dirpath, _, filenames in os.walk(your_target_folder):
    for items in filenames:
        file_full_path = os.path.abspath(os.path.join(dirpath, items))
        try:
            my_last_data = get_md5(file_full_path) + ", " + get_last_write_time(file_full_path) + ", " + get_size(file_full_path) + ", " + file_full_path + "\n"
            with open("md5_filelist.csv", "a") as my_save_file:
                my_save_file.write(my_last_data)
            print(str(file_full_path) + " ||| Done")
        except:
            print("Error On " + str(file_full_path))
I changed the full-path-address method, and I added time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime)) to convert that time to a human-readable format.
Good luck...
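If the quoted format from the question ("md5","timestamp","size","path") is required, the csv module handles the quoting and any commas inside paths. A sketch reusing the helper functions above (csv.QUOTE_ALL and the column order are my assumptions):
import csv
import os

# one quoted row per file, reusing get_md5 / get_last_write_time / get_size
with open("md5_filelist.csv", "w", newline="") as out:
    writer = csv.writer(out, quoting=csv.QUOTE_ALL)
    for dirpath, _, filenames in os.walk(your_target_folder):
        for name in filenames:
            full_path = os.path.abspath(os.path.join(dirpath, name))
            try:
                writer.writerow([get_md5(full_path),
                                 get_last_write_time(full_path),
                                 get_size(full_path),
                                 full_path])
            except (OSError, IOError) as exc:  # unreadable file: log and go on
                print("Error on %s: %s" % (full_path, exc))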
I have 2 big text files (right now 17 MB, but they could grow to GBs); I don't want to load them into RAM because their size could exceed my RAM capacity.
The code I have written so far is this:
def stopIfFileExist(filename):
    if os.path.isfile(filename):
        raise Exception("%s already exist" % filename)

def compareDump(before_filename, after_filename, diff_filename):
    """
    Compare 2 dumps generated via makeDump(output_filename) and generate
    a file containing the differences
    -before_filename : (string) filename of the first dump
    -after_filename : (string) filename of the second dump
    -diff_filename : (string) filename of the diff
    """
    stopIfFileExist(diff_filename)
    num_lines = sum(1 for line in open(after_filename))
    one_percent = num_lines/float(100)
    diff = []
    start = time.time()
    with open(after_filename, "r") as afterFile:
        counter = 0
        for a_line in afterFile:
            print "completion : %.9f percents" % (counter/float(one_percent))
            counter = counter + 1
            diff.append(a_line)
            with open(before_filename, "r") as beforeFile:
                for b_line in beforeFile:
                    if a_line.rstrip() == b_line.rstrip():
                        diff.pop()
                        break
    end = time.time()
    print "task completed in %s seconds" % (end - start)
    with open(diff_filename, "a") as diffFile:
        for line in diff:
            diffFile.write(line)
What I'd like to do is remove from beforeFile a line that was successfully compared (e.g., when the if a_line.rstrip() == b_line.rstrip(): branch is triggered).
However, since I am currently reading the file, I don't see how to do it.
Any ideas?
Thanks.
I was able to diff two 20-megabyte files in a little over 3 minutes using the following test code.
Every 10,000 lines I insert a random number, which you can see diff'd in the results.
import random
import difflib
import os
import time

start = time.time()
NUM_LINES = int(10000000 / 4)

t1 = 'test1'
t2 = 'test2'
if os.path.exists(t1):
    os.remove(t1)
if os.path.exists(t2):
    os.remove(t2)

with open(t1, 'w+') as f1:
    for number in range(1, NUM_LINES):
        if number % 10000 == 0:
            r = random.randint(1, number)
        else:
            r = 1
        f1.write(str(number * r) + '\n')
    else:
        f1.seek(0)
    with open(t2, 'w+') as f2:
        for number in range(1, NUM_LINES):
            if number % 10000 == 0:
                r = random.randint(1, number)
            else:
                r = 1
            f2.write(str(number * r) + '\n')
        else:
            f2.seek(0)
        t1 = f1.readlines()
        t2 = f2.readlines()

for l in difflib.unified_diff(t1, t2, lineterm=''):
    print(l.strip())

print('Execution took: {:.2f} seconds'.format(time.time() - start))
I pasted the output on GitHub, as it is obscenely long.
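Note that readlines() still pulls both files fully into memory, which is fine at 20 MB but not for the multi-GB case in the question. A memory-lighter sketch of the same before/after comparison, storing only a per-line digest (the names are mine; it assumes a hash collision is acceptably unlikely):
import hashlib

def line_key(line):
    # a 16-byte digest per line instead of the line itself keeps
    # memory roughly constant even for very long lines
    return hashlib.md5(line.rstrip(b"\r\n")).digest()

def diff_streaming(before_filename, after_filename, diff_filename):
    seen = set()
    with open(before_filename, "rb") as before:
        for line in before:
            seen.add(line_key(line))
    # stream the "after" file and keep only lines never seen in "before"
    with open(after_filename, "rb") as after, \
         open(diff_filename, "wb") as out:
        for line in after:
            if line_key(line) not in seen:
                out.write(line)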
This is my first post. I have been doing Python programming for quite some time, and recently I was working on a multi-threaded downloader. The problem is that my file (a jpg is my target) gets corrupted. Also, with the following input: http://www.aumathletics.com/images_web/headerAUMLogo.jpg
it shows an error,
while with the input:
http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg
the file gets corrupted.
Here is the code:
import os, sys, requests
import threading
import urllib2
import time

URL = sys.argv[1]

def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        if i == 0:
            lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
        else:
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0), 0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
    return lst

def main(url=None, splitBy=5):
    start_time = time.time()
    if not url:
        print "Please Enter some url to begin download."
        return
    fileName = "image.jpg"
    sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
    print "%s bytes to download." % sizeInBytes
    if not sizeInBytes:
        print "Size cannot be determined."
        return
    dataDict = {}
    # split total num bytes into ranges
    ranges = buildRange(int(sizeInBytes), splitBy)

    def downloadChunk(idx, irange):
        req = urllib2.Request(url)
        req.headers['Range'] = 'bytes={}'.format(irange)
        dataDict[idx] = urllib2.urlopen(req).read()

    # create one downloading thread per chunk
    downloaders = [
        threading.Thread(
            target=downloadChunk,
            args=(idx, irange),
        )
        for idx, irange in enumerate(ranges)
    ]
    # start threads, let run in parallel, wait for all to finish
    for th in downloaders:
        th.start()
    for th in downloaders:
        th.join()

    print 'done: got {} chunks, total {} bytes'.format(
        len(dataDict), sum((
            len(chunk) for chunk in dataDict.values()
        ))
    )
    print "--- %s seconds ---" % str(time.time() - start_time)

    if os.path.exists(fileName):
        os.remove(fileName)

    # reassemble file in correct order
    with open(fileName, 'w') as fh:
        for _idx, chunk in sorted(dataDict.iteritems()):
            fh.write(chunk)

    print "Finished Writing file %s" % fileName
    print 'file size {} bytes'.format(os.path.getsize(fileName))

if __name__ == '__main__':
    main(URL)
The indentation here might be wrong, so here is the code: pastebin(dot)com/wGEkp878
I would be very grateful if someone could point out the error.
EDIT: as suggested by a guy:
def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        first = i if i == 0 else buildRange().start(i, value, numsplits)
        second = buildRange().end(i, value, numsplits)
        lst.append("{}-{}".format(first, second))
    return lst
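(That snippet does not run as written: buildRange() calls itself with no arguments, and .start/.end do not exist. For reference, a range builder that covers every byte exactly once could look like this sketch; the inclusive end, one less than the start of the next chunk, is the key detail for HTTP Range headers:)
def build_range(value, numsplits):
    # split [0, value) into contiguous inclusive byte ranges,
    # e.g. value=10, numsplits=3 -> ['0-2', '3-5', '6-9']
    lst = []
    for i in range(numsplits):
        start = i * value // numsplits
        if i == numsplits - 1:
            end = value - 1
        else:
            end = (i + 1) * value // numsplits - 1
        lst.append('{}-{}'.format(start, end))
    return lst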
Can anyone tell me how to keep the part files downloaded, with names like part1, part2, and so on?
It turns out the file must be opened in binary mode, with 'wb' instead of 'w'. If opened with just 'w', a bunch of extra characters will be written. This has something to do with derpy Windows vs. Linux newline semantics. If you use 'wb' it will write exactly what you put into the file.
EDIT:
If you want to store the individual file parts, you can change
# reassemble file in correct order
with open(fileName, 'w') as fh:
    for _idx, chunk in sorted(dataDict.iteritems()):
        fh.write(chunk)

print "Finished Writing file %s" % fileName
print 'file size {} bytes'.format(os.path.getsize(fileName))
To
# write each chunk to its own part file, in order
for _idx, chunk in sorted(dataDict.iteritems()):
    with open(fileName + str(".part-") + str(_idx), 'wb') as fh:
        fh.write(chunk)

print "Finished Writing file %s" % fileName
#print 'file size {} bytes'.format(os.path.getsize(fileName))
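If the parts later need stitching back together, a small sketch (my own; it assumes the .part-<idx> naming from the snippet above and sorts numerically, since a plain string sort would put part-10 before part-2):
import glob

def reassemble(file_name):
    # collect image.jpg.part-0, image.jpg.part-1, ... sorted by numeric suffix
    parts = sorted(glob.glob(file_name + ".part-*"),
                   key=lambda p: int(p.rsplit("-", 1)[1]))
    with open(file_name, "wb") as out:
        for part in parts:
            with open(part, "rb") as fh:
                out.write(fh.read())

reassemble("image.jpg")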
I've written a Python script that works well on OS X/Linux, but I'm having problems on Windows (see title). It uses the Pillow module, and the error originates in module PIL\Image.py on line 2274.
My code:
# -*- coding: utf-8 -*-
import os
import sys
import urllib2
from PIL import Image, ImageFile
from PyPDF2 import PdfFileReader, PdfFileWriter, PdfFileMerger
from bs4 import BeautifulSoup

ImageFile.LOAD_TRUNCATED_IMAGES = True

def parser():
    try:
        return sys.argv[1].lower()
    except IndexError:
        print 'no argument specified'

the_url = 'http://www.oldgames.sk'
base_url = the_url + '/mags/'

# Add magazines + relative URLs here
magazines = {
    'score': 'score/',
    'level': 'level/',
    'amiga': 'amiga-magazin/',
    'bit': 'bit/',
    'commodore': 'commodore-amater/',
    'CGW': 'cgw/',
    'excalibur': 'excalibur/',
    'hrac': 'hrac-cz/',
    'joystick': 'joystick-sk/',
    'pocitac-aktivne': 'pocitac-aktivne/',
    'pocitacove-hry': 'pocitacove-hry/',
    'riki': 'riki/',
    'zzap64': 'zzap64/'}

issue_links = []
download_list = {}

def parse_args(arg):
    if arg == '--list':
        items = [i for i in magazines.keys()]
        for item in items:
            print item
        sys.exit()
    elif arg in magazines:
        print "Scraping %s magazine..." % arg.capitalize()
        return base_url + magazines[arg]
    else:
        return sys.exit('invalid magazine name')

def extract_links_to_issue(url):
    soup = BeautifulSoup(urllib2.urlopen(url))
    for div in soup.findAll('div', 'mImage'):
        issue_links.append(the_url + div.a['href'])
    print 'Scraped %d links' % len(issue_links)

def issue_renamer(issue_name):
    char1 = '\\'
    char2 = '/'
    replacement = '-'
    if char1 in issue_name:
        issue_name = issue_name.replace(char1, replacement)
        print 'inv. char (%s): renaming to %s' % (char1, issue_name)
    elif char2 in issue_name:
        issue_name = issue_name.replace(char2, replacement)
        print 'inv. char (%s): renaming to %s' % (char2, issue_name)
    return issue_name

def extract_links_to_images(issue_links):
    for index, link in enumerate(issue_links):
        print 'Scraping issue #%d: %s' % (index + 1, link)
        issue_soup = BeautifulSoup(urllib2.urlopen(link))
        image_list = []
        for image in issue_soup.findAll('div', 'mags_thumb_article'):
            issue_name = issue_renamer(issue_soup.findAll('h1', 'top')[0].text)
            image_list.append(the_url + image.a['href'])
        download_list[issue_name] = image_list

def clean_up(list_of_files, list_of_pdfs):
    num = len(list_of_files) + len(list_of_pdfs)
    for file in list_of_files:
        os.remove(file)
    for pdf in list_of_pdfs:
        os.remove(pdf)
    print 'Cleaned up %d files' % num

def convert_images(list_of_files, issue):
    list_of_pdfs = []
    for index, file in enumerate(list_of_files):
        im = Image.open(file)
        outfile = file + '.pdf'
        im.save(outfile, 'PDF')
        list_of_pdfs.append(outfile)
        print 'converting ...' + str((index + 1)) + '/' + str(len(list_of_files))
    final_pdf = PdfFileMerger()
    for pdf in list_of_pdfs:
        final_pdf.append(open(pdf, 'rb'))
    issue_name = issue + '.pdf'
    final_pdf.write(open(issue_name, 'wb'))
    final_pdf.close()
    print '--- PDF completed ---'
    clean_up(list_of_files, list_of_pdfs)

def download_images(download_list):
    for issues, image_list in download_list.items():
        print 'Preparing %s ...' % issues
        list_of_files = []
        for image in image_list:
            image_name = os.path.split(image)[1]
            list_of_files.append(image_name)
            f = open(image_name, 'w')
            f.write(urllib2.urlopen(image).read())
            print 'Downloading image: %s' % image
            f.close()
        convert_images(list_of_files, issues)

arg = parser()
extract_links_to_issue(parse_args(arg))
extract_links_to_images(issue_links)
download_images(download_list)
I'd like to fix this; can anyone help me?
You are copying images into a file opened in text mode:
f = open(image_name, 'w')
f.write(urllib2.urlopen(image).read())
On Windows this means that any 0A (newline) bytes are translated to 0D 0A byte sequences (carriage return, newline), as that is the Windows line separator.
Open your files in binary mode:
f = open(image_name, 'wb')
f.write(urllib2.urlopen(image).read())
I'd switch to using the file as a context manager (with the with statement) so you don't have to close it manually, and to using shutil.copyfileobj() to stream the data straight to disk (in blocks) rather than reading the whole image into memory in one go:
import shutil

# ...

with open(image_name, 'wb') as f:
    shutil.copyfileobj(urllib2.urlopen(image), f)
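A quick way to observe the translation the answer describes; on Windows the text-mode file comes out one byte bigger (the file names here are arbitrary):
import os

# write the same single newline in text mode and in binary mode
with open("text_mode.tmp", "w") as f:
    f.write("\n")        # text mode: becomes 0D 0A on Windows
with open("bin_mode.tmp", "wb") as f:
    f.write(b"\n")       # binary mode: always exactly 0A

print(os.path.getsize("text_mode.tmp"))  # 2 on Windows, 1 elsewhere
print(os.path.getsize("bin_mode.tmp"))   # 1 everywhere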
I have the following code that uses FFmpeg. It takes 5 argv: filename, segment size, start time, end time, and rating. It's supposed to let me classify segments many times with my ratings ("PG", "G", "M18", ...), but there's this error:
File "C:\c.py", line 92, in <module>
    os.rename(filename + str(x), filename + str(x) + classification)
WindowsError: [Error 2] The system cannot find the file specified.
I've tried editing it many times, but this error still persists. Does anyone have any idea what this error could mean, and any way to solve it?
import sys
import subprocess
import os

# change hh:mm:ss to seconds:
def getSeconds(sec):
    l = sec.split(':')
    return int(l[0]) * 3600 + int(l[1]) * 60 + float(l[2])

def get_total_time(filename):
    proc = subprocess.Popen(["ffmpeg", "-i", filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    lines = proc.communicate()[1]
    target = [line for line in lines.split('\n') if 'Duration:' in line][0]
    time = target.split('Duration: ')[-1].split(',', 1)[0]
    return time

# check command line arguments
if len(sys.argv) < 5:
    print "Error: not enough arguments"
    sys.exit()

# save filename to file_name
file_name = sys.argv[1]
if not file_name.endswith('mpg'):
    print 'Error! File extension not supported'
    sys.exit()

# save the size of a chunk in chunk
segsize = int(sys.argv[2])
chunk = (segsize * 1024)

# get time of starting censorship in seconds
start_censorship = getSeconds(sys.argv[3])
# get time of ending censorship in seconds
end_censorship = getSeconds(sys.argv[4])

classification = sys.argv[5]
if classification not in ['P', 'PG', 'G', 'NC16', 'M18', 'R21']:
    print "Error: invalid classification"
    sys.exit()

# initialize variable for extension
file_ext = ''
# if extension exists then save it into file_ext
if '.' in file_name:
    # split file_name in two parts from the right
    file_ext = sys.argv[1].split('.')[-1]
    # file_name without extension
    filename = '.'.join(file_name.split('.')[:-1])

# total_time of file in seconds
total_time = getSeconds(get_total_time(file_name))
print total_time

# open file
in_file = open(file_name, 'rb')
# read first chunk
s = in_file.read(chunk)
in_file.seek(0, 2)
file_size = in_file.tell()
chunks = (file_size / chunk) + 1
chunk_time = total_time / file_size * chunk
# close input file
in_file.close()

# loop over each chunk
for x in range(0, chunks):
    # starting time of current chunk
    t1 = chunk_time * x
    # ending time of current chunk
    t2 = chunk_time * (x + 1)
    if t2 < start_censorship or t1 > end_censorship:
        pass
    else:
        if os.path.exists(filename + str(x) + 'x'):
            os.rename(filename + str(x) + 'x', filename + str(x))
        os.rename(filename + str(x), filename + str(x) + classification)
    # read next bytes
You're not checking whether filename + str(x) exists before you try to rename it. Either check first, or put it in a try block and catch OSError (which WindowsError subclasses). Either that, or you're missing an indent on the second rename.
In that last for loop it looks like you are possibly renaming multiple files -- where do the original files come from?
Or, asked another way: for every chunk that is not censored you are renaming a file -- is that really what you want?
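A sketch of the guarded rename suggested above (it assumes the filename + str(x) naming from the question; whether renaming is the right operation at all is the open question in the last paragraph):
import os

part = filename + str(x)
# put the 'x'-suffixed file back first, if a previous run left one behind
if os.path.exists(part + 'x'):
    os.rename(part + 'x', part)
try:
    os.rename(part, part + classification)
except OSError as exc:  # WindowsError is a subclass of OSError
    print("skipping %s: %s" % (part, exc))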