Iterating through file sizes recursively - Python

Hi, I have created a mini program to get the MD5 hash of all files in the directory where the script is located.
My problem is that when I call get_size() on the files I end up with only one line in my filelist.md5.txt, and get_size() seems to output the sum for the entire directory instead of each individual file's size.
How can I output each individual file's size in this script?
I get this output in the filelist.md5.txt file:
#
# GENERATE_FILELIST
# (filename) (filesize) (md5) (major_version) (minor_version)
#
Test_2.txt 190 dea9fe052f1abf71bac7421c732b0475 ---- ----
However I want to get this output:
#
# GENERATE_FILELIST
# (filename) (filesize) (md5) (major_version) (minor_version)
#
MD5.bat filesize b9a7c825517002e0da8e980c2c2c2cef ---- ----
MD5.py filesize b61124e8bef473d377f59aa0964174ce ---- ----
MD5test.bat filesize f29d68f9721c57d648164cae79dac71b ---- ----
MD5test.py filesize a7a3c45ebe1aca82f57591c7fccd6cfc ---- ----
MD5v1.bat filesize e5e7407117845a2413fe667fe7a2f681 ---- ----
MD5v1.py filesize 55ab90b5a623548825a0b40406fcdde2 ---- ----
MD5v2.bat filesize e9e31aaa62f6f37572cf89a03860cb96 ---- ----
MD5v3.bat filesize 559c0e9ed05fc9b4884c83bc3e04f8fd ---- ----
MD5v3.py filesize d20a8841f3c37d28fd3b74847731e212 ---- ----
Test_2.txt filesize dea9fe052f1abf71bac7421c732b0475 ---- ----
Code so far:
import glob
import hashlib
import sys
import os

filenames = glob.glob('*.*')

# truncate the file to zero length before opening
f1 = open(os.path.expanduser(sys.path[0]) + '\\filelist.md5.txt', 'w')

#'a' will append the file, rather than write over whatever else you put in it like 'w'
with open('filelist.md5.txt', 'a') as f:
    print >> f, ''
    print >> f, '#'
    print >> f, '# GENERATE_FILELIST'
    print >> f, '# (filename) (filesize) (md5) (major_version) (minor_version)'
    print >> f, '#'
    print >> f, ''
    f.close()

# print to console
for filename in filenames:
    with open(filename, 'rb') as inputfile:
        data = inputfile.read()
        print '. -- ', filename, ' ---------- ', hashlib.md5(data).hexdigest()

# get the size of each file
def get_size(start_path = '.'):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for fn in filenames:
            fp = os.path.join(dirpath, fn)
            total_size += os.path.getsize(fp)
    return total_size

#'a' will append the file, rather than write over whatever else you put in it like 'w'
with open('filelist.md5.txt', 'a') as f:
    print >> f, '{:44}'.format(filename), get_size(), ' ', hashlib.md5(data).hexdigest(), ' ', '----', ' ', '----'
    f.close()

Your get_size() is written to return the size of the whole directory, which is not what you're looking for.
import os

dir = r'specify\path\here'
with open('filelist.md5.txt', 'w') as fx:
    for f in os.listdir(dir):
        path = os.path.join(dir, f)
        if os.path.isfile(path):
            # specify anything else you want to write inside fx.write()
            fx.write(f + "\t\t" + str(os.path.getsize(path)) + "\n")
The above code writes the file name and size, separated by tabs, each on its own line.
You don't have to explicitly close the file when you use with open('filelist.md5.txt', 'a') as f:.
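To get the exact layout from the question, you can compute the size and the MD5 in the same per-file loop. A minimal sketch (the 44-character column width and the ---- placeholders are copied from the question's desired output; adjust as needed):

import hashlib
import os

with open('filelist.md5.txt', 'w') as out:
    out.write('#\n# GENERATE_FILELIST\n')
    out.write('# (filename) (filesize) (md5) (major_version) (minor_version)\n#\n')
    for name in sorted(os.listdir('.')):
        if os.path.isfile(name):
            with open(name, 'rb') as fh:
                digest = hashlib.md5(fh.read()).hexdigest()
            # per-file size, not the directory total
            out.write('{:44}{} {} ---- ----\n'.format(name, os.path.getsize(name), digest))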

Try this (it works better for large files and non-ASCII file names, avoids the glob module, and includes error handling):
import hashlib, os, sys

your_target_folder = "."  # put your folder here, or just "." for the current one

def get_size(filename):
    st = os.stat(filename)
    return str(st.st_size)

def get_minor_version(filename):
    # Your Code ...
    return "minor_version"

def get_major_version(filename):
    # Your Code ...
    return "major_version"

def get_md5(fname):
    # read in 1 MiB chunks so large files don't have to fit in memory
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(2 ** 20), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

# this function works for non-ASCII file names (e.g. Chinese characters)
def sys_out(out_data):
    try:
        print(out_data)
    except UnicodeEncodeError:
        if sys.version_info >= (3,):
            print(out_data.encode('utf8').decode(sys.stdout.encoding))
        else:
            print(out_data.encode('utf8'))

def make_beautiful_terminal_output(get_list):
    col_width = max(len(word) for word in get_list) + 3  # padding
    return "".join(word.ljust(col_width) for word in get_list)

def print_header():
    header_tag = "(filename) (filesize) (md5) (major_version) (minor_version)\n"
    with open("filelist.md5.txt", "a") as my_header:
        my_header.write(header_tag)
    print(header_tag)

print_header()
for dirpath, _, filenames in os.walk(your_target_folder):
    for items in filenames:
        file_full_path = os.path.abspath(os.path.join(dirpath, items))
        try:
            my_last_data = [items, get_size(file_full_path), get_md5(file_full_path), get_major_version(file_full_path), get_minor_version(file_full_path)]
            terminal_output = make_beautiful_terminal_output(my_last_data)
            sys_out(terminal_output)
            re_buffer = terminal_output + "\n"
            with open("filelist.md5.txt", "a", encoding='utf-8') as my_save_file:
                my_save_file.write(re_buffer)
        except Exception:
            sys_out("Error On " + str(file_full_path))

Related

Python function within a loop iterating over text files only works on the first file

I'm writing a simple script which loops over some text files and uses a function that should replace some strings, looking them up in a .csv file (every row has the word to replace and the word I want in its place).
Here is my simple code:
import os
import re
import csv
from colorama import Fore  # the code below uses Fore, which comes from colorama

def substitute_tips(table, tree_content):
    count = 0
    for l in table:
        print("element of the table", l[1])
        reg_tree = re.search(l[1], tree_content)
        if reg_tree is not None:
            #print("match in the tree: ", reg_tree.group())
            tree_content = tree_content.replace(reg_tree.group(), l[0])
            count = count + 1
        else:
            print("Not found: ", l[1])
            tree_content = tree_content
    print("Substitutions done: ", count)
    return(tree_content)

path = os.getcwd()
table_name = "162_table.csv"
table = open(table_name)
csv_table = csv.reader(table, delimiter='\t')
for root, dirs, files in os.walk(path, topdown=True):
    for name in files:
        if name.endswith(".tree"):
            print(Fore.GREEN + "Working on treefile", name)
            my_tree = open(name, "r")
            my_tree_content = my_tree.read()
            output_tree = substitute_tips(csv_table, my_tree_content)
            output_file = open(name.rstrip("tree") + "SPECIES_NAME.tre", "w")
            output_file.write(output_tree)
            output_file.close()
        else:
            print(Fore.YELLOW + name, Fore.RED + "doesn't end in .tree")
It's probably very easy, but I'm a newbie.
Thanks!
The files list returned by os.walk contains only the file names rather than the full path names. You should join root with the file names instead to be able to open them:
Change:
my_tree = open(name, "r")
...
output_file = open(name.rstrip("tree") + "SPECIES_NAME.tre", "w")
to:
my_tree = open(os.path.join(root, name), "r")
...
output_file = open(os.path.join(root, name.rstrip("tree") + "SPECIES_NAME.tre"), "w")
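Put together, the loop body would look something like this minimal sketch (reusing the question's substitute_tips and csv_table, otherwise unchanged):

for root, dirs, files in os.walk(path, topdown=True):
    for name in files:
        if name.endswith(".tree"):
            full_path = os.path.join(root, name)  # full path, not just the name
            with open(full_path, "r") as my_tree:
                my_tree_content = my_tree.read()
            output_tree = substitute_tips(csv_table, my_tree_content)
            out_path = os.path.join(root, name.rstrip("tree") + "SPECIES_NAME.tre")
            with open(out_path, "w") as output_file:
                output_file.write(output_tree)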

Python - How to stop the loop

I have a script that reads files called source1.html, source2.html, source3.html, and so on, but when it can't find the next file (because it doesn't exist) it gives me an error. There can be any number of sourceX.html files, so I need something that stops the loop when the next sourceX.html file can't be found.

Traceback (most recent call last):
  File "main.py", line 14, in <module>
    file = open(filename, "r")
IOError: [Errno 2] No such file or directory: 'source4.html'

How can I stop the script looking for the next source file?
from bs4 import BeautifulSoup
import re
import os.path

n = 1
filename = "source" + str(n) + ".html"
savefile = open('OUTPUT.csv', 'w')
while os.path.isfile(filename):
    strjpgs = "Extracted Layers: \n \n"
    filename = "source" + str(n) + ".html"
    n = n + 1
    file = open(filename, "r")
    soup = BeautifulSoup(file, "html.parser")
    thedata = soup.find("div", class_="cplayer")
    strdata = str(thedata)
    DoRegEx = re.compile('/([^/]+)\.jpg')
    jpgs = DoRegEx.findall(strdata)
    strjpgs = strjpgs + "\n".join(jpgs) + "\n \n"
    savefile.write(filename + '\n')
    savefile.write(strjpgs)
    print(filename)
    print(strjpgs)
savefile.close()
print "done"
Use a try / except and break:

while os.path.isfile(filename):
    try:  # try to do this
        # <your code>
    except FileNotFoundError:  # if this error occurs
        break  # exit the loop
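Note that FileNotFoundError only exists on Python 3; the traceback in the question shows IOError, so on Python 2 you would catch IOError (its closest equivalent here) instead.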
The reason your code doesn't currently work is that your while loop checks whether the previous file exists, not the next one. Hence you could also do:

while True:
    strjpgs = "Extracted Layers: \n \n"
    filename = "source" + str(n) + ".html"
    if not os.path.isfile(filename):
        break
    # <rest of your code>
You can try opening the file, and break out of the while loop once you catch an IOError exception.

from bs4 import BeautifulSoup
import re
import os.path

n = 1
filename = "source" + str(n) + ".html"
savefile = open('OUTPUT.csv', 'w')
while os.path.isfile(filename):
    try:
        strjpgs = "Extracted Layers: \n \n"
        filename = "source" + str(n) + ".html"
        n = n + 1
        file = open(filename, "r")
    except IOError:
        print("file not found! breaking out of loop.")
        break
    soup = BeautifulSoup(file, "html.parser")
    thedata = soup.find("div", class_="cplayer")
    strdata = str(thedata)
    DoRegEx = re.compile('/([^/]+)\.jpg')
    jpgs = DoRegEx.findall(strdata)
    strjpgs = strjpgs + "\n".join(jpgs) + "\n \n"
    savefile.write(filename + '\n')
    savefile.write(strjpgs)
    print(filename)
    print(strjpgs)
savefile.close()
print "done"
I suggest you use both os.path.exists() (which returns True/False) and os.path.isfile().
Use the with statement to open files; it is the Pythonic way, and the one professional coders prefer.
These are the contents of my current working directory.
H:\RishikeshAgrawani\Projects\Stk\ReadHtmlFiles>dir
Volume in drive H is New Volume
Volume Serial Number is C867-828E
Directory of H:\RishikeshAgrawani\Projects\Stk\ReadHtmlFiles
11/05/2018 16:12 <DIR> .
11/05/2018 16:12 <DIR> ..
11/05/2018 15:54 106 source1.html
11/05/2018 15:54 106 source2.html
11/05/2018 15:54 106 source3.html
11/05/2018 16:12 0 stopReadingIfNot.md
11/05/2018 16:11 521 stopReadingIfNot.py
5 File(s) 839 bytes
2 Dir(s) 196,260,925,440 bytes free
The Python code below shows how to read the files source1.html, source2.html, source3.html and stop when there are no more files of the form sourceX.html (where X is 1, 2, 3, 4, etc.).
Sample code:
import os

n = 1
html_file_name = 'source%d.html'

# Check that sourceX.html is a file and that it exists;
# if so, perform operations (read/write etc.) on it.
while os.path.isfile(html_file_name % (n)) and os.path.exists(html_file_name % (n)):
    print "Reading ", html_file_name % (n)
    # The best (Pythonic) way to open a file:
    # you don't need to bother about closing it,
    # the with statement takes care of that.
    with open(html_file_name % (n), "r") as file:
        # Make sure it works
        print html_file_name % (n), " exists\n"
    n += 1
Output:
H:\RishikeshAgrawani\Projects\Stk\ReadHtmlFiles>python stopReadingIfNot.py
Reading source1.html
source1.html exists
Reading source2.html
source2.html exists
Reading source3.html
source3.html exists
So, based on the above logic, you can modify your code and it will work. Thanks.
This appears to be a sequence error. Let's look at a small fragment of your code, specifically lines dealing with filename:
filename = "source" + str(n) + ".html"
while os.path.isfile(filename):
filename = "source" + str(n) + ".html"
n = n + 1
file = open(filename, "r")
You're generating the next filename before you open the file (or really, checking the old filename then opening a new one). It's a little hard to see because you're really updating n while filename holds the previous number, but if we look at them in sequence it pops out:
n = 1
filename = "source1.html"  # before loop
while os.path.isfile(filename):
    filename = "source1.html"  # first time inside loop
    n = 2
    open(filename)
while os.path.isfile(filename):  # second time in loop - still source1
    filename = "source2.html"
    n = 3
    open(filename)  # We haven't checked if this file exists!
We can fix this a few ways. One is to move the entire updating, n before filename, to the end of the loop. Another is to let the loop mechanism update n, which is a sight easier (the real fix here is that we only use one filename value in each iteration of the loop):
import itertools

for n in itertools.count(1):
    filename = "source{}.html".format(n)
    if not os.path.isfile(filename):
        break
    file = open(filename, "r")
    #...
At the risk of looking rather obscure, we can also express the steps functionally (I'm using six here to avoid a difference between Python 2 and 3; Python 2's map wouldn't finish):
from six.moves import map
from itertools import count, takewhile

numbers = count(1)
filenames = map('source{}.html'.format, numbers)
existingfiles = takewhile(os.path.isfile, filenames)
for filename in existingfiles:
    file = open(filename, "r")
    #...
Other options include iterating over the numbers alone and using break when isfile returns False, or simply catching the exception when open fails (eliminating the need for isfile entirely).
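For completeness, a minimal sketch of that last option, catching the exception when open fails (IOError covers the missing-file case on both Python 2 and 3):

n = 1
while True:
    filename = "source{}.html".format(n)
    try:
        file = open(filename, "r")
    except IOError:  # no more sourceN.html files
        break
    # ... process the file as before ...
    file.close()
    n += 1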

How to save all files, not replacing the file?

I have 100 text files and I want to save them into 100 output files too. Right now, my code can read all the files, but it saves only one file, which holds the latest result. Here is the code:
import os

def nama():
    path = "C:/Amar/code/"
    infilename = os.listdir(path)
    print len(infilename)
    for filename in infilename:
        print("jumpa dah" + path + "\\" + filename)
        f = open(path + "\\" + filename, "r")
        data = f.read()
        f.close()
        lines = data.split('\n')
        outfilename = path + "result.txt"
        print outfilename
        f = open(outfilename, "a")
Append a string that will act as a unique identifier for each output file. You can use the input filename for this:
outfilename = path + filename + "_result.txt"
# e.g reports_result.txt
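Applied to the loop above, a minimal sketch (keeping the question's variable names, and assuming each input file's contents should go to its own result file):

for filename in infilename:
    f = open(path + "\\" + filename, "r")
    data = f.read()
    f.close()
    outfilename = path + filename + "_result.txt"  # unique name per input file
    out = open(outfilename, "w")
    out.write(data)
    out.close()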

Python 3.4 - Add file name + line number to all files in a folder

I'm not a programmer and I've been doing my best to create some small scripts with Python 3.4 that help me with different tasks at work.
I have several .txt files, and to every line in each file I need to append:
the file name
the file name + line number
Then I need to save the result as a UTF-8 CSV with all fields separated by commas.
I managed to do this for one particular file, but I'm struggling to do it for all the files in the folder. I've tried import glob, but with no success.
This is the code right now (a mess... that partially works):
with open('Ruth.txt', 'r') as program:
    data = program.readlines()
with open('Ruth.txt', 'w') as program:
    for (number, line) in enumerate(data):
        program.write('%d","%s' % (number + 1, line))

files = 'Ruth.txt'
all_lines = []
for f in files.split():
    lines = open(f, 'r').readlines()
    for line in lines:
        all_lines.append('"' + f + '"' + ',' + '"' + f + line.strip() + '"')
    fout = open(f + 'out.csv', 'w')
    fout.write('\n'.join(all_lines))
    fout.close()
Try this:
import os

def add_numbers(filename):
    with open(filename, 'r') as readfile:
        data = readfile.readlines()
    with open(filename, 'w') as writefile:
        for i, line in enumerate(data):
            writefile.write('%d. %s' % (i + 1, line))

folder = '.'  # the folder you want to process
for path, _, filenames in os.walk(folder):
    for filename in filenames:
        add_numbers(os.path.join(path, filename))
This will add numbers to each file in the directory and each file in all sub-directories. If you don't want it to check all sub-directories, change the for loop to this:
path, _, filenames = next(os.walk(folder))
for filename in filenames:
    add_numbers(os.path.join(path, filename))
Here is a complete script that takes one positional argument (the folder) and creates a new .csv file at the same level as each input file:
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import sys
from pathlib import Path

def get_files(folder_path, suffix=".txt"):
    return Path(folder_path).glob("**/*%s" % suffix)

def write_lines(file_):
    with file_.with_suffix(".csv").open("w") as fout, file_.open(encoding="utf-8") as fin:
        for i, line in enumerate(fin, 1):
            # line number, file name, line
            new_line = ",".join(["%d." % i, file_.name, line])
            fout.write(new_line)

def main(folder):
    for file_ in get_files(folder):
        print(file_)
        write_lines(file_)

if __name__ == '__main__':
    try:
        main(sys.argv[1])
    except IndexError:
        print("usage: %s foldername" % sys.argv[0])
This will take all text files in the current folder and turn them into UTF-8 encoded 'CSV-style' files, where whitespace in the text becomes commas and the filename and line number are also comma-separated.
from glob import glob

filenames = glob("*.txt")
for fn in filenames:
    text = ''  # reset per file so each CSV holds only its own lines
    with open(fn, 'r') as f:
        for i, line in enumerate(f):
            line = ','.join(line.split())
            # str(i + 1) is required: join() only accepts strings
            text += ','.join((line, fn, str(i + 1))) + '\n'
    fnew = fn.rsplit('.', 1)[0] + '.csv'
    with open(fnew, 'w', encoding='utf-8') as f:
        f.write(text)

Running script on every text file in folder

I'm looking to run my script on all text files in a directory, but I'm having a lot of trouble.
Here is the code I have so far:
data = {}
date = ID = values = None
infile = "z140327b.txt"
outfile = "oz140327b.txt"
sample = 1
with open(infile) as datafile, open(outfile, 'w') as f2:
    for line in datafile:
        if line.lstrip().startswith('!'):
            date = line[1:].strip()
        elif line.lstrip().startswith('?'):
            sample = 2
        elif line.lstrip().startswith('#'):
            ID = line[1:].strip()
            data[ID] = {}
            data[ID]['date'] = date
            tedtime = ID[0:2] + ":" + ID[2:]
            str_1 = str(data[ID])
            f2.write(tedtime + ' ' + date + ',' + str(sample))
        elif line.strip():
            if not ID:
                continue
            try:
                words = line.split()
                value = float(words[-1])  # last word
                unit = words[-2].lstrip('(').rstrip(')')
                item = {'value': value, 'unit': unit}
                key = ' '.join(words[:-2])
                data[ID][key] = item
            except (ValueError) as err:
                print("Could not parse this line:")
                print(line)
                continue
        else:  # if 'empty' line
            ca_str = str(data[ID]['Contact Angle']['value'])
            f2.write(',' + ca_str + '\n')
            ID = None
    ca_str2 = str(data[ID]['Contact Angle']['value'])
    f2.write(',' + ca_str2 + '\n')
At the minute, I'm manually adding the filename (infile) and the output filename (outfile). I would like the output file name to be the same as the input file, preceded by an 'o', as shown in the example code.
You can use glob to get all the files in the directory:
from glob import glob

files = glob('*.txt')
for filename in files:
    with open(filename, 'r') as f, open('o' + filename, 'w') as f1:
        ....
        # read from f
        # write to f1
Simply iterate over each filename, do what you want with it, and write it to a new file.
Make sure your script is run from the directory the files are in, or else pass the path to glob.
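For example, a sketch of passing an explicit directory to glob (the /some/dir path is a placeholder):

import os
from glob import glob

target_dir = '/some/dir'  # placeholder: the directory holding the .txt files
for filename in glob(os.path.join(target_dir, '*.txt')):
    base = os.path.basename(filename)
    with open(filename, 'r') as f, open(os.path.join(target_dir, 'o' + base), 'w') as f1:
        f1.write(f.read())  # or process line by line as needed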
import glob
import os.path

def text_files(target_dir):
    """Return (infile, outfile) tuple for all *.txt files in target_dir."""
    text_files = os.path.join(target_dir, '*.txt')
    for fname in glob.glob(text_files):
        outfile = 'o' + os.path.basename(fname)
        outfile = os.path.join(target_dir, outfile)
        yield fname, outfile

# Search for text files in /tmp
for inf, outf in text_files("/tmp"):
    print inf, outf
