generalize python script to run on all files in a directory - python

I have the following python script:
with open('ein.csv', 'r') as istr:
with open('aus.csv', 'w') as ostr:
for line in istr:
line = line.rstrip('\n') + ',1'
print(line, file=ostr)
How could this be generalized to run on all the files in a directory and output a separate file for each one?
maybe have a function like this:
for phyle in list_files(.):
with open(phyle, 'r') as istr:
with open('ausput_xyz.csv', 'w') as ostr:
for line in istr:
line = line.rstrip('\n') + ',1'
print(line, file=ostr)
def list_files(path):
    """Return the names of all regular files directly inside ``path``.

    Each name includes its extension but not the directory part.
    Sub-directories are skipped and the listing is not recursive.
    """
    import os  # local import: the snippet shows no module-level import

    return [
        name
        for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name))
    ]

Just put the code in a function and call it:
import os


def process(infilename):
    """Copy ``infilename`` to ``<stem>-out.csv`` with ``,1`` appended per line.

    The output file is created next to the input file: only the extension
    of ``infilename`` is replaced, any directory part is kept.
    """
    outfilename = os.path.splitext(infilename)[0] + "-out.csv"
    with open(infilename, 'r') as istr, open(outfilename, 'w') as ostr:
        for line in istr:
            ostr.write(line.rstrip('\n') + ',1\n')


def process_files(path):
    """Run :func:`process` on every regular file directly inside ``path``."""
    # os.listdir() returns a complete list before the loop starts, so the
    # output files created during this run are not picked up as inputs.
    for name in os.listdir(path):
        full_path = os.path.join(path, name)
        if os.path.isfile(full_path):
            # Pass the joined path: a bare name would only resolve when
            # ``path`` happens to be the current working directory.
            process(full_path)
In a directory with input files "abc.csv", "xyz.csv" this code will create output files named "abc-out.csv" and "xyz-out.csv".
Note that os.listdir(path) is called just once during the execution, so the list of files to process will not include the newly created output files.

First off, a more Pythonic way to deal with CSV files is to use the csv module, and to open the files with a with statement, which closes the file object automatically at the end of the block. Use the os.walk() function to iterate over the files and directories under a specific path:
import csv
import os
# Walk the tree under 'relative_path' and write a CSV copy of every file
# next to the original, named '<original name>.csv'.
for path_name, dirs, files in os.walk('relative_path'):
    for file_name in files:
        # os.walk() yields bare file names; join them with the directory
        # being visited so the file is found regardless of the current dir.
        in_path = os.path.join(path_name, file_name)
        out_path = '{}.csv'.format(in_path)
        # On Python 3 the csv module needs text-mode files opened with
        # newline=''; a file opened with 'wb' makes csv.writer raise TypeError.
        with open(in_path, newline='') as inp, \
                open(out_path, 'w', newline='') as out:
            spamwriter = csv.writer(out, delimiter=',')
            # Parse each line into fields first: writerow() on a raw string
            # would emit one field per character.
            for row in csv.reader(inp, delimiter=','):
                spamwriter.writerow(row)  # extend `row` here to add columns
Note that if your script is not in the same directory as your files, you need to prepend the directory path to the file name when you open each file:
with open(path_name+'/'+file_name),...

Related

Python script to loop for files in subdirectories in order to change text and extension

I'm trying to loop through files in multiple subdirectories in order to :
1- Add some text inside the files (ending with .ext)
2- Change the extension of each file from .ext to .ext2
The script works fine when I have only one subdir in the main directory, but when I try to run the script on multiple subdirs it says:
line 8, in
with open(name, "r") as f:
FileNotFoundError: [Errno 2] No such file or directory: "here the name of the subdir"
import os
directory = 'C:\\Users\\folder\\subfolders'
for dir, subdirs, files in os.walk(directory):
for name in files:
if name.endswith((".ext")):
with open(name, "r") as f:
XMLContent = f.readlines()
XMLContent.insert(6, '<XMLFormat>\n')
XMLContent.insert(40, '\n</XMLFormat>')
with open(name, "w") as f:
XMLContent = "".join(XMLContent)
f.write(XMLContent)
os.rename(os.path.join(dir, name), os.path.join(dir, name[:name.index('.ext')] +".ext1"))
Above is a screenshot of the sub dirs I have in the folder (1.Modified).
I've also created a new folder called all and put in it three folders and for each folder, I've created 2 files of .ext type.
So, I was able to write inside each file of them and change its name as well.
import os
# For every '.ext' file under the tree: insert the XMLFormat tags into its
# lines, write the result back, then rename the file from '.ext' to '.ext2'.
for root, dirs, files in os.walk("/Users/ghaith/Desktop/test/all"):
    for entry in files:
        if not entry.endswith('.ext'):
            continue
        path = root + '/' + entry
        with open(path, "r") as src:
            lines = src.readlines()
        # Positions are list indexes into the file's lines; the second
        # insert is counted after the first tag has already been added.
        lines.insert(1, '<XMLFormat>\n')
        lines.insert(3, '\n</XMLFormat>')
        with open(path, "w") as dst:
            dst.writelines(lines)
        # Appending '2' turns the '.ext' suffix into '.ext2'.
        os.rename(path, path + '2')
Output:
< XMLFormat >
< /XMLFormat >
you need to pass the directory to open the file
with open(os.path.join(directory, name), "r") as f:
But I think the best way is to use os.listdir() to loop over the directory:
for item in os.listdir(directory):
if item.endswith(".ext"):
with open(os.path.join(directory, item), "r") as r:

Delete the file if it has more than 300 lines

I have 500 text files in a folder. How do I find out how many lines each text file has, and delete it if it has more than 300 lines?
import os
path = "All_TSV_Files"
files = [file for file in os.listdir(path) if file.endswith(".txt")]
for file in files:
with open(os.path.join(path, file), 'r',encoding='utf-8') as f:
#find the lines of each file
Unfortunately you will have to read the file to see how many lines it has. In your case, you could skip it once you reach 300.
Sample code to get line count
# Count the lines of sample.txt, ignoring lines that consist only of a
# newline. The with-block closes the file even if reading fails part-way.
with open("sample.txt", "r") as file:
    line_count = sum(1 for line in file if line != "\n")
print(line_count)
import os
from itertools import islice

# Files with more than this many lines are deleted.
MAX_LINES = 300

path = "All_TSV_Files"
files = [name for name in os.listdir(path) if name.endswith(".txt")]
for name in files:
    file_path = os.path.join(path, name)
    with open(file_path, 'r', encoding='utf-8') as f:
        # Reading at most MAX_LINES + 1 lines is enough to decide, so large
        # files are never loaded into memory in full.
        too_long = sum(1 for _ in islice(f, MAX_LINES + 1)) > MAX_LINES
    # Remove only after the handle is closed: deleting a file that is still
    # open raises PermissionError on Windows.
    if too_long:
        os.remove(file_path)
Like Saurabh said, if you wanna do it with Python seems like you'd have to iterate through all the lines in order to count them. This article demonstrates doing it with enumerate(). https://pynative.com/python-count-number-of-lines-in-file/
If you're on Unix you could use AWK since you've already imported os: awk 'END{print NR}' file

Python - loop through subfolders and files in a directory without ignoring the subfolder

I have read all the Stack Exchange help files on looping through subfolders, as well as the os documentation, but I am still stuck. I am trying to loop over files in subfolders, open each file, extract the first number in the first line, copy the file to a different subfolder (with the same name but in the output directory) and rename the file copy with the number as a suffix.
import os
import re
outputpath = "C:/Users/Heather/Dropbox/T_Files/Raw_FRUS_Data/Wisconsin_Copies_With_PageNumbers"
inputpath = "C:/Users/Heather/Dropbox/T_Files/Raw_FRUS_Data/FRUS_Wisconsin"
suffix=".txt"
for root, dirs, files in os.walk(inputpath):
for file in files:
file_path = os.path.join(root, file)
foldername=os.path.split(os.path.dirname(file_path))[1]
filebname=os.path.splitext(file)[0]
filename=filebname + "_"
f=open(os.path.join(root,file),'r')
data=f.readlines()
if data is None:
f.close()
else:
with open(os.path.join(root,file),'r') as f:
for line in f:
s=re.search(r'\d+',line)
if s:
pagenum=(s.group())
break
with open(os.path.join(outputpath, foldername,filename+pagenum+suffix), 'w') as f1:
with open(os.path.join(root,file),'r') as f:
for line in f:
f1.write(line)
I expect the result to be copies of the files in the input directory placed in the corresponding subfolder in the output directory, renamed with a suffix, such as "005_2", where 005 is the original file name, and 2 is the number the python code extracted from it.
The error I get seems to indicate that I am not looping through files correctly. I know the code for extracting the first number and renaming the file works because I tested it on a single file. But using os.walk to loop through multiple subfolders is not working, and I can't figure out what I am doing wrong. Here is the error:
File "<ipython-input-1-614e2851f16a>", line 23, in <module>
with open(os.path.join(outputpath, foldername,filename+pagenum+suffix), 'w') as f1:
IOError: [Errno 2] No such file or directory: 'C:/Users/Heather/Dropbox/T_Files/Raw_FRUS_Data/Wisconsin_Copies_With_PageNumbers\\FRUS_Wisconsin\\.dropbox_1473986809.txt'
Well, this isn't eloquent, but it worked
import os
import re
from glob import glob

# NOTE: both paths are abbreviated ("...", "..") exactly as in the original
# post; substitute the real locations before running.
folderlist = glob("C:\\...FRUS_Wisconsin*\\")
outputpath = "C:\\..\\Wisconsin_Copies_With_PageNumbers"
suffix = ".txt"

for folder in folderlist:
    # NOTE(review): index 7 depends on the depth of the real input path --
    # adjust it if the folders live somewhere else.
    foldername = str(folder.split('\\')[7])
    for root, dirs, files in os.walk(folder):
        for file in files:
            filename = os.path.splitext(file)[0] + "_"
            # Skip files whose name starts with '._'.
            if filename.startswith('._'):
                continue
            src = os.path.join(root, file)

            # Find the first run of digits in the file. Reset per file so a
            # file without any number cannot reuse the previous file's value.
            pagenum = None
            with open(src, 'r') as f:
                for line in f:
                    s = re.search(r'\d+', line)
                    if s:
                        pagenum = s.group()
                        break
            if pagenum is None:
                # No number to use as suffix; leave this file uncopied.
                continue

            # Copy the file into the matching output sub-folder (which must
            # already exist) under the name '<stem>_<number>.txt'.
            dest = os.path.join(outputpath, foldername, filename + pagenum + suffix)
            with open(dest, 'w') as f1, open(src, 'r') as f:
                for line in f:
                    f1.write(line)

Python file-IO and zipfile. Trying to loop through all the files in a folder and then loop through the texts in respective file using Python

Trying to extract all the zip files and giving the same name to the folder where all the files are gonna be.
Looping through all the files in the folder and then looping through the lines within those files to write on a different text file.
This is my code so far:
#!usr/bin/env python3
import glob
import os
import zipfile
zip_files = glob.glob('*.zip')
for zip_filename in zip_files:
dir_name = os.path.splitext(zip_filename)[0]
os.mkdir(dir_name)
zip_handler = zipfile.ZipFile(zip_filename, "r")
zip_handler.extractall(dir_name)
path = dir_name
fOut = open("Output.txt", "w")
for filename in os.listdir(path):
for line in filename.read().splitlines():
print(line)
fOut.write(line + "\n")
fOut.close()
This is the error that I encounter:
for line in filename.read().splitlines():
AttributeError: 'str' object has no attribute 'read'
You need to open the file and also join the path to the file, also using splitlines and then adding a newline to each line is a bit redundant:
# Concatenate every file in the extracted directory into Output.txt.
path = dir_name
with open("Output.txt", "w") as fOut:
    for filename in os.listdir(path):
        # join filename to path to avoid file not being found
        with open(os.path.join(path, filename)) as f:
            # Iterate over the open file object; looping over `filename`
            # would yield the characters of the name, not the file's lines.
            for line in f:
                fOut.write(line)
You should always use with to open your files as it will close them automatically. If the files are not large you can simply fOut.write(f.read()) and remove the loop.
You also set path = dir_name which means path will be set to whatever the last value of dir_name was in your first loop which may or may not be what you want. You can also use iglob to avoid creating a full list zip_files = glob.iglob('*.zip').

Reading/Writing Text from Multiple Files to Master File

In the code below I'm trying to open a series of text files and copy their contents into a single file. I'm getting an error on the "os.write(out_file, line)" in which it asks me for an integer. I haven't defined what "line" is, so is that the problem? Do I need to specify somehow that "line" is a text string from the in_file? Also, I open the out_file through each iteration of the for-loop. Is that bad? Should I open it once at the beginning? Thanks!
import os
import os.path
import shutil
# This is supposed to read through all the text files in a folder and
# copy the text inside to a master file.
# This defines the master file and gets the source directory
# for reading/writing the files in that directory to the master file.
src_dir = r'D:\Term Search'
out_file = r'D:\master.txt'
files = [(path, f) for path,_,file_list in os.walk(src_dir) for f in file_list]
# This for-loop should open each of the files in the source directory, write
# their content to the master file, and finally close the in_file.
for path, f_name in files:
open(out_file, 'a+')
in_file = open('%s/%s' % (path, f_name), 'r')
for line in in_file:
os.write(out_file, line)
close(file_name)
close(out_file)
print 'Finished'
You're doing it wrong:
You did:
open(out_file, 'a+')
but that doesn't save the reference as a variable, so you have no way to access the file object you just created. What you need to do:
out_file_handle = open(out_file, 'a+')
...
out_file_handle.write(line)
...
out_file_handle.close()
Or, more pythonically:
out_filename = r"D:\master.txt"
...
with open(out_filename, 'a+') as outfile:
for filepath in files:
with open(os.path.join(*filepath)) as infile:
outfile.write(infile.read())
print "finished"

Categories