delete the file if it has more then 300 lines - python

I have 500 text files in a folder. How do I find out how many lines each text file have, and delete them if it has more than 300 lines.
import os
path = "All_TSV_Files"
files = [file for file in os.listdir(path) if file.endswith(".txt")]
for file in files:
with open(os.path.join(path, file), 'r',encoding='utf-8') as f:
#find the lines of each file

Unfortunately you will have to read the file to see how many lines it has. In your case, you could skip it once you reach 300.
Sample code to get line count
file = open("sample.txt", "r")
line_count = 0
for line in file:
if line != "\n":
line_count += 1
file.close()
print(line_count)

import os
path = "All_TSV_Files"
files = [file for file in os.listdir(path) if file.endswith(".txt")]
for file in files:
with open(os.path.join(path, file), 'r',encoding='utf-8') as f:
if len(f.readlines())>300:
os.remove(os.path.join(path,file))

Like Saurabh said, if you wanna do it with Python seems like you'd have to iterate through all the lines in order to count them. This article demonstrates doing it with enumerate(). https://pynative.com/python-count-number-of-lines-in-file/
If you're on Unix you could use AWK since you've already imported os: awk 'END{print NR}' file

Related

Python - loop through subfolders and files in a directory without ignoring the subfolder

I have read all the stack exchange help files on looping through subfolders, as as well as the os documentation, but I am still stuck. I am trying to loop over files in subfolders, open each file, extract the first number in the first line, copy the file to a different subfolder(with the same name but in the output directory) and rename the file copy with the number as a suffix.
import os
import re
outputpath = "C:/Users/Heather/Dropbox/T_Files/Raw_FRUS_Data/Wisconsin_Copies_With_PageNumbers"
inputpath = "C:/Users/Heather/Dropbox/T_Files/Raw_FRUS_Data/FRUS_Wisconsin"
suffix=".txt"
for root, dirs, files in os.walk(inputpath):
for file in files:
file_path = os.path.join(root, file)
foldername=os.path.split(os.path.dirname(file_path))[1]
filebname=os.path.splitext(file)[0]
filename=filebname + "_"
f=open(os.path.join(root,file),'r')
data=f.readlines()
if data is None:
f.close()
else:
with open(os.path.join(root,file),'r') as f:
for line in f:
s=re.search(r'\d+',line)
if s:
pagenum=(s.group())
break
with open(os.path.join(outputpath, foldername,filename+pagenum+suffix), 'w') as f1:
with open(os.path.join(root,file),'r') as f:
for line in f:
f1.write(line)
I expect the result to be copies of the files in the input directory placed in the corresponding subfolder in the output directory, renamed with a suffix, such as "005_2", where 005 is the original file name, and 2 is the number the python code extracted from it.
The error I get seems to indicates that I am not looping through files correctly. I know the code for extracting the first number and renaming the file works because I tested it on a single file. But using os.walk to loop through multiple subfolders is not working, and I can't figure out what I am doing wrong. Here is the error:
File "<ipython-input-1-614e2851f16a>", line 23, in <module>
with open(os.path.join(outputpath, foldername,filename+pagenum+suffix), 'w') as f1:
IOError: [Errno 2] No such file or directory: 'C:/Users/Heather/Dropbox/T_Files/Raw_FRUS_Data/Wisconsin_Copies_With_PageNumbers\\FRUS_Wisconsin\\.dropbox_1473986809.txt'
Well, this isn't eloquent, but it worked
from glob import glob
folderlist=glob("C:\\...FRUS_Wisconsin*\\")
outputpath = "C:\\..\Wisconsin_Copies_With_PageNumbers"
for folder in folderlist:
foldername = str(folder.split('\\')[7])
for root, dirs, files in os.walk(folder):
for file in files:
filebname=os.path.splitext(file)[0]
filename=filebname + "_"
if not filename.startswith('._'):
with open(os.path.join(root,file),'r') as f:
for line in f:
s=re.search(r'\d+',line)
if s:
pagenum=(s.group())
break
with open(os.path.join(outputpath, foldername,filename+pagenum+suffix), 'w') as f1:
with open(os.path.join(root,file),'r') as f:
for line in f:
f1.write(line)

opening and reading all the files in a directory in python - python beginner

I'd like to read the contents of every file in a folder/directory and then print them at the end (I eventually want to pick out bits and pieces from the individual files and put them in a separate document)
So far I have this code
import os
path = 'results/'
fileList = os.listdir(path)
for i in fileList:
file = open(os.path.join('results/'+ i), 'r')
allLines = file.readlines()
print(allLines)
at the end I dont get any errors but it only prints the contents of the last file in my folder in a series of strings and I want to make sure its reading every file so I can then access the data I want from each file. I've looked online and I cant find where I'm going wrong. Is there any way of making sure the loop is iterating over all my files and reading all of them?
also i get the same result when I use
file = open(os.path.join('results/',i), 'r')
in the 5th line
Please help I'm so lost
Thanks!!
Separate the different functions of the thing you want to do.
Use generators wherever possible. Especially if there are a lot of files or large files
Imports
from pathlib import Path
import sys
Deciding which files to process:
source_dir = Path('results/')
files = source_dir.iterdir()
[Optional] Filter files
For example, if you only need files with extension .ext
files = source_dir.glob('*.ext')
Process files
def process_files(files):
for file in files:
with file.open('r') as file_handle :
for line in file_handle:
# do your thing
yield line
Save the lines you want to keep
def save_lines(lines, output_file=sys.std_out):
for line in lines:
output_file.write(line)
you forgot indentation at this line allLines = file.readlines()
and maybe you can try that :
import os
allLines = []
path = 'results/'
fileList = os.listdir(path)
for file in fileList:
file = open(os.path.join('results/'+ i), 'r')
allLines.append(file.read())
print(allLines)
You forgot to indent this line allLines.append(file.read()).
Because it was outside the loop, it only appended the file variable to the list after the for loop was finished. So it only appended the last value of the file variable that remained after the loop. Also, you should not use readlines() in this way. Just use read() instead;
import os
allLines = []
path = 'results/'
fileList = os.listdir(path)
for file in fileList:
file = open(os.path.join('results/'+ i), 'r')
allLines.append(file.read())
print(allLines)
This also creates a file containing all the files you wanted to print.
rootdir= your folder, like 'C:\\Users\\you\\folder\\'
import os
f = open('final_file.txt', 'a')
for root, dirs, files in os.walk(rootdir):
for filename in files:
data = open(full_name).read()
f.write(data + "\n")
f.close()
This is a similar case, with more features: Copying selected lines from files in different directories to another file

Copying all files of a directory to one text file in python

My intention is to copy the text of all my c# (and later aspx) files to one final text file, but it doesn't work.
For some reason, the "yo.txt" file is not created.
I know that the iteration over the files works, but I can't write the data into the .txt file.
The variable 'data' eventually does contain all text from the files . . .
*******Could it be connected to the fact that there are some non-ascii characters in the text of the c# files?
Here is my code:
import os
import sys
src_path = sys.argv[1]
os.chdir(src_path)
data = ""
for file in os.listdir('.'):
if os.path.isfile(file):
if file.split('.')[-1]=="cs" and (len(file.split('.'))==2 or len(file.split('.'))==3):
print "Copying", file
with open(file, "r") as f:
data += f.read()
print data
with open("yo.txt", "w") as f:
f.write(data)
If someone has an idea, it will be great :)
Thanks
You have to ensure the directory the file is created has sufficient write permissions, if not run
chmod -R 777 .
to make the directory writable.
import os
for r, d, f in os.walk(inputdir):
for file in f:
filelist.append(f"{r}\\{file}")
with open(outputfile, 'w') as outfile:
for f in filelist:
with open(f) as infile:
for line in infile:
outfile.write(line)
outfile.write('\n \n')

generalize python script to run on all files in a directory

I have the following python script:
with open('ein.csv', 'r') as istr:
with open('aus.csv', 'w') as ostr:
for line in istr:
line = line.rstrip('\n') + ',1'
print(line, file=ostr)
how could this be generalized to run on all the files in a directory and output a seperate file for each one?
maybe have a function like this:
for phyle in list_files(.):
with open(phyle, 'r') as istr:
with open('ausput_xyz.csv', 'w') as ostr:
for line in istr:
line = line.rstrip('\n') + ',1'
print(line, file=ostr)
def list_files(path):
# returns a list of names (with extension, without full path) of all files
# in folder path
files = []
for name in os.listdir(path):
if os.path.isfile(os.path.join(path, name)):
files.append(name)
return files
Just put the code in a function and call it:
def process(infilename):
outfilename = os.path.splitext(infilename)[0] + "-out.csv"
with open(infilename, 'r') as istr:
with open(outfilename, 'w') as ostr:
for line in istr:
line = line.rstrip('\n') + ',1'
print(line, file=ostr)
def process_files(path):
for name in os.listdir(path):
if os.path.isfile(os.path.join(path, name)):
process(name)
In a directory with input files "abc.csv", "xyz.csv" this code will create output files named "abc-out.csv" and "xyz-out.csv".
Note that os.listdir(path) is called just once during the execution, so the list of files to process will not include the newly created output files.
First off, as a more pythonic way for dealing with csv files you better to use csv module and use with statement for opening the files which will close the file object automatically at the end of the block. And use os.walk() function to iterate over the files and directories of a specific path:
import csv
import os
for path_name, dirs, files in os.walk('relative_path'):
for file_name in files:
with open(file_name) as inp,open('{}.csv'.format(file_name),'wb') as out:
spamwriter = csv.writer(out, delimiter=',')
for line in inp:
spamwriter.writerow(line) # or line.split() with a specific delimiter
Note that if your script is not in a same path with your files directory you can add the path to the leading of your file name while you want to open them.
with open(path_name+'/'+file_name),...

Find one file out of many containing a desired string in Python

I have a string like 'apples'. I want to find this string, and I know that it exists in one out of hundreds of files. e.g.
file1
file2
file3
file4
file5
file6
...
file200
All of these files are in the same directory. What is the best way to find which file contains this string using python, knowing that exactly one file contains it.
I have come up with this:
for file in os.listdir(directory):
f = open(file)
for line in f:
if 'apple' in f:
print "FOUND"
f.close()
and this:
grep = subprocess.Popen(['grep','-m1','apple',directory+'/file*'],stdout=subprocess.PIPE)
found = grep.communicate()[0]
print found
Given that the files are all in the same directory, we just get a current directory listing.
import os
for fname in os.listdir('.'): # change directory as needed
if os.path.isfile(fname): # make sure it's a file, not a directory entry
with open(fname) as f: # open file
for line in f: # process line by line
if 'apples' in line: # search for string
print 'found string in file %s' %fname
break
This automatically gets the current directory listing, and checks to make sure that any given entry is a file (not a directory).
It then opens the file and reads it line by line (to avoid problems with memory it doesn't read it in all at once) and looks for the target string in each line.
When it finds the target string it prints the name of the file.
Also, since the files are opened using with they are also automatically closed when we are done (or an exception occurs).
For simplicity, this assumes your files are in the current directory:
def whichFile(query):
for root,dirs,files in os.walk('.'):
for file in files:
with open(file) as f:
if query in f.read():
return file
for x in os.listdir(path):
with open(x) as f:
if 'Apple' in f.read():
#your work
break
a lazy-evaluation, itertools-based approach
import os
from itertools import repeat, izip, chain
gen = (file for file in os.listdir("."))
gen = (file for file in gen if os.path.isfile(file) and os.access(file, os.R_OK))
gen = (izip(repeat(file), open(file)) for file in gen)
gen = chain.from_iterable(gen)
gen = (file for file, line in gen if "apple" in line)
gen = set(gen)
for file in gen:
print file
Open your terminal and write this:
Case insensitive search
grep -i 'apple' /path/to/files
Recursive search (through all sub folders)
grep -r 'apple' /path/to/files

Categories