Locating files by name for copying elsewhere - python

New to Python...
I'm trying to have Python take a text file of file names (one name per row) and store them as strings, i.e.:
import os, shutil
files_to_find = []
with open('C:\\pathtofile\\lostfiles.txt') as fh:
    for row in fh:
        files_to_find.append(row.strip)
...in order to search for these files in directories and then copy any found files somewhere else...
for root, dirs, files in os.walk('D:\\'):
    for _file in files:
        if _file in files_to_find:
            print("Found file in: " + str(root))
            shutil.copy(os.path.abspath(root + '/' + _file), 'C:\\destination')
print("process completed")
Despite knowing these files exist, the script runs without any errors but without finding any files.
I added...
print (files_to_find)
...after the first block of code to see if it was finding anything, and saw screeds of "<built-in method strip of str object at 0x00000000037FC730>".
Does this tell me it's not successfully creating strings to compare file names against? Where am I going wrong?

Use glob to build a list of the files.
import os
import glob

def file_names(filepattern, dir):
    os.chdir(dir)
    file_list = []
    for line in sorted(glob.glob(filepattern)):
        line = line.split("/")
        line = line[-1]
        file_list.append(line)
    return file_list
Then loop over that list to compare against the file names you are looking for.
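As a side note on the symptom described in the question: a printout full of bound methods suggests row.strip was appended without being called, so the list never contains plain strings. A minimal sketch of that fix, keeping the question's paths as placeholders:
import os, shutil

files_to_find = []
with open('C:\\pathtofile\\lostfiles.txt') as fh:
    for row in fh:
        # call strip() so the stripped string, not the bound method, is stored
        files_to_find.append(row.strip())

for root, dirs, files in os.walk('D:\\'):
    for _file in files:
        if _file in files_to_find:
            print("Found file in: " + str(root))
            shutil.copy(os.path.join(root, _file), 'C:\\destination')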


Locating multiple files in large dataset in python

I have a large repository of image files (~2 million, .jpg) with individual ids spread in multiple sub-dirs and I'm trying to locate and copy each image on a list containing a ~1,000 subset of these ids.
I'm still very new to Python so my first thought was to use os.walk to iterate through the 1k subset for each file, to see if any within the subset matched the id. This works, at least theoretically, but it seems incredibly slow at something like 3-5 images a second. The same seems to be the case for running through all of the files looking for one id at a time.
import shutil
import os
import csv

# Wander to Folder, Identify Files
for root, dirs, files in os.walk(ImgFolder):
    for file in files:
        fileName = ImgFolder + str(file)
        # For each file, check dictionary for match
        with open(DictFolder, 'r') as data1:
            csv_dict_reader = csv.DictReader(data1)
            for row in csv.DictReader(data1):
                img_id_line = row['id_line']
                isIdentified = (img_id_line in fileName) and ('.jpg' in fileName)
                # If id_line == file ID, copy file
                if isIdentified:
                    src = fileName + '.jpg'
                    dst = dstFolder + '.jpg'
                    shutil.copyfile(src, dst)
                else:
                    continue
I've been looking at trying to automate query searches instead, but the data is contained on a NAS and I have no easy way of indexing the files to make querying faster. The machine I'm running the code on is Windows 10, so I can't use the Ubuntu find command, which I gather is considerably better at this task.
Any way to speed up the process would be greatly appreciated!
Here's a couple of scripts that should do what you're looking for.
index.py
This script uses pathlib to walk through directories searching for files with a given extension. It will write a TSV file with two columns, filename and filepath.
import argparse
from pathlib import Path


def main(args):
    for arg, val in vars(args).items():
        print(f"{arg} = {val}")
    ext = "*." + args.ext
    index = {}
    with open(args.output, "w") as fh:
        for file in Path(args.input).rglob(ext):
            index[file.name] = file.resolve()
            fh.write(f"{file.name}\t{file.resolve()}\n")


if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument(
        "input",
        help="Top level folder which will be recursively "
        "searched for files ending with the value "
        "provided to `--ext`",
    )
    p.add_argument("output", help="Output file name for the index tsv file")
    p.add_argument(
        "--ext",
        default="jpg",
        help="Extension to search for. Don't include `*` or `.`",
    )
    main(p.parse_args())
search.py
This script loads the index (the output from index.py) into a dictionary, loads the CSV file into a dictionary, and then, for each id_line, looks up the filename in the index and attempts to copy it to the output folder.
import argparse
import csv
import shutil
from collections import defaultdict
from pathlib import Path


def main(args):
    for arg, val in vars(args).items():
        print(f"{arg} = {val}")

    if not Path(args.dest).is_dir():
        Path(args.dest).mkdir(parents=True)

    with open(args.index) as fh:
        index = dict(l.strip().split("\t", 1) for l in fh)
    print(f"Loaded {len(index):,} records")

    csv_dict = defaultdict(list)
    with open(args.csv) as fh:
        reader = csv.DictReader(fh)
        for row in reader:
            for (k, v) in row.items():
                csv_dict[k].append(v)
    print(f"Searching for {len(csv_dict['id_line']):,} files")

    copied = 0
    for file in csv_dict["id_line"]:
        if file in index:
            shutil.copy2(index[file], args.dest)
            copied += 1
        else:
            print(f"!! File {file!r} not found in index")
    print(f"Copied {copied} files to {args.dest}")


if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("index", help="Index file from `index.py`")
    p.add_argument("csv", help="CSV file with target filenames")
    p.add_argument("dest", help="Target folder to copy files to")
    main(p.parse_args())
How to run this:
python index.py --ext "jpg" "C:\path\to\image\folder" "index.tsv"
python search.py "index.tsv" "targets.csv" "C:\path\to\output\folder"
I would try this on one/two folders first to check that it has the expected results.
Under the assumption that file names are unique and file locations don't change, it is possible to create a dictionary that allows searching for a file's path in O(1) time. Building the dictionary takes some time, but you can pickle it to disk so you only have to run it once.
A simple script to create the dictionary:
from pathlib import Path
import pickle

root = Path('path/to/root/folder')

# file extensions to index
extensions = {'.jpg', '.png'}

# iterate over the whole `root` directory tree and index by file name
image = {file.stem: file for file in root.rglob('*.*') if file.suffix in extensions}

# save the index on your computer for further use
index_path = Path('path/to/index.pickle')
with index_path.open('wb') as file:
    pickle.dump(image, file, pickle.HIGHEST_PROTOCOL)
An example of loading the dictionary:
from pathlib import Path
import pickle

index_path = Path('path/to/index.pickle')
with index_path.open('rb') as file:
    image = pickle.load(file)
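To connect this back to the question, here is a minimal sketch of using the loaded index to copy the listed files, assuming the ids in the CSV correspond to the file stems used as keys above; the column name 'id_line' comes from the question, the other paths are placeholders:
import csv
import shutil
from pathlib import Path

dest = Path('path/to/destination')
dest.mkdir(parents=True, exist_ok=True)

with open('path/to/targets.csv', newline='') as fh:
    for row in csv.DictReader(fh):
        file_id = row['id_line']
        if file_id in image:          # O(1) lookup in the pickled index built above
            shutil.copy2(image[file_id], dest)
        else:
            print(f"{file_id} not found in index")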

How to open and read text files in a folder python

I have a folder which has text files in it. I want to be able to put in a path to this folder and have Python go through it, open each file, and append its content to a list.
import os

folderpath = "/Users/myname/Downloads/files/"
inputlst = [os.listdir(folderpath)]

filenamelist = []

for filename in os.listdir(folderpath):
    if filename.endswith(".txt"):
        filenamelist.append(filename)

print(filenamelist)
So far this outputs:
['test1.txt', 'test2.txt', 'test3.txt', 'test4.txt', 'test5.txt', 'test6.txt', 'test7.txt', 'test8.txt', 'test9.txt', 'test10.txt']
I want the code to take each of these files, open them, and put all of their content into a single huge list, not just print the file names. Is there any way to do this?
You should use open() for this; its documentation describes the more advanced options.
Anyway, here is one way you can do it:
import os

folderpath = r"yourfolderpath"
inputlst = [os.listdir(folderpath)]

filenamecontent = []

for filename in os.listdir(folderpath):
    if filename.endswith(".txt"):
        f = open(os.path.join(folderpath, filename), 'r')
        filenamecontent.append(f.read())

print(filenamecontent)
If you are using Python 3, you can use:
for filename in filename_list:
    with open(filename, "r") as file_handler:
        data = file_handler.read()
Please keep in mind that you will need the full (either relative or absolute) path to your file in filename.
This way, your file handler will be automatically closed when you leave the with scope.
More information around here : https://docs.python.org/fr/3/library/functions.html#open
On a side note, in order to list files, you might want to have a look at glob and use:
filename_list = glob.glob("/path/to/files/*.txt")
You can use fileinput
Code:
import fileinput
import os

folderpath = "your_path_to_directory_where_files_are_stored"
# Collect all the .txt files in the folder (full paths so fileinput can open them)
file_list = [os.path.join(folderpath, a) for a in os.listdir(folderpath) if a.endswith(".txt")]
get_all_files = fileinput.input(file_list)
with open("alldata.txt", 'a+') as writefile:
    for line in get_all_files:
        writefile.write(line)
The above code reads all the data from the .txt files in the specified directory (folderpath) and stores it in alldata.txt. So the long list you wanted is now stored in a .txt file; if you only want the list, you can remove the write step.
Links:
https://docs.python.org/3/library/fileinput.html
https://docs.python.org/3/library/functions.html#open
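For completeness, a minimal sketch of collecting every file's content into one list with pathlib, reusing the folder path from the question:
from pathlib import Path

folderpath = Path("/Users/myname/Downloads/files/")

# read every .txt file in the folder into a single list of strings
contents = [p.read_text() for p in sorted(folderpath.glob("*.txt"))]
print(contents)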

opening and reading all the files in a directory in python - python beginner

I'd like to read the contents of every file in a folder/directory and then print them at the end (I eventually want to pick out bits and pieces from the individual files and put them in a separate document)
So far I have this code
import os
path = 'results/'
fileList = os.listdir(path)
for i in fileList:
    file = open(os.path.join('results/' + i), 'r')
allLines = file.readlines()
print(allLines)
At the end I don't get any errors, but it only prints the contents of the last file in my folder as a series of strings, and I want to make sure it's reading every file so I can then access the data I want from each file. I've looked online and I can't find where I'm going wrong. Is there any way of making sure the loop is iterating over all my files and reading all of them?
Also, I get the same result when I use
file = open(os.path.join('results/', i), 'r')
in the 5th line.
Please help, I'm so lost.
Thanks!!
Separate the different functions of the thing you want to do.
Use generators wherever possible, especially if there are a lot of files or large files.
Imports
from pathlib import Path
import sys
Deciding which files to process:
source_dir = Path('results/')
files = source_dir.iterdir()
[Optional] Filter files
For example, if you only need files with extension .ext
files = source_dir.glob('*.ext')
Process files
def process_files(files):
    for file in files:
        with file.open('r') as file_handle:
            for line in file_handle:
                # do your thing
                yield line
Save the lines you want to keep
def save_lines(lines, output_file=sys.stdout):
    for line in lines:
        output_file.write(line)
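A minimal sketch of wiring these pieces together; it simply calls the functions defined above, so adjust the folder and extension to your case:
from pathlib import Path
import sys

source_dir = Path('results/')
files = source_dir.glob('*.txt')   # decide which files to process
lines = process_files(files)       # lazily stream every line of every file
save_lines(lines, sys.stdout)      # write the kept lines to stdout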
You forgot indentation at the line allLines = file.readlines(), and maybe you can try this:
import os

allLines = []
path = 'results/'
fileList = os.listdir(path)
for i in fileList:
    file = open(os.path.join('results/' + i), 'r')
    allLines.append(file.read())
print(allLines)
You forgot to indent the line allLines = file.readlines().
Because it was outside the loop, it only read from the file variable after the for loop had finished, so it only picked up the last file the loop left in file. Also, you should not use readlines() in this way; just use read() instead:
import os

allLines = []
path = 'results/'
fileList = os.listdir(path)
for i in fileList:
    file = open(os.path.join('results/' + i), 'r')
    allLines.append(file.read())
print(allLines)
This also creates a file containing the contents of all the files you wanted to print. Set rootdir to your folder, like 'C:\\Users\\you\\folder\\':
import os

f = open('final_file.txt', 'a')
for root, dirs, files in os.walk(rootdir):
    for filename in files:
        full_name = os.path.join(root, filename)
        data = open(full_name).read()
        f.write(data + "\n")
f.close()
This is a similar case, with more features: Copying selected lines from files in different directories to another file

how can i use fileinput to edit multiple files?

I am using os.walk in python 2.7 to open multiple files, then, add all lines of interest of those files to a list. Later I'd want to edit those lines with fileinput and close it. How can I achieve this? Using the code below is how I'm opening the files:
import os
import fnmatch
import fileinput

lines = []

def openFiles():
    for root, dirs, files in os.walk('/home/test1/'):
        for lists in fnmatch.filter(files, "*.txt"):
            filepath = os.path.join(root, lists)
            print filepath
            with open(filepath, "r") as sources:  # opens 8 files and reads their lines
                #edit = fileinput.input(filepath, inplace=1)
                for line in sources:
                    if line.startswith('xe'):
                        lines.append(line)
Then later, for each lines that start with xe, I'd like to add a # in front of it then close that file. I'd like to do that in a different function.
Here's the way I do it, adding to your code:
import os
import fnmatch
import fileinput

def openFiles(dir):
    filePaths = []
    for root, dirs, files in os.walk(dir):
        for textFile in fnmatch.filter(files, "*.txt"):
            filepath = os.path.join(root, textFile)
            filePaths.append(filepath)
    return filePaths

def prefixLines(filepaths, chartoPrefix, prefixWith):
    res = ''
    for filepath in filepaths:
        # Read file
        with open(filepath, 'r') as f:
            for line in f:
                if line.startswith(chartoPrefix):
                    res += prefixWith + line
                else:
                    res += line
        # Write to file
        with open(filepath, 'w') as f:
            f.write(res)
        res = ''  # Reset res

prefixLines(openFiles(r'/home/test1/'), 'xe', '#')
prefixLines suffers from some shortcomings:
- Because we read all the lines of a file and store them in res, we may run out of memory for large files.
- If the programmer forgot to indent res = '' in the right block, or if res was completely omitted, and the code ran on actual files that the user needs, you'd end up writing the contents of the previously read file into the next file, and the last file would contain the contents of all the read files. That's why you should use this code in a testing environment or use it cautiously.
This code only serves to demonstrate how you could achieve the desired effect: prefixing file lines that start with a given string with another string. Therefore, a slight improvement of this code is recommended. For example, instead of reading all the contents of the file and storing them in res, you could simply save the numbers of the lines that need to be prefixed, eliminating the need to load all the data into memory. enumerate could be helpful to return the line number; it returns an iterable in 2.7. By obviating res we not only save memory, but also eliminate the shortcoming in bullet 2.
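A minimal sketch of that kind of improvement, streaming each file through a temporary file so only one line is held in memory at a time (written for Python 3, although the thread targets 2.7; the function name is hypothetical):
import os
import tempfile

def prefix_lines(filepath, prefix_char, prefix_with):
    """Prefix matching lines while holding only one line in memory at a time."""
    dir_name = os.path.dirname(os.path.abspath(filepath))
    with open(filepath, 'r') as src, tempfile.NamedTemporaryFile(
            'w', dir=dir_name, delete=False) as tmp:
        for line in src:
            tmp.write(prefix_with + line if line.startswith(prefix_char) else line)
    os.replace(tmp.name, filepath)  # swap the rewritten file into place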
I ended up doing it this way. But I'm using classes in my main code, so it's split into 2 functions instead of one. In my main code I used a list to hold all the file paths and fileinput to open each filepath from the list, like this: for line in fileinput.FileInput(pathlist, inplace=1): do something. I do thank @direprobs for her answer, as she shed some light on how I'm supposed to do this.
import fnmatch
import fileinput
import os
import sys

def openFiles():
    for dirpath, dirs, files in os.walk('/home/test1/'):
        for filename in fnmatch.filter(files, "*.txt"):
            filepaths = os.path.join(dirpath, filename)
            for line in fileinput.FileInput(filepaths, inplace=1):
                if line.startswith("xe"):
                    add = "# {}".format(line)
                    line = line.replace(line, add)
                sys.stdout.write(line)
            fileinput.close()

openFiles()

pipe one file at a time python

I have more than 10,000 JSON files which I have to convert to CSV for further processing. I am using the following code:
import json
import time
import os
import csv
import fnmatch

tweets = []
count = 0

search_folder = ('/Volumes/Transcend/Axiom/IPL/test/')
for root, dirs, files in os.walk(search_folder):
    for file in files:
        pathname = os.path.join(root, file)
for file in open(pathname):
    try:
        tweets.append(json.loads(file))
    except:
        pass
count = count + 1
This iterates over just one file and stops. I tried adding while True: before for file in open(pathname): and it just doesn't stop, nor does it create the csv files. I want to read one file at a time, convert it to csv, then move on to the next file. I tried adding count = count + 1 at the end of the file after completing the csv conversion. Still it stops after converting the first file. Can someone help please?
Your indentation is off; you need to put the second for loop inside the first one.
Separate from your main problem, you should use a with statement to open the file. Also, you were reusing the variable name file, which you shouldn't be using anyway since it's the name of a built-in. I also made a few other minor edits.
import json
import os

tweets = []
count = 0

search_folder = '/Volumes/Transcend/Axiom/IPL/test/'
for root, dirs, filenames in os.walk(search_folder):
    for filename in filenames:
        pathname = os.path.join(root, filename)
        with open(pathname, 'r') as infile:
            for line in infile:
                try:
                    tweets.append(json.loads(line))
                except:  # Don't use bare except clauses
                    pass
        count += 1
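The question also asks about converting the parsed files to CSV, which the snippet above stops short of. A minimal sketch using csv.DictWriter, assuming each tweet is a flat JSON object; the field names below are purely illustrative and not taken from the question:
import csv

fieldnames = ['id', 'text']  # hypothetical column names; use whichever keys you need
with open('tweets.csv', 'w', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames, extrasaction='ignore')
    writer.writeheader()
    for tweet in tweets:
        writer.writerow(tweet)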
