I am using Python 2.6.
I am reading in n input files, using loops to process the data in each file, and writing that information to a single output file.
The input files are named inputfile_date_time.h5, where the date/time is different for each input file.
I want to name the output file outputfile_firstdate_firsttime_lastdate_lasttime.pkt, where firstdate_firsttime is the date and time of the input file that comes first in the sequence of n files, and lastdate_lasttime is the date and time of the input file that comes last in that sequence.
My code is currently set up as follows:
import os
from glob import glob
from os.path import basename
import numpy
import h5py

# set location/directory of input files
inputdir = "/Location of directory that contains files"

# create output file
outputfilename = 'outputfilename'
outputfile = "/Location to put output file/" + basename(outputfilename)[:-4] + ".pkt"
ofile = open(outputfile, 'wb')

for path, dirs, files in os.walk(inputdir):
    files_list = glob(os.path.join(inputdir, '*.h5'))
    for file in files_list:
        f = h5py.File(file, 'r')
        # for loop performing the necessary task on the information in the files
        f.close()

# print that the output file was written
print "Wrote " + outputfile

# close output file
ofile.close()
This code creates an output file called outputfile.pkt
How can I adjust this code to make the changes I previously stated?
time.strptime can parse any time format you want, and time.strftime can generate any time format you want. You should read (and possibly parse) the date/time portion of every filename, then use min(...) and max(...) to get the smallest and the largest.
For example, if the filenames look like foo2014-06-16bar.txt and hello2014-06-17world, then here is how to parse them:
import re
files = ['foo2014-06-16bar.txt', 'hello2014-06-17world']
dates = [re.search(r'(?:19|20)\d{2}-\d{2}-\d{2}', f).group() for f in files]
print min(dates) #: 2014-06-16
print max(dates) #: 2014-06-17
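Since the question mentions both functions, here is a minimal sketch of the strptime/strftime round trip; the format strings are assumptions matching the example names above:
import time

date_strings = ['2014-06-16', '2014-06-17']

# parse each string into a struct_time so the values compare chronologically
parsed = [time.strptime(d, '%Y-%m-%d') for d in date_strings]

# format the earliest and latest back into whatever layout you need
print time.strftime('%Y%m%d', min(parsed))  # 20140616
print time.strftime('%Y%m%d', max(parsed))  # 20140617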
Here is how to build files using os.walk:
import os
inputdir = "/Location of directory that contains files"
files = []
for dirpath, dirnames, filenames in os.walk(inputdir):
    for filename in filenames:
        if filename.endswith('.h5'):
            pathname = os.path.join(dirpath, filename)
            files.append(pathname)
print files
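Putting the pieces together for the original question, a rough sketch of building the output name might look like this; it assumes the input names really look like inputfile_YYYYMMDD_HHMMSS.h5, so adjust the regex to the actual date/time layout:
import os
import re

# pull the date_time part out of every collected path, e.g. '20140616_171738'
stamps = [re.search(r'_(\d{8}_\d{6})\.h5$', os.path.basename(f)).group(1)
          for f in files]

# zero-padded stamps sort chronologically as plain strings
first, last = min(stamps), max(stamps)

outputfile = "/Location to put output file/outputfile_%s_%s.pkt" % (first, last)
print outputfile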
I have a large repository of image files (~2 million, .jpg) with individual ids spread in multiple sub-dirs and I'm trying to locate and copy each image on a list containing a ~1,000 subset of these ids.
I'm still very new to Python so my first thought was to use os.walk to iterate through the 1k subset for each file, to see if any within the subset matched the id. This works, at least theoretically, but it seems incredibly slow at something like 3-5 images a second. The same seems to be the case for running through all of the files looking for one id at a time.
import shutil
import os
import csv

# Wander to Folder, Identify Files
for root, dirs, files in os.walk(ImgFolder):
    for file in files:
        fileName = ImgFolder + str(file)
        # For each file, check dictionary for match
        with open(DictFolder, 'r') as data1:
            csv_dict_reader = csv.DictReader(data1)
            for row in csv.DictReader(data1):
                img_id_line = row['id_line']
                isIdentified = (img_id_line in fileName) and ('.jpg' in fileName)
                # If id_line == file ID, copy file
                if isIdentified:
                    src = fileName + '.jpg'
                    dst = dstFolder + '.jpg'
                    shutil.copyfile(src, dst)
                else:
                    continue
I've been looking at trying to automate query searches instead, but the data is contained on a NAS and I have no easy way of indexing the files to make querying faster. The machine I'm running the code through is a W10 and thus I can't use the Ubuntu Find method which I gather is considerably better at this task.
Any way to speed up the process would be greatly appreciated!
Here's a couple of scripts that should do what you're looking for.
index.py
This script uses pathlib to walk through directories searching for files with a given extension. It will write a TSV file with two columns, filename and filepath.
import argparse
from pathlib import Path


def main(args):
    for arg, val in vars(args).items():
        print(f"{arg} = {val}")

    ext = "*." + args.ext
    index = {}

    with open(args.output, "w") as fh:
        for file in Path(args.input).rglob(ext):
            index[file.name] = file.resolve()
            fh.write(f"{file.name}\t{file.resolve()}\n")


if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument(
        "input",
        help="Top level folder which will be recursively "
        "searched for files ending with the value "
        "provided to `--ext`",
    )
    p.add_argument("output", help="Output file name for the index tsv file")
    p.add_argument(
        "--ext",
        default="jpg",
        help="Extension to search for. Don't include `*` or `.`",
    )

    main(p.parse_args())
search.py
This script loads the index (the output from index.py) into a dictionary, loads the CSV file into a dictionary, and then, for each id_line, looks up the filename in the index and attempts to copy the file to the output folder.
import argparse
import csv
import shutil
from collections import defaultdict
from pathlib import Path


def main(args):
    for arg, val in vars(args).items():
        print(f"{arg} = {val}")

    if not Path(args.dest).is_dir():
        Path(args.dest).mkdir(parents=True)

    with open(args.index) as fh:
        index = dict(l.strip().split("\t", 1) for l in fh)
    print(f"Loaded {len(index):,} records")

    csv_dict = defaultdict(list)
    with open(args.csv) as fh:
        reader = csv.DictReader(fh)
        for row in reader:
            for (k, v) in row.items():
                csv_dict[k].append(v)

    print(f"Searching for {len(csv_dict['id_line']):,} files")

    copied = 0
    for file in csv_dict["id_line"]:
        if file in index:
            shutil.copy2(index[file], args.dest)
            copied += 1
        else:
            print(f"!! File {file!r} not found in index")

    print(f"Copied {copied} files to {args.dest}")


if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("index", help="Index file from `index.py`")
    p.add_argument("csv", help="CSV file with target filenames")
    p.add_argument("dest", help="Target folder to copy files to")

    main(p.parse_args())
How to run this:
python index.py --ext "jpg" "C:\path\to\image\folder" "index.tsv"
python search.py "index.tsv" "targets.csv" "C:\path\to\output\folder"
I would try this on one/two folders first to check that it has the expected results.
Under the assumption that file names are unique and file locations don't change, you can build a dictionary that allows searching for a file path in O(1) time. Creating the dictionary takes a while, but you can pickle it to disk so you only have to build it once.
A simple script to create the dictionary:
from pathlib import Path
import pickle
root = Path('path/to/root/folder')
# file extensions to index
extensions = {'.jpg', '.png'}
# iterating over whole `root` directory tree and indexing by file name
image = {file.stem: file for file in root.rglob('*.*') if file.suffix in extensions}
# saving the index on your computer for further use
index_path = Path('path/to/index.pickle')
with index_path.open('wb') as file:
    pickle.dump(image, file, pickle.HIGHEST_PROTOCOL)
An example of loading the dictionary:
from pathlib import Path
import pickle
index_path = Path('path/to/index.pickle')
with index_path.open('rb') as file:
    image = pickle.load(file)
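Once the index is loaded, each of the roughly 1,000 target ids becomes a single dictionary lookup. A minimal sketch of the copy step (target_ids and the destination path are placeholder names, not from the original post):
import shutil
from pathlib import Path

dest_folder = Path('path/to/destination')
dest_folder.mkdir(parents=True, exist_ok=True)

# target_ids would come from your CSV's id_line column
target_ids = ['img_0001', 'img_0002']

for img_id in target_ids:
    path = image.get(img_id)  # O(1) lookup in the unpickled index
    if path is not None:
        shutil.copy2(path, dest_folder / path.name)
    else:
        print(f'{img_id} not found in index')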
I am trying to create a Python 3 program that runs through all .sql files in a specific directory, applies a regex that adds ; after a certain instance, and writes the changed files to a separate directory under the same file names.
So, if I had file1.sql and file2.sql in the "/home/files" directory, then after I run the program those two files should be written to "/home/new_files" without changing the content of the original files.
Here is my code:
import glob
import re
folder_path = "/home/files/d_d"
file_pattern = "/*sql"
folder_contents = glob.glob(folder_path + file_pattern)
for file in folder_contents:
    print("Checking", file)

for file in folder_contents:
    read_file = open(file, 'rt', encoding='latin-1').read()
    #words=read_file.split()
    with open(read_file, "w") as output:
        output.write(re.sub(r'(TBLPROPERTIES \(.*?\))', r'\1;', f, flags=re.DOTALL))
I receive an error of File name too long: "CREATE EXTERNAL TABLe", and I am also not sure where I would put my output path (/home/files/new_dd) in my code.
Any ideas or suggestions?
With read_file = open(file, 'rt', encoding='latin-1').read(), the whole content of the file was being used as the file name to open for writing. The code below iterates over the file names found with the glob.glob pattern, opens each for reading, processes the data, and opens a file in the output folder for writing (assuming a folder newfile_sqls already exists;
if not, it raises FileNotFoundError: [Errno 2] No such file or directory).
import glob
import os
import re
folder_path = "original_sqls"
#original_sqls\file1.sql, original_sqls\file2.sql, original_sqls\file3.sql
file_pattern = "*sql"
# new/modified files folder
output_path = "newfile_sqls"
folder_contents = glob.glob(os.path.join(folder_path,file_pattern))
# iterate over file names
for file_ in [os.path.basename(f) for f in folder_contents]:
    # open to read
    with open(os.path.join(folder_path, file_), "r") as inputf:
        read_file = inputf.read()

    # use variable 'read_file' here
    tmp = re.sub(r'(TBLPROPERTIES \(.*?\))', r'\1;', read_file, flags=re.DOTALL)

    # open to write to (previously created) new folder
    with open(os.path.join(output_path, file_), "w") as output:
        output.writelines(tmp)
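To avoid the FileNotFoundError mentioned above, you could also create the output folder up front rather than relying on it already existing; a small addition along these lines (Python 3):
import os

output_path = "newfile_sqls"

# create the output folder if it does not exist yet; no error if it already does
os.makedirs(output_path, exist_ok=True)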
I have a directory /directory/some_directory/ and in that directory I have a set of files. Those files are named in the following format: <letter>-<number>_<date>-<time>_<dataidentifier>.log, for example:
ABC1-123_20162005-171738_somestring.log
DE-456_20162005-171738_somestring.log
ABC1-123_20162005-153416_somestring.log
FG-1098_20162005-171738_somestring.log
ABC1-123_20162005-031738_somestring.log
DE-456_20162005-171738_somestring.log
I would like to read a subset of those files (for example, only the files named ABC1-123*.log) and export all their contents to a single CSV file (for example, output.csv), that is, a CSV file that holds the data from the individual files collectively.
The code that I have written so far:
#!/usr/bin/env python
import os

file_directory = os.getcwd()
m_class = "ABC1"
m_id = "123"
device = m_class + "-" + m_id

for data_file in sorted(os.listdir(file_directory)):
    if str(device) + "*" in os.listdir(file_directory):
        print data_file
I don't know how to read only a subset of filtered files, nor how to export them to a common CSV file.
How can I achieve this?
Just use the re lib to match the file name pattern, and the csv lib to export.
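A minimal sketch of that idea for the ABC1-123 example (the regex and the one-value-per-row layout are assumptions, since the log format isn't shown):
import csv
import os
import re

# match names like ABC1-123_20162005-171738_somestring.log
pattern = re.compile(r'^ABC1-123_.*\.log$')

with open('output.csv', 'wb') as out:
    writer = csv.writer(out)
    for name in sorted(os.listdir('.')):
        if pattern.match(name):
            with open(name, 'r') as log_file:
                for line in log_file:
                    # one log line per CSV row; adjust if the lines are already comma-separated
                    writer.writerow([line.rstrip('\n')])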
Only a few adjustments needed; you were close:
filesFromDir = os.listdir(os.getcwd())
fileList = [file for file in filesFromDir if file.startswith(device)]

f = open("LogOutput.csv", "ab")
for file in fileList:
    #print "Processing", file
    with open(file, "rb") as log_file:
        txt = log_file.read()
        f.write(txt)
        f.write("\n")
f.close()
Your question could be better stated; based on your current code snippet, I'll assume that you want to:
1. Filter files in a directory based on a glob pattern.
2. Concatenate their contents to a file named output.csv.
In Python you can achieve (1.) by using glob to list filenames.
import glob
for filename in glob.glob('foo*bar'):
    print filename
That would print all files starting with foo and ending with bar in
the current directory.
For (2.) you just read the file and write its content to your desired
output, using python's open() builtin function:
open('filename', 'r')
(Using 'r' as the mode you are asking python to open the file for
"reading", using 'w' you are asking python to open the file for
"writing".)
The final code would look like the following:
import glob
import sys
device = 'ABC1-123'
with open('output.csv', 'w') as output:
    for filename in glob.glob(device + '*'):
        with open(filename, 'r') as input:
            output.write(input.read())
You can use the os module to list the files.
import os
files = os.listdir(os.getcwd())
m_class = "ABC1"
m_id = "123"
device = m_class + "-" + m_id
file_extension = ".log"
# filter the files by their extension and the starting name
files = [x for x in files if x.startswith(device) and x.endswith(file_extension)]
f = open("output.csv", "a")
for file in files:
    with open(file, "r") as data_file:
        f.write(data_file.read())
        f.write(",\n")
f.close()
New to Python...
I'm trying to have Python take a text file of file names (a new name on each row) and store them as strings...
i.e
import os, shutil
files_to_find = []
with open('C:\\pathtofile\\lostfiles.txt') as fh:
    for row in fh:
        files_to_find.append(row.strip)
...in order to search for these files in directories and then copy any found files somewhere else...
for root, dirs, files in os.walk('D:\\'):
    for _file in files:
        if _file in files_to_find:
            print ("Found file in: " + str(root))
            shutil.copy(os.path.abspath(root + '/' + _file), 'C:\\destination')
print ("process completed")
Despite knowing these files exist, the script runs without any errors but without finding any files.
I added...
print (files_to_find)
...after the first block of code to see if it was finding anything, and saw screeds of "<built-in method strip of str object at 0x00000000037FC730>".
Does this tell me it's not successfully creating strings to compare file names against? I wonder where I'm going wrong?
Use a list to collect the file names.
import os
import sys
import glob
import shutil
def file_names(filepattern, dir):
    os.chdir(dir)
    count = len(glob.glob(filepattern))
    file_list = []
    for line in sorted(glob.glob(filepattern)):
        line = line.split("/")
        line = line[-1]
        file_list.append(line)
    return file_list
Then loop over that list to compare the file names.
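For the original issue, note that row.strip (without parentheses) appends the method object itself rather than the stripped string, which is why nothing ever matches. A small sketch of the corrected comparison, assuming the same paths as in the question:
import os
import shutil

files_to_find = set()
with open('C:\\pathtofile\\lostfiles.txt') as fh:
    for row in fh:
        # call strip() so the stripped string is stored, not the method object
        files_to_find.add(row.strip())

for root, dirs, files in os.walk('D:\\'):
    for _file in files:
        if _file in files_to_find:
            shutil.copy(os.path.join(root, _file), 'C:\\destination')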
I want to write a program for this: in a folder I have n files; read the first file, perform some operation, then store the result in a separate file. Then read the second file, perform the operation again, and save the result in a new second file. Do the same for all n files, so the program reads the files one by one and stores the results of each file separately. Please give examples of how I can do this.
I think what you're missing is how to retrieve all the files in that directory.
To do so, use the glob module.
Here is an example which will duplicate all the files with extension *.txt to files with extension *.out
import glob
list_of_files = glob.glob('./*.txt')  # create the list of files
for file_name in list_of_files:
    FI = open(file_name, 'r')
    FO = open(file_name.replace('txt', 'out'), 'w')
    for line in FI:
        FO.write(line)
    FI.close()
    FO.close()
import sys

# argv is your commandline arguments, argv[0] is your program name, so skip it
for n in sys.argv[1:]:
    print(n)  # print out the filename we are currently processing
    input = open(n, "r")
    output = open(n + ".out", "w")
    # do some processing
    input.close()
    output.close()
Then call it like:
./foo.py bar.txt baz.txt
You may find the fileinput module useful. It is designed for exactly this problem.
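A minimal sketch of what that could look like, reading every file named on the command line (the processing step is left as a placeholder):
import fileinput

# iterate over every line of every file passed on the command line
for line in fileinput.input():
    if fileinput.isfirstline():
        print("processing %s" % fileinput.filename())
    # do some processing with `line` here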
I've just learned of the os.walk() command recently, and it may help you here.
It allows you to walk down a directory tree structure.
import os

OUTPUT_DIR = 'C:\\RESULTS'

for path, dirs, files in os.walk('.'):
    for file in files:
        read_f = open(os.path.join(path, file), 'r')
        write_f = open(os.path.join(OUTPUT_DIR, file), 'w')
        # Do stuff
Combined answer incorporating directory or specific list of filenames arguments:
import sys
import os.path
import glob


def processFile(filename):
    fileHandle = open(filename, "r")
    for line in fileHandle:
        # do some processing
        pass
    fileHandle.close()


def outputResults(filename):
    output_filemask = "out"
    fileHandle = open("%s.%s" % (filename, output_filemask), "w")
    # do some processing
    fileHandle.write('processed\n')
    fileHandle.close()


def processFiles(args):
    input_filemask = "log"
    directory = args[1]
    if os.path.isdir(directory):
        print "processing a directory"
        list_of_files = glob.glob('%s/*.%s' % (directory, input_filemask))
    else:
        print "processing a list of files"
        list_of_files = sys.argv[1:]
    for file_name in list_of_files:
        print file_name
        processFile(file_name)
        outputResults(file_name)


if __name__ == '__main__':
    if (len(sys.argv) > 1):
        processFiles(sys.argv)
    else:
        print 'usage message'
from pylab import *
import csv
import os
import glob
import re

x = []
y = []
f = open("one.txt", 'w')

for infile in glob.glob('*.csv'):
    # print infile
    csv23 = csv2rec(infile, delimiter=',')
    for line in csv23:
        x.append(line[1])
    # print len(x)
    for i in range(3000, 8000):
        y.append(x[i])
    print infile, "\t", mean(y)
    print >>f, infile, "\t\t", mean(y)
    del y[:len(y)]
    del x[:len(x)]
I know I saw this double with open() somewhere but couldn't remember where, so I built a small example in case someone needs it.
""" A module to clean code(js, py, json or whatever) files saved as .txt files to
be used in HTML code blocks. """
from os import listdir
from os.path import abspath, dirname, splitext
from re import sub, MULTILINE
def cleanForHTML():
""" This function will search a directory text files to be edited. """
## define some regex for our search and replace. We are looking for <, > and &
## To replaced with &ls;, > and &. We might want to replace proper whitespace
## chars to as well? (r'\t', ' ') and (f'\n', '<br>')
search_ = ((r'(<)', '<'), (r'(>)', '>'), (r'(&)', '&'))
## Read and loop our file location. Our location is the same one that our python file is in.
for loc in listdir(abspath(dirname(__file__))):
## Here we split our filename into it's parts ('fileName', '.txt')
name = splitext(loc)
if name[1] == '.txt':
## we found our .txt file so we can start file operations.
with open(loc, 'r') as file_1, open(f'{name[0]}(fixed){name[1]}', 'w') as file_2:
## read our first file
retFile = file_1.read()
## find and replace some text.
for find_ in search_:
retFile = sub(find_[0], find_[1], retFile, 0, MULTILINE)
## finally we can write to our newly created text file.
file_2.write(retFile)
This approach also works for reading multiple files. My files are named federalist_1.txt, federalist_2.txt, and so on; I have 84 files, up to federalist_84.txt.
I'm reading each file as f.
for number in range(1, 85):
    with open(f'federalist_{number}.txt', 'r') as f:
        text = f.read()
        # process the file's text here