Write Folder Contents to CSV with Regex - python

I am trying to use the Python script here for my own purposes. I'm no Python bloke, so hopefully someone can see what I have wrong.
The below script doesn't error out. My CSV is created with no values. Do I have a join problem? I'm expecting to have data written to the CSV.
# import the standard libraries you'll need
import os # https://docs.python.org/2/library/os.html
import re # https://docs.python.org/2/library/re.html

# this function will walk your directories and output a list of file paths
def getFilePaths(directory):
    file_paths = []
    for root, directories, files in os.walk(directory):
        for filename in files:
            filepath = os.path.join(root, filename)
            file_paths.append(filepath)
    return file_paths

audio_file_paths = getFilePaths("Z:\Dropbox\Apps\DirScan\files")
output_to_csv = [];
for audio_file in audio_file_paths:
    base_path, fname = os.path.split(audio_file)
    reg_ex = re.compile("^(.*) - (.*) - (.*).mp3$");
    # now apply the compiled regex to each path
    name_components = reg_ex.match(fname);
    output_to_csv.append("{0},{1}".format(",".join(name_components), base_path));

#create the file, making sure the location is writeable
csv_doc = open("database.csv", "w");
# now join all the rows with line breaks and write the compiled text to the file
csv_doc.write( '\n'.join(output_to_csv) );
#close your new database
csv_doc.close()

When I run your code I get this error:
Traceback (most recent call last):
  File "x.py", line 29, in <module>
    output_to_csv.append("{0},{1}".format(",".join(name_components), base_path));
TypeError
That's because name_components is a regex match object, which doesn't work as an argument to join. You need to replace:
",".join(name_components)
With:
",".join(name_components.groups())
After making that change, I can see the CSV file gets written correctly.
One other minor point: you don't need a semicolon at the end of a line in Python.
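For reference, here is a minimal sketch of the corrected loop. Besides using .groups(), it compiles the regex once outside the loop and skips any file name that doesn't match the pattern (match() returns None in that case, and calling .groups() on None would raise an AttributeError). The directory walk and audio_file_paths list are taken from the question.

import os
import re

reg_ex = re.compile(r"^(.*) - (.*) - (.*)\.mp3$")  # compile once, reuse for every file
output_to_csv = []
for audio_file in audio_file_paths:
    base_path, fname = os.path.split(audio_file)
    name_components = reg_ex.match(fname)
    if name_components is None:
        continue  # file name doesn't fit the "a - b - c.mp3" pattern, skip it
    output_to_csv.append("{0},{1}".format(",".join(name_components.groups()), base_path))

with open("database.csv", "w") as csv_doc:
    csv_doc.write("\n".join(output_to_csv))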

Related

Creating new CSV files from inputs in different directory

I'm simply trying to alter a CSV file with Python.
When all the files were in the same dir, everything was fine.
Now that the input files are in a different dir than where the output files will be, everything blows up, apparently because the files do not exist?
I first found this:
open() in Python does not create a file if it doesn't exist
Then I learned to change to the directory, which helped me loop over the CSVs in the target dir:
Moving up one directory in Python
When I run the command:
python KWRG.py ../Weekly\ Reports\ -\ Inbound/Search\ Activity/ 8/9/2021
I will get:
Traceback (most recent call last):
  File "KWRG.py", line 15, in <module>
    with open(args.input, 'r') as in_file, open(args.output, 'w') as out_file:
IsADirectoryError: [Errno 21] Is a directory: '../Weekly Reports - Inbound/Search Activity/'
Sorry If I'm missing the obvious here, but why is the file not being created in the directory that I'm pointing the script to (or at all for that matter)?
The code:
import csv
import argparse
import os
# Create a parser to take arguments
#...snip...
cur_dir = os.getcwd()
reports_dir = os.chdir(os.path.join(cur_dir, args.dir))
for csv_file in os.listdir(reports_dir):
# Shorthand the name of the file
#...snip...
# Open the in and out files
with open(csv_file, 'r') as in_file, open(f'{out_name}-Search-Activity-{args.date}.csv', 'w+') as out_file:
# Re-arrange CSV
# EOF
Your problem is with this line:
reports_dir = os.chdir(os.path.join(cur_dir, args.dir))
os.chdir() doesn't return anything (so reports_dir ends up as None); it just performs the requested operation of changing the current working directory. From an interactive session with the REPL:
>>> import os
>>> result = os.chdir("/Users/mattdmo/")
>>> result
>>>
For your purposes, all you need is
reports_dir = os.path.join(cur_dir, args.dir)
and you'll be all set.
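One more detail to watch once you stop changing directories: os.listdir() returns bare file names, so you need to join each name back onto reports_dir before opening it. A minimal sketch follows, with the argument parsing and the CSV rewriting elided as in the question; how out_name is derived and where the output file should land are assumptions here, so adjust them to your case.

import os

cur_dir = os.getcwd()
reports_dir = os.path.join(cur_dir, args.dir)  # args.dir comes from argparse, as in the question

for csv_name in os.listdir(reports_dir):
    in_path = os.path.join(reports_dir, csv_name)   # full path to the input CSV
    out_name = os.path.splitext(csv_name)[0]        # assumption: derive the output name from the input name
    out_path = os.path.join(cur_dir, f'{out_name}-Search-Activity-{args.date}.csv')
    with open(in_path, 'r') as in_file, open(out_path, 'w+') as out_file:
        pass  # re-arrange the CSV here, as in the original script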

Filter Directory using Regex and output filtered files to another directory

I am simply trying to create a Python 3 program that runs through all .sql files in a specific directory, applies my regex that adds ; after a certain pattern, and writes the changed files to a separate directory under the same file names.
So, if I had file1.sql and file2.sql in the "/home/files" directory, then after I run the program those two files should be written to "/home/new_files" without changing the content of the original files.
Here is my code:
import glob
import re

folder_path = "/home/files/d_d"
file_pattern = "/*sql"
folder_contents = glob.glob(folder_path + file_pattern)

for file in folder_contents:
    print("Checking", file)

for file in folder_contents:
    read_file = open(file, 'rt', encoding='latin-1').read()
    #words=read_file.split()
    with open(read_file, "w") as output:
        output.write(re.sub(r'(TBLPROPERTIES \(.*?\))', r'\1;', f, flags=re.DOTALL))
I receive an error of File name too long: "CREATE EXTERNAL TABLe", and I am also not sure where I would put my output path (/home/files/new_dd) in my code.
Any ideas or suggestions?
With read_file = open(file, 'rt', encoding='latin-1').read(), the whole content of the file was being used as the file name in the second open() call. The code provided here iterates over the file names found with the glob.glob pattern, opens each one for reading, processes the data, and opens the output file for writing (assuming that a folder newfile_sqls already exists;
if not, it would raise FileNotFoundError: [Errno 2] No such file or directory).
import glob
import os
import re

folder_path = "original_sqls"
#original_sqls\file1.sql, original_sqls\file2.sql, original_sqls\file3.sql
file_pattern = "*sql"
# new/modified files folder
output_path = "newfile_sqls"

folder_contents = glob.glob(os.path.join(folder_path, file_pattern))

# iterate over file names
for file_ in [os.path.basename(f) for f in folder_contents]:
    # open to read
    with open(os.path.join(folder_path, file_), "r") as inputf:
        read_file = inputf.read()
    # use variable 'read_file' here
    tmp = re.sub(r'(TBLPROPERTIES \(.*?\))', r'\1;', read_file, flags=re.DOTALL)
    # open to write to (previously created) new folder
    with open(os.path.join(output_path, file_), "w") as output:
        output.writelines(tmp)
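If you'd rather not create the output folder by hand, a small addition before the loop takes care of it; os.makedirs with exist_ok=True (Python 3.2+) is a no-op when the folder already exists:

import os

output_path = "newfile_sqls"
os.makedirs(output_path, exist_ok=True)  # create the output folder if it isn't there yet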

Rename files in multiple directories

I have files named the same in multiple directories. I wanted to change their names, so they would correspond to the unique id of the directory that they are in.
'*' represents unique identifier, like '067' for example
The filename is always 'NoAdapter_len25.truncated_sorted.fastq'
I wanted the filename in each directory to be '*NoAdapter_len25.truncated_sorted.fastq', where * stands for the unique identifier
Here is the error I'm getting:
Traceback (most recent call last):
  File "change_names.py", line 19, in <module>
    rename(name, new_name)
TypeError: Can't convert '_io.TextIOWrapper' object to str implicitly
Here's the code that produces it:
from glob import glob
import re
from os import rename

#path = "/home/users/screening/results_Sample_*_hg38_hg19/N*"
files = glob(path)
for f in files:
    with open(f) as name:
        sample_id = f.partition('results_')[-1].rpartition('hg38_hg19')[0]
        #print(sample_id)
        back = f[-38:]
        new_name = sample_id + back
        rename(name, new_name)
You have a few problems:
1. You're opening a file for no apparent reason (it confirms the file exists and is readable at open time, but even with an open handle, the name could be moved or deleted between that and the rename, so you aren't preventing any race conditions).
2. You're passing the opened file object to os.rename, but os.rename takes str, not file-like objects.
3. You're doing a lot of "magic" manipulations of the path, instead of using the appropriate os.path functions.
Try this to simplify the code. I included some inline comments where I'm doing what your example does even when it doesn't make a lot of sense (or is poor form):
for path in files:  # path, not f; f is usually a placeholder for a file-like object
    filedir, filename = os.path.split(path)
    parentdir = os.path.basename(filedir)  # name of the directory the file lives in
    # Strip parentdir name to get 'Sample_*_' per provided code; is this what you wanted?
    # Question text seems like you only wanted the '*' part.
    sample_id = parentdir.replace('results_', '').replace('hg38_hg19', '')
    # Large magic numbers are a code smell; if the name is a fixed name,
    # just use it directly as a string literal.
    # If the name should be "whatever the file is named", use filename unsliced.
    # If you absolutely need a fixed length (to allow reruns or something)
    # you might do keepnamelen = len('NoAdapter_len25.truncated_sorted.fastq')
    # outside the loop, and do filename[-keepnamelen:] inside the loop so it's not
    # just a largish magic number.
    back = filename[-38:]
    new_name = sample_id + back
    new_path = os.path.join(filedir, new_name)
    rename(path, new_path)
You feed rename a file object (name) and a filename, but it needs two filenames. To get from a file object to its filename, you can do this:
old_filename = os.path.abspath(name.name)
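Building on that, here is a minimal sketch of the question's loop with only this fix applied. It assumes the files list and from os import rename from the question, plus import os for abspath; as in the original code, new_name has no directory component, so the renamed file lands in the current working directory.

for f in files:
    with open(f) as name:
        old_filename = os.path.abspath(name.name)  # recover the path string from the file object
    sample_id = f.partition('results_')[-1].rpartition('hg38_hg19')[0]
    back = f[-38:]
    new_name = sample_id + back
    rename(old_filename, new_name)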

Concatenate last 100 files in an only one

Beginner in Python needs a bit of help. I am using Python 2.7.
I want to make a program that concatenates the last 100 files I have in a folder. In that folder I have lots of files, but I only want the concatenation of the last 100. I am able to concatenate all of them (if I don't specify a number and change the for loop), but I am not able to select the last 100 files. These files are saved in binary by the software. They are saved in the folder specified below. I would also like to remove those 100 files once they are concatenated into the new one. The program I have written is the following:
#!/usr/bin/python
import os
import glob

os.chdir("C:\AFM_test\jpk_files")
rout = ""
filename = glob.glob("*-*-*.*.*-*.*.*.jpk-force")
filename.sort(key=os.path.getmtime)
for filename in range(0,99):
    filename = open(filename,"rb")
    tout = filename.read()+"\r\n"
    rout = rout+tout
    os.remove(filename)
    filename.close()
fout = open("output.jpk-force","wb+")
fout.write(rout)
fout.close()
It doesn't do anything and the error is the following:
Traceback (most recent call last):
  File "C:\AFM_test\jpk_files\AFM_test.py", line 12, in <module>
    filename = open(filename,"rb")
TypeError: coercing to Unicode: need string or buffer, int found
[Finished in 0.1s]
I guess the problem is the loop and its structure, "range(0,99)", since when I concatenated all the files contained in the folder:
#!/usr/bin/python
import os
import glob

os.chdir("C:\AFM_test\jpk_files")
rout = ""
filename = glob.glob("*-*-*.*.*-*.*.*.jpk-force")
for filename in files:
    filename = open(filename,"rb")
    tout = filename.read()+"\r\n"
    rout = rout+tout
    os.remove(filename)
    filename.close()
fout = open("output.jpk-force","wb+")
fout.write(rout)
fout.close()
it worked okay except for the remove call, which showed this error:
Traceback (most recent call last):
  File "C:\try\AFM_test_2.py", line 17, in <module>
    os.remove(filename)
must be string, not file
Any ideas how I can achieve my goal?
I hope I have explained myself properly. Maybe I have missed something important, sorry, I am just a beginner in this field.
Thank you.
TypeError: coercing to Unicode: need string or buffer, int found
That is because filename is an integer here: for filename in range(0,99) loops over the numbers 0 through 98 rather than over your file names, so open() is handed an int where it expects a path string.
os.remove(filename)
must be string, not file
That is because you are re-assigning the variable filename (which was a string path) to a file handle/object. os.remove(..) expects the variable from the for-loop, not the result of open(..). It's generally good practice to give meaningful names to variables – filepath, infile, etc.
A better approach would be:
def processFile(filepath):
    with open(filepath) as f:
        content = f.read()
    os.remove(filepath)
    return content

def main():
    paths = glob.glob("..*..*..")
    last100paths = paths[-100:]
    with open(outFilePath, "w") as f:
        f.write("\r\n".join(processFile(path) for path in last100paths))
You need to change:
filename=open(filename,"rb")
...to something like:
inf = open(filename, "rb")
...
inf.close()
Then, when you're calling os.remove(filename), it will still be the filename from the original loop, not a file object that your code is reassigning to this variable.
Note: rather than doing this explicit opening and closing of files, though, try using the with statement (see this helpful guide).
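For example, here is a minimal sketch of that loop rewritten with the with statement, assuming filenames already holds the matching paths in the order you want (e.g. sorted by mtime as in the answer below):

rout = ""
for filename in filenames[:100]:
    with open(filename, "rb") as inf:   # the file is closed automatically at the end of the block
        rout = rout + inf.read() + "\r\n"
    os.remove(filename)                 # filename is still the path string here, so this works

with open("output.jpk-force", "wb+") as fout:
    fout.write(rout)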
Check if glob is actually matching the pattern:
pattern = r"*-*-*.*.*-*.*.*.jpk-force"
filenames = glob.glob(pattern)
if not filenames:
    print 'no files matched ', pattern
    sys.exit(1)
Get an mtime-sorted file list by building a list of tuples, each containing a file name and its mtime:
filenames = [ (filename, os.stat(filename)[8]) for filename in filenames ]
Sort the list by mtime in descending order:
filenames.sort(key=lambda x: x[1], reverse=True)
The above two lines can be simplified to:
filenames = [ filename for filename in sorted(filenames, key=os.path.getmtime, reverse=True) ]
And that line can be refactored further, because we can sort in place:
filenames.sort(key=os.path.getmtime, reverse=True)
#!/usr/bin/python
import os
import sys
import glob

os.chdir("C:\AFM_test\jpk_files")
rout = ""
pattern = r"*-*-*.*.*-*.*.*.jpk-force"
filenames = glob.glob(pattern)
if not filenames:
    print 'no files matched ', pattern
    sys.exit(1)
filenames.sort(key=os.path.getmtime, reverse=True)
for filename in filenames[:100]:
    filecontent = open(filename, "rb")
    tout = filecontent.read() + "\r\n"
    filecontent.close()
    rout = rout + tout
    os.remove(filename)
fout = open("output.jpk-force", "wb+")
fout.write(rout)
fout.close()
You didn't check for exceptions.
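For instance, here is a hedged sketch of the per-file work wrapped in a try/except so that one unreadable or already-deleted file doesn't abort the whole run (IOError and OSError are what Python 2 raises for failed open and remove calls):

for filename in filenames[:100]:
    try:
        filecontent = open(filename, "rb")
        rout = rout + filecontent.read() + "\r\n"
        filecontent.close()
        os.remove(filename)
    except (IOError, OSError) as err:
        print 'skipping %s: %s' % (filename, err)   # keep going with the remaining files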

How to extract a file within a folder within a zip?

I need to extract a file called Preview.pdf from a folder called QuickLooks inside of a zip file.
Right now my code looks a little like this:
with ZipFile(newName, 'r') as newName:
    newName.extract(\QuickLooks\Preview.pdf)
newName.close()
(In this case, newName has been set equal to the full path to the zip).
It's important to note that the backslash is correct in this case because I'm on Windows.
The code doesn't work; here's the error it gives:
Traceback (most recent call last):
  File "C:\Users\Asit\Documents\Evam\Python_Scripts\pageszip.py", line 18, in <module>
    ZF.extract("""QuickLooks\Preview.pdf""")
  File "C:\Python33\lib\zipfile.py", line 1019, in extract
    member = self.getinfo(member)
  File "C:\Python33\lib\zipfile.py", line 905, in getinfo
    'There is no item named %r in the archive' % name)
KeyError: "There is no item named 'QuickLook/Preview.pdf' in the archive"
I'm running the Python script from inside Notepad++, and taking the output from its console.
How can I accomplish this?
Alternatively, how could I extract the whole QuickLooks folder, move out Preview.pdf, and then delete the folder and the rest of its contents?
Just for context, here's the rest of the script. It's a script to get a PDF out of a .pages file. I know there are bona fide converters out there; I'm just doing this as an exercise with some sort of real-world application.
import os.path
import zipfile
from zipfile import *
import sys

file = raw_input('Enter the full path to the .pages file in question. Please note that file and directory names cannot contain any spaces.')
dir = os.path.abspath(os.path.join(file, os.pardir))
fileName, fileExtension = os.path.splitext(file)
if fileExtension == ".pages":
    os.chdir(dir)
    print (dir)
    fileExtension = ".zip"
    os.rename (file, fileName + ".zip")
    newName = fileName + ".zip" #for debugging purposes
    print (newName) #for debugging purposes
    with ZipFile(newName, 'w') as ZF:
        print("I'm about to list names!")
        print(ZF.namelist()) #for debugging purposes
        ZF.extract("QuickLook/Preview.pdf")
        os.rename('Preview.pdf', fileName + '.pdf')
    finalPDF = fileName + ".pdf"
    print ("Check out the PDF! It's located at" + dir + finalPDF + ".")
else:
    print ("Sorry, this is not a valid .pages file.")
    sys.exit
I'm not sure if the import of zipfile is redundant; I read on another SO post that it was better to use from zipfile import * than import zipfile. I wasn't sure, so I used both. =)
EDIT: I've changed the code to reflect the changes suggested by Blckknght.
Here's something that seems to work. There were several issues with your code. As I mentioned in a comment, the zip file must be opened with mode 'r' in order to read it. Another is that zip archive member names always use forward slash / characters as path separators (see section 4.4.17.1 of the PKZIP Application Note). It's also important to be aware that there's no way to extract a nested archive member to a different subdirectory with Python's current zipfile module. You can control the root directory, but nothing below it (i.e. any subfolders within the zip).
Lastly, since it's not necessary to rename the .pages file to .zip (the filename you pass to ZipFile() can have any extension), I removed all that from the code. However, to overcome the limitation on extracting members to a different subdirectory, I had to add code that first extracts the target member to a temporary directory and then copies it to the final destination. Afterwards, of course, this temporary folder needs to be deleted. So I'm not sure the net result is much simpler...
import os.path
import shutil
import sys
import tempfile
from zipfile import ZipFile

PREVIEW_PATH = 'QuickLooks/Preview.pdf'  # archive member path
pages_file = input('Enter the path to the .pages file in question: ')
#pages_file = r'C:\Stack Overflow\extract_test.pages'  # hardcode for testing
pages_file = os.path.abspath(pages_file)
filename, file_extension = os.path.splitext(pages_file)

if file_extension == ".pages":
    tempdir = tempfile.gettempdir()
    temp_filename = os.path.join(tempdir, PREVIEW_PATH)
    with ZipFile(pages_file, 'r') as zipfile:
        zipfile.extract(PREVIEW_PATH, tempdir)
    if not os.path.isfile(temp_filename):  # extract failure?
        sys.exit('unable to extract {} from {}'.format(PREVIEW_PATH, pages_file))
    final_PDF = filename + '.pdf'
    shutil.copy2(temp_filename, final_PDF)  # copy and rename extracted file
    # delete the temporary subdirectory created (along with the pdf file in it)
    shutil.rmtree(os.path.join(tempdir, os.path.split(PREVIEW_PATH)[0]))
    print('Check out the PDF! It\'s located at "{}".'.format(final_PDF))
    #view_file(final_PDF)  # see Bonus below
else:
    sys.exit('Sorry, that isn\'t a .pages file.')
Bonus: If you'd like to actually view the final pdf file from the script, you can add the following function and use it on the final pdf created (assuming you have a PDF viewer application installed on your system):
import subprocess

def view_file(filepath):
    subprocess.Popen(filepath, shell=True).wait()
