Python: how to iterate through a folder to find all ".docx" files

I need help writing a for loop to find all ".docx" files, then edit them and save them under a new name.
I have some code that works for a single file.
I need code that iterates through a folder and picks up only the ".docx" files.
After editing, save them into another folder, retaining the current file name.
Code below:
import docx
import os

directory = '.'      # set dir of all *.docx files
extension = '.docx'  # editable files must be ".docx"
text = ''            # not sure what this is doing here.

# The for loop to check that all the .docx are found in the folder, then edit them (regex).
# BUT I do not know the proper syntax for "file" ending in "docx":
for "file" in os.listdir(directory):
    if "file".endswith(extension):

def replacer(p, replace_dict):
    inline = p.runs  # Specify the list being used
    for j in range(0, len(inline)):
        # Iterate over the dictionary
        for k, v in replace_dict.items():
            if k in inline[j].text:
                inline[j].text = inline[j].text.replace(k, v)
    return p

# Replace paragraphs
doc = docx.Document("5538AP_7975_D1.docx")  # Gets one file, BUT I need all files with different part names.
dict = {'Don Steel': 'TAN', '5538AP': '5499AP',
        'Special Mask Build Notes': 'Special Mask Build Notes: Special New receipts for TAN 4X to 5X'}  # Build the dict
for p in doc.paragraphs:   # If needed, iter over paragraphs
    p = replacer(p, dict)  # Call the new replacer function
doc.save("/newOrder/SUBSTITUTE_5538AP1.docx")  # need to save all files with their original name in a new folder "newOrder".

A simple but effective answer: you could use the glob module, as it has direct pattern searching built in. See the following example:
import glob

for i in glob.glob(r'E:\drivers\**\*.pdf', recursive=True):
    print(i)
This would look at all the files in E:\drivers\ and then look through every subdirectory for PDF files. You can see more at the documentation page: https://docs.python.org/3/library/glob.html
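Applied to the question's case, a minimal sketch (using the directory '.' and output folder 'newOrder' from the question; the print is just a stand-in for the editing step):

import glob
import os

# non-recursive: only .docx files directly inside the source folder
for path in glob.glob(os.path.join('.', '*.docx')):
    name = os.path.basename(path)             # retain the current file name
    target = os.path.join('newOrder', name)   # destination in the new folder
    print(path, '->', target)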

# NEW FUNCTION:
import docx
import glob
import os

def replacer(p, replace_dict):
    inline = p.runs  # Specify the list being used
    for j in range(0, len(inline)):
        # Iterate over the dictionary
        for k, v in replace_dict.items():
            if k in inline[j].text:
                inline[j].text = inline[j].text.replace(k, v)
    return p

replace_dict = {'Don Coffman': 'TOPPAN', 'MG5459APSIM': 'MG5499AP',
                'Special Mask Build Notes': 'Special Mask Build Notes: Special Mask Build Notes: TOPPAN 4X to 5X'}  # Build the dict

for path in glob.glob(r'C:\Users\LOCAL\Documents\TOPPAN_MOR_BASE_PART\*.docx'):
    doc = docx.Document(path)  # Get the file
    for p in doc.paragraphs:   # Iterate over paragraphs
        replacer(p, replace_dict)  # Call the new replacer function
    # Save under the original name, prefixed, in the NewOrder folder
    out_name = 'SUBSTITUTE_' + os.path.basename(path)
    doc.save(os.path.join(r'C:\Users\LOCAL\Documents\TOPPAN_MOR_BASE_PART\NewOrder', out_name))

Related

Building a dictionary of my directories and file paths to select all files whose name contains a specific string

I have a root directory called 'IC'. 'IC' contains a bunch of subdirectories, which contain subsubdirectories, which contain subsubsubdirectories, and so on. Is there an easy way to move all the sub...directory files into their parent subdirectory and then delete the empty sub...directories?
So far I've made this monstrosity of nested loops to build a dictionary of file paths, with subdirectories as dictionaries containing file paths, etc. I was going to then make something to go through the dictionary and pick all files containing 'IC' and the subdirectory they are in. I need to know which directories contain an 'IC' file or not. I also need to move all the files containing 'IC' to the top-level subdirectories (see hashtag in code).
import os, shutil

rootdir = 'data/ICs'

def dir_tree(rootdir):
    IC_details = {}
    # This first loop is what I'm calling the top-level subdirectories. They are the three
    # subdirectories inside the directory 'data/ICs'
    for i in os.scandir(rootdir):
        if os.path.isdir(i):
            IC_details[i.path] = {}
    for i in IC_details:
        for j in os.scandir(i):
            if os.path.isdir(j.path):
                IC_details[i][j.name] = {}
            elif os.path.isfile(j.path):
                IC_details[i][j.name] = [j.path]
        for j in IC_details[i]:
            if os.path.isdir(os.path.join(i, j)):
                for k in os.scandir(os.path.join(i, j)):
                    if os.path.isdir(k.path):
                        IC_details[i][j][k.name] = {}
                    elif os.path.isfile(k.path):
                        IC_details[i][j][k.name] = [k.path]
                for k in IC_details[i][j]:
                    if os.path.isdir(os.path.join(i, j, k)):
                        for l in os.scandir(os.path.join(i, j, k)):
                            if os.path.isdir(l.path):
                                IC_details[i][j][k][l.name] = {}
                            elif os.path.isfile(l.path):
                                IC_details[i][j][k][l.name] = [l.path]
                        for l in IC_details[i][j][k]:
                            if os.path.isdir(os.path.join(i, j, k, l)):
                                for m in os.scandir(os.path.join(i, j, k, l)):
                                    if os.path.isfile(m.path):
                                        IC_details[i][j][k][l][m.name] = [m.path]
    return IC_details

IC_tree = dir_tree(rootdir)
You should have a look at the glob module:
glob — Unix style pathname pattern expansion
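Since that answer only points at the module, here is a minimal sketch of the move-and-prune step it implies, assuming file names don't collide between directories (glob's recursive '**' pattern does the traversal):

import glob
import os
import shutil

rootdir = 'data/ICs'

for entry in os.scandir(rootdir):
    if not entry.is_dir():
        continue
    top = entry.path
    # pull every file found anywhere beneath this top-level
    # subdirectory up into it
    for path in glob.glob(os.path.join(top, '**', '*'), recursive=True):
        if os.path.isfile(path) and os.path.dirname(path) != top:
            shutil.move(path, os.path.join(top, os.path.basename(path)))
    # then delete the now-empty directories, deepest first
    for dirpath, dirnames, filenames in os.walk(top, topdown=False):
        if dirpath != top and not os.listdir(dirpath):
            os.rmdir(dirpath)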

Traversing through each directory to identify file types and the count of each type?

I am trying to build a script that traverses all the files in a directory and identifies each one's file type. At the end, it should print the total count of each file type identified. I am using the magic library to identify the file type based on MIME.
for filename in os.listdir(os.getcwd()):
    print filename
    with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
        t = m.id_filename(filename)
        print t
The identification piece is pasted above and seems to be working fine, but I am not sure how to store the identified file types and their counts. The output should look like:
filetype1 count
filetype2 count
...
...
Please guide me on the ideal way of doing this.
You can create a dictionary mapping each file type to its count, e.g.
file_types = {'filetype1' : 10, 'filetype2': 20, ...}
Note that your current solution will only work on the current directory and not subdirectories.
file_types = {}
for filename in os.listdir(os.getcwd()):
    with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
        t = m.id_filename(filename)
        file_types.setdefault(t, 0)
        file_types[t] += 1
...
That should count them for you.
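If subdirectories should be counted as well, os.walk will visit them; a minimal sketch reusing the same filemagic calls from the question (the printed format is just illustrative):

import os
import magic  # the filemagic package, as used in the question

file_types = {}
with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
    # walk the whole tree rooted at the current directory
    for dirpath, dirnames, filenames in os.walk(os.getcwd()):
        for filename in filenames:
            t = m.id_filename(os.path.join(dirpath, filename))
            file_types[t] = file_types.get(t, 0) + 1

for filetype, count in file_types.items():
    print('%s %d' % (filetype, count))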
You could use the Counter class from the collections module. It is basically a variant of a dictionary, with a few additional methods and the advantage that you don't need to initialize it with 0 when counting.
I don't have the magic module that you mention, so here's an example using my_magic as a substitute:
import collections
import os

def my_magic(filename):
    """
    This function is just a placeholder to be used in place of your
    id_filename() method.
    """
    if filename.endswith(".txt"):
        return "TXT"
    elif filename.endswith(".pdf"):
        return "PDF"
    else:
        return "other"

# initialize the counter object:
counter = collections.Counter()
for filename in os.listdir(os.getcwd()):
    print filename
    # substitute the next line with whatever you use to determine the
    # type of the file:
    t = my_magic(filename)
    print t
    # increase the count for the current value of 't':
    counter[t] += 1

# output what is in counter:
for ext, n in counter.items():
    print ext, n

I have a list of partial filenames; for each one I want to go through the files in a directory, match that part, and return the filename

So, let's say I have a directory with a bunch of filenames, for example:
Scalar Product or Dot Product (Hindi)-fodZTqRhC24.m4a
AP Physics C - Dot Product-Wvhn_lVPiw0.m4a
An Introduction to the Dot Product-X5DifJW0zek.m4a
Now let's say I have a list of only the keys, which are at the end of the file names:
['fodZTqRhC24', 'Wvhn_lVPiw0', 'X5DifJW0zek']
How can I iterate through my list, search that directory for a file name containing each key, and then return the filename?
Any help is greatly appreciated!
I thought about it; I was making it harder than I had to with regex. Sorry about not trying it first. I have done it this way:
audio = ['Scalar Product or Dot Product (Hindi)-fodZTqRhC24.m4a',
         'An Introduction to the Dot Product-X5DifJW0zek.m4a',
         'AP Physics C - Dot Product-Wvhn_lVPiw0.m4a']
keys = ['fodZTqRhC24', 'Wvhn_lVPiw0', 'X5DifJW0zek']

file_names = []
for Id in keys:
    for name in audio:
        if Id in name:
            file_names.append(name)

combined = zip(keys, file_names)
combined
Here is an example, where ls is the list of files in a given directory and names is the list of strings to search for:
import os

ls = os.listdir("/any/folder")
names = ['Py', 'sql']
for file in ls:
    for name in names:
        if name in file:
            print(file)
Results:
.PyCharm50
.mysql_history
zabbix2.sql
.mysql
PycharmProjects
zabbix.sql
Assuming you know which directory that you will be looking in, you could try something like this:
import os

to_find = ['word 1', 'word 2']  # list containing words that you are searching for
all_files = os.listdir('/path/to/file')  # creates list with files from given directory

for file in all_files:  # loops through all files in directory
    for word in to_find:  # loops through target words
        if word in file:
            print file  # prints file name if the target word is found
I tested this in my directory which contained these files:
Helper_File.py
forms.py
runserver.py
static
app.py
templates
... and I set to_find to ['runserver', 'static']...
and when I ran this code it returned:
runserver.py
static
For future reference, you should make at least some sort of attempt at solving a problem prior to posting a question on Stack Overflow. It's not common for people to assist you like this if you can't provide proof of an attempt.
Here's a way to do it that lets you select whether to match based on the placement of the text.
import os

def scan(dir, match_key, bias=2):
    '''
    :0 startswith
    :1 contains
    :2 endswith
    '''
    matches = []
    if not isinstance(match_key, (tuple, list)):
        match_key = [match_key]
    if os.path.exists(dir):
        for file in os.listdir(dir):
            for match in match_key:
                if (file.startswith(match) and bias == 0 or
                        file.endswith(match) and bias == 2 or
                        match in file and bias == 1):
                    matches.append(file)
                    break  # don't add the same file twice
    return matches

print scan(os.curdir, '.py')

Can't get unique word/phrase counter to work - Python

I'm having trouble getting anything to write to my output file (word_count.txt).
I expect the script to review all 500 phrases in my phrases.txt document, and output a list of all the words and how many times they appear.
from re import findall,sub
from os import listdir
from collections import Counter

# path to folder containing all the files
str_dir_folder = '../data'

# name and location of output file
str_output_file = '../data/word_count.txt'

# the list where all the words will be placed
list_file_data = '../data/phrases.txt'

# loop through all the files in the directory
for str_each_file in listdir(str_dir_folder):
    if str_each_file.endswith('data'):

        # open file and read
        with open(str_dir_folder+str_each_file,'r') as file_r_data:
            str_file_data = file_r_data.read()

        # add data to list
        list_file_data.append(str_file_data)

# clean all the data so that we don't have all the nasty bits in it
str_full_data = ' '.join(list_file_data)
str_clean1 = sub('t','',str_full_data)
str_clean_data = sub('n',' ',str_clean1)

# find all the words and put them into a list
list_all_words = findall('w+',str_clean_data)

# dictionary with all the times a word has been used
dict_word_count = Counter(list_all_words)

# put data in a list, ready for output file
list_output_data = []
for str_each_item in dict_word_count:
    str_word = str_each_item
    int_freq = dict_word_count[str_each_item]
    str_out_line = '"%s",%d' % (str_word,int_freq)
    # populates output list
    list_output_data.append(str_out_line)

# create output file, write data, close it
file_w_output = open(str_output_file,'w')
file_w_output.write('n'.join(list_output_data))
file_w_output.close()
Any help would be great (especially if I'm able to actually output 'single' words within the output list). Thanks very much.
It would be helpful if we got more information, such as what you've tried and what sorts of error messages you received. As kaveh commented above, this code has some major indentation issues. Once I got around those, there were a number of other logic errors to work through. I've made some assumptions:
list_file_data is assigned '../data/phrases.txt', but there is then a loop through all files in a directory. Since you don't have any handling for multiple files elsewhere, I've removed that logic and referenced the file listed in list_file_data (and added a small bit of error handling). If you do want to walk through a directory, I'd suggest using os.walk() (http://www.tutorialspoint.com/python/os_walk.htm).
You named your file 'phrases.txt' but then check for files that end with 'data'. I've removed this logic.
You've placed the data set into a list, but findall works just fine with strings and ignores the special characters that you've manually removed. Test at https://regex101.com/ to make sure.
Changed 'w+' to '\w+' - check out the above link.
Converting to a list outside of the output loop isn't necessary - your dict_word_count is a Counter object, which has an 'iteritems' method to roll through each key and value. I also changed the variable name to 'counter_word_count' to be slightly more accurate.
Instead of manually generating CSVs, I've imported csv and used the writerow method (and quoting options).
Code below, hope this helps:
import csv
import os
from collections import Counter
from re import findall,sub

# name and location of output file
str_output_file = '../data/word_count.txt'

# the list where all the words will be placed
list_file_data = '../data/phrases.txt'

if not os.path.exists(list_file_data):
    raise OSError('File {} does not exist.'.format(list_file_data))

with open(list_file_data, 'r') as file_r_data:
    str_file_data = file_r_data.read()

# find all the words and put them into a list
list_all_words = findall('\w+',str_file_data)

# dictionary with all the times a word has been used
counter_word_count = Counter(list_all_words)

with open(str_output_file, 'w') as output_file:
    fieldnames = ['word', 'freq']
    writer = csv.writer(output_file, quoting=csv.QUOTE_ALL)
    writer.writerow(fieldnames)
    for key, value in counter_word_count.iteritems():
        output_row = [key, value]
        writer.writerow(output_row)
Something like this?
from collections import Counter
from glob import glob

def extract_words_from_line(s):
    # make this as complicated as you want for extracting words from a line
    return s.strip().split()

tally = sum(
    (Counter(extract_words_from_line(line))
     for infile in glob('../data/*.data')
     for line in open(infile)),
    Counter())

for k in sorted(tally, key=tally.get, reverse=True):
    print k, tally[k]

Extracting "unsigned files" from a directory

I have a directory with xml files associated with encrypted P7M files, meaning that for every name.xml there is a name.P7M. But there are some exceptions (the P7M file is absent), and my goal is to detect them using Python.
I'm thinking of this code. Can you help with something more elegant?
import glob
import re

# functions to eliminate extension name
def is_xml(x):
    a = re.search(r"(\s)(.xml)", x)
    if a:
        return a.group(0)
    else:
        return False

def is_P7M(x):
    a = re.search(r"(\s)(.P7M)", x)
    if a:
        return a.group(0)
    else:
        return False

# putting xml files and P7M files in two sets
setA = set(glob.glob('directory/*.xml'))
setB = set(glob.glob('directory/*.P7M'))

# eliminating extension names
for elt in setA:
    elt = is_xml(elt)
for elt in setB:
    elt = is_P7M(elt)

# difference between two sets. setB is always a larger set
print "unsigned files are:", setB.difference(setA)
A simpler way is to glob for the .xml files, then check using os.path.exists for a .P7M file:
import os, glob

for xmlfile in glob.glob('*.xml'):
    if not os.path.exists(xmlfile.rsplit(".", 1)[0] + ".P7M"):
        print xmlfile, "is unsigned"
This code:
Uses glob.glob to get all the xml files.
Uses str.rsplit to split the filename up into name and extension (e.g. "name.xml" to ["name", "xml"]). The second argument stops str.rsplit splitting more than once.
Takes the name of the file and adds the .P7M extension.
Uses os.path.exists to see if the .P7M file is there. If it isn't, the xmlfile is unsigned, so print it out.
If you need them in a list, you can do:
unsigned = [xmlfile for xmlfile in glob.glob('*.xml') if not os.path.exists(xmlfile.rsplit(".", 1)[0] + ".P7M")]
Or a set:
unsigned = {xmlfile for xmlfile in glob.glob('*.xml') if not os.path.exists(xmlfile.rsplit(".", 1)[0] + ".P7M")}
My solution would be:
import glob
import os

get_name = lambda fname: os.path.splitext(fname)[0]

xml_names = {get_name(fname) for fname in glob.glob('directory/*.xml')}
p7m_names = {get_name(fname) for fname in glob.glob('directory/*.p7m')}

unsigned = [xml_name + ".xml" for xml_name in
            xml_names.difference(p7m_names)]
print unsigned
Put all the xml names in a dict, removing the extension and using the name as the key, with the value initially set to False. If we find a matching P7M, set the value to True; finally, print all keys with a False value.
import glob

xmls = glob.glob('directory/*.xml')
p7ms = glob.glob('directory/*.P7M')

# use xml file names as keys by removing the extension
d = {k.rsplit(".", 1)[0]: False for k in xmls}

# go over every .P7M again removing the extension,
# setting the value to True for every match
for k in p7ms:
    d[k.rsplit(".", 1)[0]] = True

# any value that is False means there is no .P7M match for the xml file
for k, v in d.items():
    if not v:
        print(k)
Or create a set of each and find the difference:
xmls = {x.rsplit(".", 1)[0] for x in glob.glob('directory/*.xml')}
pm7s = {x.rsplit(".", 1)[0] for x in glob.glob('directory/*.P7M')}
print(xmls - pm7s)
Iterate over glob once and populate a dict of filenames by extension. Finally, compute the difference between 'xml' and 'P7M' sets.
import os, glob, collections

fnames = collections.defaultdict(set)
for fname in glob.glob('*'):
    f, e = os.path.splitext(fname)
    fnames[e].add(f)

print fnames['.xml'] - fnames['.P7M']
Note that unlike the other suggestions, this makes a single request to the filesystem, which might be important if the FS is slow (e.g. a network mount).
