I'm trying to move a file based on a keyword in a pdf using Python. What I have so far is this. PyCharm tells me there are no errors, but I'm obviously missing something since it isn't working. It's probably something simple but I'm extremely new to coding in general. Most of this I've pieced together based on stuff I've seen on here.
import os
# import PyPDF2
import PyPDF2
#import Shutil
import shutil
#import PDF Search
#Set list of clients to search for
client_names = ["Name1", "Name2", "Name3", "Name4"]
#Set Scanned Folder
scanned_path = r'path'
#Create a list of files in folder
scanned_file_list = [os.path.join(scanned_path, f) for f in os.listdir(scanned_path)]
print(scanned_file_list)
#Loop for file searching
for document in scanned_file_list:
    lst = []
    if document[-3] == 'pdf':
        file = open(document, "rb")
        reader = PyPDF2.PdfFileReader(file)
        pageObj = reader.getPage(0)
        text = (pageObj.extractText())
        text = text.split(",")
        source_path = document
        for sentence in text:
            lst = []
            for name in client_names:
                if name in sentence:
                    if name not in lst:
                        lst.append(name)
            if len(lst) > 1:
                file.close()
            elif len(lst) < 1:
                file.close()
            elif lst[0] == client_names[0]:
                #move file to CA folder
                destination_path = r'C:\ClientFolder1'
                shutil.move(source_path, destination_path)
            elif lst[0] == client_names[1]:
                destination_path = r'C:\ClientFolder2'
                shutil.move(source_path, destination_path)
            elif lst[0] == client_names[2]:
                #move file to KB folder
                destination_path = r'C:\ClientFolder3'
                shutil.move(source_path, destination_path)
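One likely culprit (a guess, since the post doesn't say how it fails) is the extension check: document[-3] is a single character, so the pdf branch never runs and nothing gets moved. Below is a minimal sketch of the same idea under that assumption, reusing the post's placeholder names and PyPDF2 calls and skipping the per-sentence split for brevity:

import os
import shutil
import PyPDF2

# mapping taken from the elif chain above
client_folders = {
    "Name1": r'C:\ClientFolder1',
    "Name2": r'C:\ClientFolder2',
    "Name3": r'C:\ClientFolder3',
}

for document in scanned_file_list:
    if not document.lower().endswith('.pdf'):      # document[-3] is only one character
        continue
    with open(document, "rb") as fh:
        reader = PyPDF2.PdfFileReader(fh)
        text = reader.getPage(0).extractText()
    found = [name for name in client_folders if name in text]
    if len(found) == 1:                             # move only on an unambiguous match
        shutil.move(document, client_folders[found[0]])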
I am trying to make a Python 3 script that will search a user-defined folder for certain search terms and (1) copy files with matching words to a new folder on the desktop and (2) create a text file that contains the matched search terms.
The script is working, but for whatever reason, it seems to be maxing out with 1-2 search terms added to the "search_text_list". For example, in the example below, I only get matches for "Displayed" and "x exception". However, when I modify the list to just have "ERR:", it will pick it up, where it would miss it in my longer list of search terms.
I'm very new to Python/coding, but does anyone have an idea of what might be going on?
CODE
import os
import shutil
import datetime
source_folder = input("Enter the source folder path: ")
search_text_list = ["x exception", "Displayed", "!!!!!", "thermal event", "ERR:", "WRN", "InstrumentMonitorEvent"]
target_folder_name = "NovaSeq 6000 Parsing"
match_file_folder_name = "NovaSeq 6000 Analyzer Output_" + str(datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S"))
match_file_info = "Matched Search Terms.txt"
desktop = os.path.join(os.path.join(os.environ['USERPROFILE']), 'Desktop')
target_folder = os.path.join(desktop, target_folder_name)
match_file_folder = os.path.join(target_folder, match_file_folder_name)
match_file_path = os.path.join(match_file_folder, match_file_info)
if not os.path.exists(target_folder):
    os.makedirs(target_folder)
if not os.path.exists(match_file_folder):
    os.makedirs(match_file_folder)
#Find all the matched search terms defined in the Search_text_list and copy them to the folder as well as make a text file with the matched fw
matched_search_terms = []
for root, dirs, files in os.walk(source_folder):
    if "ETF" in dirs:
        dirs.remove("ETF")
    for file in files:
        if "Warnings_And_Errors" in file:
            continue
        if "RunSetup" in file:
            continue
        if "Wash" in file:
            continue
        full_path = os.path.join(root, file)
        with open(full_path, 'r', errors='ignore') as f:
            file_content = f.read().lower()
            for search_text in search_text_list:
                if search_text.lower() in file_content:
                    matched_search_terms.append(search_text)
                    shutil.copy2(full_path, match_file_folder)
                    break
with open(match_file_path, 'w') as f:
    if len(matched_search_terms) > 0:
        f.write("\n".join(matched_search_terms))
    else:
        f.write("NA")
I have confirmed that my search text list works, but there seems to be a limit on the number of terms I can add to my list that will match successfully.
You only get the first matching search term for each file, because you break out of the loop after the first match.
Instead of a loop, use a list comprehension to get all the matches, then add that to the list.
with open(full_path, 'r', errors='ignore') as f:
    file_content = f.read().lower()
    matching_terms = [term for term in search_text_list if term.lower() in file_content]
    if matching_terms:
        matched_search_terms.extend(matching_terms)
        shutil.copy2(full_path, match_file_folder)
You might also want to make matched_search_terms a set rather than a list, so you don't get lots of duplicates.
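For example, a small sketch of that change (not from the original answer), reusing the variables above; only the container, the extend call, and the final write change:

matched_search_terms = set()                          # a set instead of a list

# ...inside the os.walk() loop, after matching_terms is built:
if matching_terms:
    matched_search_terms.update(matching_terms)       # update() skips terms already seen
    shutil.copy2(full_path, match_file_folder)

# ...when writing the summary file:
with open(match_file_path, 'w') as f:
    f.write("\n".join(sorted(matched_search_terms)) if matched_search_terms else "NA")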
Here is the updated code that is only matching 1 file now. I'm now matching Displayed, WRN, InstrumentMonitorEvent, which I was previously missing.
import os
import shutil
import datetime
source_folder = input("Enter the source folder path: ")
search_text_list = ["x exception", "Displayed", "!!!!!", "thermal event", "ERR:", "WRN", "InstrumentMonitorEvent"]
target_folder_name = "NovaSeq 6000 Parsing"
match_file_folder_name = "NovaSeq 6000 Analyzer Output_" + str(datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S"))
match_file_info = "Matched Search Terms.txt"
desktop = os.path.join(os.path.join(os.environ['USERPROFILE']), 'Desktop')
target_folder = os.path.join(desktop, target_folder_name)
match_file_folder = os.path.join(target_folder, match_file_folder_name)
match_file_path = os.path.join(match_file_folder, match_file_info)
if not os.path.exists(target_folder):
    os.makedirs(target_folder)
if not os.path.exists(match_file_folder):
    os.makedirs(match_file_folder)
#Find all the matched search terms defined in the Search_text_list and copy them to the folder as well as make a text file with the matched fw
matched_search_terms = []
for root, dirs, files in os.walk(source_folder):
    for file in files:
        if "Warnings_And_Errors" in file:
            continue
        if "RunSetup" in file:
            continue
        if "Wash" in file:
            continue
        full_path = os.path.join(root, file)
        with open(full_path, 'r', errors='ignore') as f:
            file_content = f.read().lower()
            matching_terms = [term for term in search_text_list if term.lower() in file_content]
            if matching_terms:
                matched_search_terms.extend(matching_terms)
                shutil.copy2(full_path, match_file_folder)
                break
with open(match_file_path, 'w') as f:
    if len(matched_search_terms) > 0:
        f.write("\n".join(matched_search_terms))
    else:
        f.write("NA")
I want to count the number of times the "genre" tag occurs in some xml files. If it is found more than once, I want to print that more than one was found, along with the filename it's found in. The code below reports that every file has the genre tag more than once and prints "genre count is > 1" for every file. How can I run the check separately for each xml file?
#!/usr/bin/python3
import os
import xml.etree.ElementTree as ET
mypath = "somepath"
def check_genre():
    genrelist = []
    count = 0
    try:
        for base, dirs, file in os.walk(mypath):
            dirs[:] = [d for d in dirs if not d[0] == '.']
            for file in file:
                file = os.path.join(base, file)
                if file.endswith(".".lower()):
                    tree = ET.parse(file)
                    root = tree.getroot()
                    gene_tag = root.find("genre")
                    for genre_tag in root.iter('genre'):
                        count += 1
                    if count > 1:
                        print("genre count is > 1", file)
Can anybody help me with a command-prompt, script, or Python approach to getting file details?
Scenario:
A folder contains many subfolders. How can I find out what file formats are present in those folders, and how can I get the paths of all those files?
That is, I need the distinct file names, formats, and paths of the files present under a folder and its subfolders.
Is there any way to get that, or is manual effort the only option?
To recursively list all files in folders and sub-folders in Python:
Glob [docs]
from glob import glob
glob("**", recursive=True)
OS Walk [docs]
import os
list(os.walk("./"))
import os, csv
import glob
import pandas as pd
import ast

dir_path = r'<path of directory>'
extension_output_path = r"<path of output file. Path where u want to save output in csv format>"
output_filenames_path = r"<path of output file. Path where u want to save output in csv format>"

exts = set(f.split('.')[-1] for dir, dirs, files in os.walk(dir_path) for f in files if '.' in f)
exts = list(set(exts))
subdirs = [x[0] for x in os.walk(dir_path)]
print(exts)

big_list = []
bigg_list = []

def FindMaxLength(lst):
    maxLength = max(map(len, lst))
    return maxLength

for dirs in subdirs:
    split_dirs = dirs.split('\\')
    big_list.append(split_dirs)

big_list_count = FindMaxLength(big_list)

for subdis in big_list:
    count_val = big_list_count - len(subdis)
    bigg_list.append(subdis + [''] * count_val + ['/'.join(subdis)])

output_list = []
path_list = []
for subbs in bigg_list:
    big_dict = {}
    for ext in exts:
        tifCounter = len(glob.glob1(subbs[-1], "*." + ext))
        filenames = glob.glob1(subbs[-1], "*." + ext)
        if filenames != []:
            val = list(map((subbs[-1] + '/').__add__, filenames))
            if len(val) > 1:
                for li in val:
                    path_list.append([ext, li])
            else:
                path_list.append([ext] + val)
        if tifCounter != 0:
            big_dict[ext] = tifCounter
    output_list.append(subbs + [big_dict])

columns_row = ['col'] * (big_list_count + 1) + ['val'] + exts

with open(extension_output_path, 'w', newline='') as csv_file:
    csv_wr = csv.writer(csv_file)
    csv_wr.writerow(columns_row)
    csv_wr.writerows(output_list)

cv = pd.read_csv(extension_output_path)
for index, row in cv.iterrows():
    for ext in exts:
        if row['val'] != '{}' and ext in ast.literal_eval(row['val']):
            cv.loc[index, ext] = ast.literal_eval(row['val'])[ext]
del cv['val']
cv.to_csv(extension_output_path, index=False)

with open(output_filenames_path, 'w', newline='') as csv_file:
    csv_wr = csv.writer(csv_file)
    csv_wr.writerow(['extension', 'filename'])
    csv_wr.writerows(path_list)

print("completed")
The first output file will contain each folder/subfolder path with the count of files per extension; the second lists each extension together with the matching file paths.
I found some code online, and modified it to list all folders in an FTP directory. I have all the folders listed, with the code below.
import ftplib
from ftplib import FTP
ftp = FTP()
import datetime
filenames = []
data = []
ftp = ftplib.FTP('ftp.something.com', 'u_name', 'pswd')
def get_dirs_ftp(folder=""):
contents = ftp.nlst(folder)
folders = []
for item in contents:
if "." not in item:
folders.append(item)
return folders
def get_all_dirs_ftp(folder=""):
dirs = []
new_dirs = []
new_dirs = get_dirs_ftp(folder)
while len(new_dirs) > 0:
for dir in new_dirs:
dirs.append(dir)
old_dirs = new_dirs[:]
new_dirs = []
for dir in old_dirs:
for new_dir in get_dirs_ftp(dir):
new_dirs.append(new_dir)
dirs.sort()
return dirs
allfiles = []
all_dirs = get_all_dirs_ftp()
Using the code above, I confirmed that the hierarchy is correct. Now, I am trying to loop through this list of folders and subfolders, and drill down to the files in each. This is where the problem occurs. Here's the rest of my code.
for dir in all_dirs:
    ftp.cwd(dir)
    ftp.retrlines('LIST')
    filenames = []
    ftp.retrlines('NLST', filenames.append)
    # writes file name and modified date and file size, to dataframe
    #data = []
    for filename in filenames:
        filename
        modifiedTimeFtp = datetime.datetime.strptime(datetime[4:], "%Y%m%d%H%M%S").strftime("%d %b %Y %H:%M:%S")
        size = ftp.size(filename)
        filesize = "{:.2f}".format(size/(1024)) + 'kb'
        finaldata = (str(filename) + '|' + str(modifiedTimeFtp) + '|' + str(filesize))
        allfiles.append(finaldata,'\n')
Now, when I run this section of code, I get this error: TypeError: 'module' object is not subscriptable
I'm thinking that the problem lies in this range.
ftp.cwd(dir)
ftp.retrlines('LIST')
filenames = []
ftp.retrlines('NLST', filenames.append)
That's my guess, but I don't know for sure. Is there an easy way to get this working? I almost feel like this is mission impossible, because the FTP folder that I'm querying is pretty massive, and I'm guessing there can be all kinds of timeouts, or whatever, while the task is running. All I'm trying to do is get the file name, file date/time modified, and file size. Thanks for the look.
Here is the final, working, version.
import ftplib
from ftplib import FTP
ftp = FTP()
from datetime import datetime
filenames = []
data = []
ftp = ftplib.FTP('ftp.anything.com', 'u_name', 'ps_wd')
def get_dirs_ftp(folder=""):
contents = ftp.nlst(folder)
folders = []
for item in contents:
if "." not in item:
folders.append(item)
return folders
def get_all_dirs_ftp(folder=""):
dirs = []
new_dirs = []
new_dirs = get_dirs_ftp(folder)
while len(new_dirs) > 0:
for dir in new_dirs:
dirs.append(dir)
old_dirs = new_dirs[:]
new_dirs = []
for dir in old_dirs:
for new_dir in get_dirs_ftp(dir):
new_dirs.append(new_dir)
dirs.sort()
return dirs
#allfiles = []
# get parent and child folders in directory
all_dirs = get_all_dirs_ftp()
# create a list to append metadata
dir_list = []
for dir in all_dirs:
ftp.cwd('/'+dir+'/')
print(dir)
dir_list.append(dir)
ftp.dir(dir_list.append)
len(dir_list)
# you probably want to dump the results to a file...
outF = open('C:/your_path/filenames.csv', 'w')
for line in dir_list:
# write line to output file
outF.write(line)
outF.write("\n")
outF.close()
print('Done!!')
You should take a look at the stack trace to find the actual line that is causing the error.
From your code the fault appears to be this line:
modifiedTimeFtp = datetime.datetime.strptime(datetime[4:], "%Y%m%d%H%M%S").strftime("%d %b %Y %H:%M:%S")
datetime here refers to the module, but you presumably meant a variable holding the date string, so datetime[4:] is what raises the error.
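One common way to get the value that slice was presumably aiming for (a sketch, not from the original answer, and assuming the FTP server supports the MDTM command, whose reply looks like "213 YYYYMMDDHHMMSS") is:

from datetime import datetime

for filename in filenames:
    resp = ftp.voidcmd("MDTM " + filename)             # e.g. "213 20240131120000"
    modifiedTimeFtp = datetime.strptime(resp[4:].strip(), "%Y%m%d%H%M%S").strftime("%d %b %Y %H:%M:%S")
    size = ftp.size(filename)
    filesize = "{:.2f}".format(size / 1024) + 'kb'
    allfiles.append(str(filename) + '|' + modifiedTimeFtp + '|' + filesize)   # append() takes a single argument

This reuses ftp, filenames, and allfiles from the question's loop.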
I am trying to write a script that tracks changes made to the directories/files under several paths created by an installer. I found Thomas Sileo's DirTools project on git and modified it, but am now running into some issues when writing/reading JSON:
1) First, I believe that I am writing the JSON incorrectly: my create_state() function only writes the state for the last path I need.
2) If I do get it working, I am unable to read/parse the file like I was before; I usually get ValueError: Extra data errors.
Code below:
import os
import json
import getpass

files = []
subdirs = []

USER = getpass.getuser()

pathMac = ['/Applications/',
           '/Users/' + USER + '/Documents/']

def create_dir_index(path):
    files = []
    subdirs = []
    for root, dirs, filenames in os.walk(path):
        for subdir in dirs:
            subdirs.append(os.path.relpath(os.path.join(root, subdir), path))
        for f in filenames:
            files.append(os.path.relpath(os.path.join(root, f), path))
    return dict(files=files, subdirs=subdirs)

def create_state():
    for count in xrange(len(pathMac)):
        dir_state = create_dir_index(pathMac[count])
        out_file = open("Manifest.json", "w")
        json.dump(dir_state, out_file)
        out_file.close()
def compare_states(dir_base, dir_cmp):
    '''
    return a comparison of two manifest json files
    '''
    data = {}
    data['deleted'] = list(set(dir_cmp['files']) - set(dir_base['files']))
    data['created'] = list(set(dir_base['files']) - set(dir_cmp['files']))
    data['deleted_dirs'] = list(set(dir_cmp['subdirs']) - set(dir_base['subdirs']))
    data['created_dirs'] = list(set(dir_base['subdirs']) - set(dir_cmp['subdirs']))
    return data

if __name__ == '__main__':
    response = raw_input("Would you like to Compare or Create? ")
    if response == "Create":
        # CREATE MANIFEST json file
        create_state()
        print "Manifest file created."
    elif response == "Compare":
        # create the CURRENT state of all indexes in pathMac and write to json file
        for count in xrange(len(pathMac)):
            dir_state = create_dir_index(pathMac[count])
            out_file = open("CurrentState.json", "w")
            json.dump(dir_state, out_file)
            out_file.close()
        # Open and Load the contents from the file into dictionaries
        manifest = json.load(open("Manifest.json", "r"))
        current = json.load(open("CurrentState.json", "r"))
        print compare_states(current, manifest)
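For the first issue, the likely cause is that both create_state() and the Compare branch reopen the .json file with mode "w" on every pass through the loop, so only the last path's index survives. One possible fix (a sketch, not the original DirTools approach) is to build a single dict keyed by path and dump it once, which also keeps the file a single JSON document and so avoids the ValueError: Extra data that shows up when several JSON objects end up in one file:

def create_state():
    # index every path, then write the whole mapping to disk once
    state = {path: create_dir_index(path) for path in pathMac}
    with open("Manifest.json", "w") as out_file:
        json.dump(state, out_file)

# when comparing, read the mapping back and check each path separately
with open("Manifest.json") as f:
    manifest = json.load(f)
for path, old_state in manifest.items():
    current = create_dir_index(path)
    print compare_states(current, old_state)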