I have a Python program that I converted from .py to .exe. However, when I drag and drop a folder (whose files I want to compare against other files) onto the executable, it doesn't work: it just creates my output folder and exits. Here is the Python code — where should I modify it so the program runs on the drag-and-dropped folder? Thank you.
import difflib
import os
from pathlib import Path
import shutil
import time
ts = time.time()

# "Before" folder: collect every .htm file inside it.
fold1_path = input("First folder:")
source_folders = [fold1_path]
list1 = list(Path(source_folders[0]).glob("*.htm"))

# Working/output folder on the user's Desktop for the diff reports.
x = os.path.expanduser("~") + r"\Desktop\rezultat_script"
path = x
if not os.path.exists(path):
    os.makedirs(path)

# Copy the "before" files there so the originals stay untouched.
for original in list(list1):
    shutil.copy(original, path)

# "After" folder: the files to compare against.
fold2_path = input("Second folder:")
list2 = list(Path(fold2_path).glob("*.htm"))
def compare(folder1, folder2):
    """Overwrite each file in *folder1* with an HTML diff report comparing
    it to the file at the same position in *folder2*.

    folder1 / folder2: parallel sequences of file paths (the copies placed
    in the result folder vs. the files of the second folder).

    FIX: the original nested an inner loop that wrote the *same*
    ``difference`` (whichever pair the outer loop was on) into *every*
    file of folder1, so all reports ended up holding the diff of the last
    pair only.  It also never closed its file handles and sized the loop
    on the global ``list1`` instead of the parameter.
    """
    for num in range(len(folder1)):
        show_pre = "Before editing:"
        show_post = "After editing:"
        # Read both versions, closing the handles promptly.
        with open(folder1[num]) as pre_file:
            show_pre_lines = pre_file.readlines()
        with open(folder2[num]) as post_file:
            show_post_lines = post_file.readlines()
        difference = difflib.HtmlDiff(wrapcolumn=100).make_file(
            show_pre_lines, show_post_lines, show_pre, show_post)
        # Write this pair's report into its own file only.
        with open(folder1[num], "w+") as difference_report:
            difference_report.write(difference)
# Diff the copied files (now living in x) against the second folder.
copied_reports = list(Path(x).glob("*.htm"))
compare(copied_reports, list2)
print("Script done!")

# Report elapsed wall-clock time.
total_time = time.time() - ts
print("Total time used: {:10.2f}".format(total_time))
Related
So basically, I'm creating a directory that allows users to put csv files in there. But I want to create python script that would look in that folder everyday at a given time (lets say noon) and pick up the latest file that was placed in there if it's not over a day old. But I'm not sure if that's possible.
It's this chunk of code that I would like to run if the app finds a new file in the desired directory:
def better_Match(results, best_percent = "Will need to get the match %"):
    """Return the highest-scoring match from *results* as a dict, or {}.

    Each element of *results* is an iterable of items with ``.name`` and
    ``.text`` attributes; one of the names is assumed to be 'score'.
    The best candidate is kept only if its score reaches *best_percent*.
    """
    best = {}
    parsed = [{item.name: item.text for item in hit} for hit in results]
    if parsed:
        scores = [float(entry['score']) for entry in parsed]
        # Index of the maximum score.
        top = max(enumerate(scores), key=lambda pair: pair[1])[0]
        logger.debug('MRCs:{}, Chosen MRC:{}'.format(scores, scores[top]))
        logger.debug(parsed[top])
        if float(parsed[top]['score']) >= float(best_percent):
            best = parsed[top]
    return best
def clean_plate_code(platecode):
    """Normalize *platecode* to exactly five characters: strip leading
    zeros, left-pad back to five with zeros, and truncate to five."""
    text = str(platecode)
    return text.lstrip('0').zfill(5)[:5]
def re_ch(file_path, orig_data, return_columns = ['ex_opbin']):
    """Combine every chunk CSV under *file_path* and left-merge the
    requested columns back onto *orig_data*.

    Columns prefixed with 'req_' in the chunks are treated as the join
    keys shared with *orig_data*; both sides are compared as strings.
    """
    chunk_files = list(file_path.glob('*.csv'))
    frames = [pd.read_csv(f, sep=None, dtype=object, engine='python')
              for f in tqdm(chunk_files, desc='Combining ch', unit='chunk')]
    combined = pd.concat(frames)

    # Derive the shared key names, then drop the 'req_' prefix in place.
    shared_columns = [name.replace('req_', '')
                      for name in combined.columns if name.startswith('req_')]
    combined.columns = combined.columns.str.replace("req_", "")

    keep = return_columns + shared_columns
    combined = combined[keep]

    # Join keys must compare equal as strings on both sides.
    for column in shared_columns:
        combined[column] = combined[column].astype(str)
        orig_data[column] = orig_data[column].astype(str)

    return orig_data.merge(combined, how='left', on=shared_columns)
For running script at certain time:
You can use cron for linux.
In windows you can use windows scheduler
Here is an example for getting latest file in directory
# Build full paths for every entry in output_folder, keep only regular
# files, and pick the newest by creation time.
files = [os.path.join(output_folder, name) for name in os.listdir(output_folder)]
files = [candidate for candidate in files if os.path.isfile(candidate)]
latest_file = max(files, key=os.path.getctime)
This will do the job!
import os
import time
import threading
import pandas as pd
DIR_PATH = 'DIR_PATH_HERE'
def create_csv_file():
    """One-time bootstrap: snapshot the current contents of DIR_PATH into
    files.csv so later runs can detect additions.

    Does nothing if files.csv already exists.  'files.csv' itself is
    appended to the snapshot so its own appearance does not count as a
    new file.  NOTE(review): files.csv is written relative to the current
    working directory while the listing comes from DIR_PATH — this only
    lines up when the script runs from inside DIR_PATH; confirm.
    """
    # FIX: the original carried a pointless `else: None` branch.
    if not os.path.exists('files.csv'):
        list_of_files = os.listdir(DIR_PATH)
        list_of_files.append('files.csv')
        pd.DataFrame({'files': list_of_files}).to_csv('files.csv')
def check_for_new_files():
    """Daily scan: compare the current DIR_PATH listing against the
    snapshot in files.csv, act on any change, and refresh the snapshot."""
    create_csv_file()
    files = pd.read_csv('files.csv')
    list_of_files = os.listdir(DIR_PATH)
    # A length difference means something was added (or removed).
    if len(files.files) != len(list_of_files):
        print('New file added')
        # do what you want
        # save your excel with the name sample.xlsx
        # FIX: extension was misspelled 'sample.xslx' in the original;
        # using .xlsx makes the recorded name match the comment's intent
        # ("so you will not have the sample.xlsx twice if run again").
        list_of_files.append('sample.xlsx')
        # set() removes duplicates so the sample file is not recorded twice.
        list_of_files = list(set(list_of_files))
        # Persist the refreshed snapshot.
        pd.DataFrame({'files': list_of_files}).to_csv('files.csv')
    print('Finished for the day!')
stopper = threading.Event()
# Event.wait(86400) blocks for 24 h (or until the event is set) and
# returns False on timeout, so this re-runs the scan once per day.
while not stopper.wait(86400):
    check_for_new_files()
It uses a threading.Event to wake up every 86400 s (24 h), keeps a record of the directory's current files in files.csv (stored next to the .py file), and each day appends to files.csv any new files that are not already recorded there.
I am trying to pull a list of top level directories under C:/Folder.
I want to then check the modified date for each directory. I can use
next(os.walk(source_dir))[1]
to get a list of directories.
But when I try and use
os.stat(str(dir_list))[stat.ST_MTIME]
to get the modified date for each directory, it seems to be searching for all directory names as one large name.
This gives me a File Not Found error.
How can I get the modified date for each subdirectory without getting a File Not Found error?
import os.path
import time, stat
# Source directory
source_dir = r'C:/Folder'
def check_dir():
    """Scan source_dir's top-level subdirectories and report any modified
    in the last 90 days.

    NOTE(review): this is the broken version under discussion — os.stat()
    is called on str(dir_list), i.e. the string form of the WHOLE list,
    not on each directory, which raises FileNotFoundError.
    """
    if not os.path.isdir(source_dir):
        # Needed to detect top level files
        print("Nothing in top folder.")
    else:
        dir_list = next(os.walk(source_dir))[1]
        print(dir_list)
        # get modified time for directories
        # BUG: str(dir_list) stringifies the entire list into one bogus
        # path, so this stat lookup fails with FileNotFoundError.
        dtime = os.stat(str(dir_list))[stat.ST_MTIME]
        # 7776000 seconds = 90 days
        if time.time() - dtime <= 7776000:
            print("Found modification in last 90 days for folder =>", dir_list, time.ctime(dtime))
if __name__ == "__main__":
    check_dir()
You can't call os.stat on a list; you have to iterate over the list:
def check_dir():
    """Report every top-level subdirectory of source_dir that was modified
    in the last 90 days.

    Unlike the broken version, this stats each subdirectory's full path
    individually instead of passing the whole list to os.stat.
    """
    if not os.path.isdir(source_dir):
        # Needed to detect top level files
        print("Nothing in top folder.")
    else:
        root, dir_list, _ = next(os.walk(source_dir))
        print(dir_list)
        # Get the modified time of each directory via its full path.
        # FIX: os.path.join replaces manual `root + os.path.sep + dir`
        # concatenation, and `subdir` avoids shadowing the builtin `dir`.
        for subdir in dir_list:
            dtime = os.stat(os.path.join(root, subdir))[stat.ST_MTIME]
            # 7776000 seconds = 90 days
            if time.time() - dtime <= 7776000:
                print("Found modification in last 90 days for folder =>", subdir, time.ctime(dtime))
if __name__ == "__main__":
    check_dir()
The problem is that os.walk gives you a relative path to the source_dir for example:
import os
source_dir = '/tmp/'
relative_paths = [relative_path for relative_path in next(os.walk(source_dir))[1]]
print(relative_paths)
>>> ['com.google.Keystone', 'com.apple.launchd.liNmfkel1M', 'powerlog']
In order to get the stats you need to use the full path of each folder, for example:
import os
import stat
source_dir = '/tmp/'
full_paths = [os.path.join(source_dir, relative_path) for relative_path in next(os.walk(source_dir))[1]]
print(full_paths)
>>> ['/tmp/com.google.Keystone', '/tmp/com.apple.launchd.liNmfkel1M', '/tmp/powerlog']
And then you can use them:
import os
import stat
source_dir = '/tmp/'
full_paths = [os.path.join(source_dir, relative_path) for relative_path in next(os.walk(source_dir))[1]]
stats = [os.stat(path)[stat.ST_MTIME] for path in full_paths]
print(stats)
>>> [1581523169, 1581523164, 1581523141]
I have the code below to list the Excel files in a folder, read each file with pandas, apply a fill-down (ffill) and other clean-up to each, and compile all of the files into one DataFrame.
Below is the single-threaded version of the code, and it works:
Folder = "Folder Name"  # sub-folder (under DefaultPath) holding the workbooks
StartRead = 2  # number of leading header rows to skip in each workbook
num_cores = 2  # worker count (used by the parallel variant)
DefaultPath = "C:\\Users\\"  # base directory
path = DefaultPath + Folder  # full path of the folder being scanned
file_identifier = "*.xlsx"  # workbook filename pattern
def reader(filename):
    """Read one workbook, skipping StartRead header rows, and forward-fill
    blank cells down each column."""
    frame = pd.read_excel(filename, skiprows=StartRead)
    return frame.fillna(method='ffill')
def load_serial():
    """Read every workbook matching file_identifier under `path`,
    sequentially, and return them concatenated into one DataFrame.

    FIX: drops the dead `dfs = pd.DataFrame()` initializer, replaces the
    manual append loop with a comprehension, and returns an empty frame
    instead of crashing in pd.concat([]) when no file matches.
    """
    file_list = glob2.glob(path + "\\*" + file_identifier)
    frames = [reader(f) for f in file_list]
    return pd.concat(frames) if frames else pd.DataFrame()
data = load_serial()
I'm trying to do the same in parallel, since the actual job compiles around ~2000 Excel files.
But somehow nothing shows up after running it in Jupyter with 10 test files, just like above:
import time
import glob2
import pandas as pd
import multiprocessing as mp
from joblib import Parallel, delayed
import os
import concurrent.futures
Folder = "Folder"  # sub-folder (under DefaultPath) holding the workbooks
StartRead = 2  # number of leading header rows to skip in each workbook
num_cores = 2  # size of the multiprocessing pool
DefaultPath = "C:\\Users\\"  # base directory
path = DefaultPath + Folder  # full path of the folder being scanned
file_identifier = "*.xlsx"  # workbook filename pattern
def reader(filename):
    """Load a single Excel file, skip the first StartRead rows, and
    propagate the last valid value into NaN gaps (forward fill)."""
    loaded = pd.read_excel(filename, skiprows=StartRead)
    filled = loaded.fillna(method='ffill')
    return filled
def load_serial():
    """Sequential fallback: read every matching workbook under `path` and
    concatenate the resulting frames into one DataFrame."""
    collected = []
    for workbook in glob2.glob(path + "\\*" + file_identifier):
        collected.append(reader(workbook))
    dfs = pd.concat(collected)
    return dfs
start1 = time.time()
if __name__ == "__main__":
    # The fan-out must stay under the __main__ guard so Windows'
    # spawn-based multiprocessing does not re-execute it in every child.
    file_list = glob2.glob(path + "\\*" + file_identifier)
    pool = mp.Pool(num_cores)
    try:
        # FIX: map_async() only returns an AsyncResult; the original never
        # called .get(), so the frames were silently discarded — which is
        # why the run appeared to produce nothing.
        list_of_results = pool.map_async(reader, file_list).get()
    finally:
        pool.close()
        pool.join()
    # Combine the per-file frames, mirroring the serial version's output.
    data = pd.concat(list_of_results) if list_of_results else pd.DataFrame()
end1 = time.time()
Did i miss something? Thank you
this is tested on my dual core notebook with 10 file
the actual run will be on my 32 core server running windows, with ~2000 excel file in each folder
Thank you
Edit:
Finally I have it running.
First, the function `reader` needs to live in a separate file.
Then wrap the main code in an `if __name__ == '__main__':` guard.
I've tested it on my main 32-core machine and it properly spools up all the cores.
I will post the fixed code tomorrow; I'm really sleepy right now...
I made a program that automatically copies a USB device.
When it copies the USB drive it creates one folder in the correct destination, and another folder in the same path as the Python program. I want it to create only one folder, in the correct destination! Thanks.
this is the code:
import shutil
from array import *
import math
import time
import os
import sys
import random
import datetime
def data():
    """Refresh the global timestamp strings: ``now_date`` (formatted as
    '%d %b %H.%M') and ``minuti`` (the current minute, '%M')."""
    global now_date, minuti
    moment = datetime.datetime.now()
    now_date = moment.strftime("%d %b %H.%M")
    minuti = moment.strftime("%M")
data()
old_date = now_date  # timestamp string captured at start-up
# Candidate drive letters to probe for the USB stick.
alfabeto = ['A:','B:','F:','G:','H:','I:','L:','M:','N:','O:',] # e, c, d are missing
a = (r'')
# One random character appended to the destination name to keep it unique.
b=random.choice('abcdefghilmnopqrstuvz1234567890èòàù')
# NOTE(review): new_dir is a bare date string, i.e. a *relative* path.
new_dir = '{}'.format(now_date)
# NOTE(review): a raw string cannot end with a single backslash, so this
# literal actually keeps a trailing \' inside the value — verify the path.
inc = (r'C:\documenti\program\file\collegamenti\'')
incollaa = "".join([inc, new_dir,' ',b])
i=0  # index into alfabeto
while True:
    try:
        # Cycle through the candidate drive letters forever.
        if i==10: i=0
        time.sleep(1)
        copiaa = "".join([a, alfabeto[i]])
        i=i+1
        # Copy the whole drive's tree into the destination folder.
        shutil.copytree(copiaa,incollaa)
        # BUG (explained below): new_dir is a bare date string, so this
        # creates a second folder relative to the script's working
        # directory rather than inside the intended destination.
        if not os.path.exists(new_dir):
            os.makedirs(new_dir)
        break
    except FileNotFoundError:
        # Drive letter not present — try the next one.
        pass
Your problem is the following lines:
if not os.path.exists(new_dir):
os.makedirs(new_dir)
Since new_dir is a relative path (a date string), it will be created in the working folder of your script.
I've got a small script which monitors when files are added to or removed from a directory. The next step is to get the script to execute the files (Windows batch files) once they've been added to the directory. I'm struggling to understand how to use a variable with subprocess.call (if that is the best way this can be achieved). Could anyone help me please? Many thanks. The code looks like this so far:
import sys
import time
import os
inputdir = 'c:\\test\\'
os.chdir(inputdir)
contents = os.listdir(inputdir)
count = len(inputdir)  # NOTE(review): length of the path string, not a file count
dirmtime = os.stat(inputdir).st_mtime
while True:
    newmtime = os.stat(inputdir).st_mtime
    # The directory mtime changes whenever an entry is added or removed.
    if newmtime != dirmtime:
        dirmtime = newmtime
        newcontents = os.listdir(inputdir)
        added = set(newcontents).difference(contents)
        if added:
            print "These files added: %s" %(" ".join(added))  # Python 2 print
            import subprocess
            # NOTE(review): '%' is the asker's placeholder, not valid
            # Python — the question is what variable to pass here.
            subprocess.call(%,shell=True)
        removed = set(contents).difference(newcontents)
        if removed:
            print "These files removed: %s" %(" ".join(removed))
        contents = newcontents
    time.sleep(15)
This should do what you wanted, cleaned it up a little.
import sys
import time
import os
import subprocess
def monitor_execute(directory):
    """Poll *directory* every 15 s; run (via the shell) each newly added
    file and report files that disappeared.  Runs forever.

    NOTE(review): Python 2 code (print statements).  New files are
    executed by bare name, which relies on the process's working
    directory being *directory* — confirm the caller chdirs there first.
    """
    dir_contents = os.listdir(directory)
    last_modified = os.stat(directory).st_mtime
    while True:
        time.sleep(15)
        modified = os.stat(directory).st_mtime
        # Unchanged mtime -> nothing added/removed; skip the set-diff work.
        if last_modified == modified:
            continue
        last_modified = modified
        current_contents = os.listdir(directory)
        new_files = set(current_contents).difference(dir_contents)
        if new_files:
            print 'Found new files: %s' % ' '.join(new_files)
            # shell=True lets Windows resolve and run .bat files by name.
            for new_file in new_files:
                subprocess.call(new_file, shell=True)
        lost_files = set(dir_contents).difference(current_contents)
        if lost_files:
            print 'Lost these files: %s' % ' '.join(lost_files)
        dir_contents = current_contents