I have a Python script that I managed to turn into an .exe, but when I drag and drop a folder with my files onto it to compare them with other files, it doesn't work: it just creates my output folder and exits. Here is the Python code. Where should I modify the code so the program runs when I drag and drop the folder? Thank you.
import difflib
import os
from pathlib import Path
import shutil
import time

ts = time.time()

fold1_path = input("First folder:")
path1 = []
path1.append(fold1_path)
list1 = list(Path(path1[0]).glob("*.htm"))

## Create empty folder for the result
x = os.path.expanduser("~") + r"\Desktop\rezultat_script"
path = x
if not os.path.exists(path):
    os.makedirs(path)

# Copy files into different folder for processing
files = list1.copy()
for f in files:
    shutil.copy(f, path)

fold2_path = input("Second folder:")
list2 = list(Path(fold2_path).glob("*.htm"))

def compare(folder1, folder2):
    for num in range(len(list1)):
        show_pre = "Before editing:"
        show_post = "After editing:"
        show_pre_lines = open(folder1[num]).readlines()
        show_post_lines = open(folder2[num]).readlines()
        difference = difflib.HtmlDiff(wrapcolumn=100).make_file(show_pre_lines, show_post_lines, show_pre, show_post)
        for file in range(len(folder1)):
            difference_report = open(folder1[file], "w+")
            difference_report.write(difference)
            difference_report.close()

fold3_path = list(Path(x).glob("*.htm"))
compare(fold3_path, list2)

print("Script done!")
te = time.time()
total_time = te - ts
print("Total time used: {:10.2f}".format(total_time))
I have a folder that contains thousands of folders, under which there are thousands of files.
cb = []
for root, dirs, files in os.walk(dir):
    for name in files:
        filepath = root + os.sep + name
        df = pd.read_csv(filepath, index_col=False)
        df['TimeStamp'] = pd.to_datetime(df.TimeStamp, format='%Y-%m-%d %H:%M:%S')
        date = df['TimeStamp'].dt.date.values[0]
        time = df['TimeStamp'].dt.time.values[0]
        if (df.shape[0] > 0):
            cb.append({'Time': time, 'Date': date})
I need to open all the files, do some data processing on them, and append the results to an empty dataframe. Doing it sequentially takes days to run. Is there a way I can use multiprocessing/threading to reduce the time without skipping any files in the process?
You can put the per-file work into a separate function and then use a multiprocessing pool to push the processing to separate processes. This helps with CPU-bound calculations, but the file reads will take just as long as in your original serial processing. The trick with multiprocessing is to keep the amount of data flowing through the pool itself to a minimum. Since you only pass a file name and return a couple of datetime objects in this example, you're good on that point.
import multiprocessing as mp
import pandas as pd
import os

def worker(filepath):
    df = pd.read_csv(filepath, index_col=False)
    if df.shape[0] > 0:
        # Only touch row 0 after confirming the file isn't empty
        df['TimeStamp'] = pd.to_datetime(df.TimeStamp, format='%Y-%m-%d %H:%M:%S')
        date = df['TimeStamp'].dt.date.values[0]
        time = df['TimeStamp'].dt.time.values[0]
        return {'Time': time, 'Date': date}
    else:
        return None

if __name__ == "__main__":
    csv_files = [root + os.sep + name
                 for root, dirs, files in os.walk(dir)
                 for name in files]
    with mp.Pool() as pool:
        cb = [result for result in pool.map(worker, csv_files, chunksize=1)
              if result]
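If the directory tree also holds non-CSV files, it may be worth filtering by extension when building the list, so the pool never receives a file read_csv cannot parse. A small sketch under that assumption:

csv_files = [root + os.sep + name
             for root, dirs, files in os.walk(dir)
             for name in files
             if name.lower().endswith('.csv')]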
So basically, I'm creating a directory that users can put CSV files into. I want to write a Python script that looks in that folder every day at a given time (let's say noon) and picks up the latest file placed there, if it's not over a day old. But I'm not sure if that's possible.
It's this chunk of code that I would like to run if the app finds a new file in the desired directory:
import logging
import pandas as pd
from tqdm import tqdm

logger = logging.getLogger(__name__)  # assumed; the original snippet references an existing logger

def better_Match(results, best_percent="Will need to get the match %"):
    result = {}
    result_list = [{item.name: item.text for item in result} for result in results]
    if result_list:
        score_list = [float(item['score']) for item in result_list]
        match_index = max(enumerate(score_list), key=lambda x: x[1])[0]
        logger.debug('MRCs:{}, Chosen MRC:{}'.format(score_list, score_list[match_index]))
        logger.debug(result_list[match_index])
        above_threshold = float(result_list[match_index]['score']) >= float(best_percent)
        if above_threshold:
            result = result_list[match_index]
    return result

def clean_plate_code(platecode):
    return str(platecode).lstrip('0').zfill(5)[:5]

def re_ch(file_path, orig_data, return_columns=['ex_opbin']):
    list_of_chunk_files = list(file_path.glob('*.csv'))
    cb_ch = [pd.read_csv(f, sep=None, dtype=object, engine='python') for f in tqdm(list_of_chunk_files, desc='Combining ch', unit='chunk')]
    cb_ch = pd.concat(cb_ch)
    shared_columns = [column_name.replace('req_', '') for column_name in cb_ch.columns if column_name.startswith('req_')]
    cb_ch.columns = cb_ch.columns.str.replace("req_", "")
    return_columns = return_columns + shared_columns
    cb_ch = cb_ch[return_columns]
    for column in shared_columns:
        cb_ch[column] = cb_ch[column].astype(str)
        orig_data[column] = orig_data[column].astype(str)
    final = orig_data.merge(cb_ch, how='left', on=shared_columns)
    return final
For running the script at a certain time:
You can use cron on Linux.
On Windows, you can use the Task Scheduler.
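For example, a crontab entry to run the script every day at noon could look like this (the interpreter and script paths here are placeholders):

0 12 * * * /usr/bin/python3 /path/to/your_script.py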
Here is an example of getting the latest file in a directory:
files = os.listdir(output_folder)
files = [os.path.join(output_folder, file) for file in files]
files = [file for file in files if os.path.isfile(file)]
latest_file = max(files, key=os.path.getctime)
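To cover the "not over a day old" requirement from the question, you can compare the file's modification time with the current time. A minimal sketch, reusing latest_file from above (getmtime returns the last-modification time as a Unix timestamp):

import os
import time

ONE_DAY = 24 * 60 * 60  # seconds
if time.time() - os.path.getmtime(latest_file) <= ONE_DAY:
    print('Processing', latest_file)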
This will do the job!
import os
import time
import threading
import pandas as pd

DIR_PATH = 'DIR_PATH_HERE'

def create_csv_file():
    # Create a files.csv file that will contain all the current files.
    # This will run one time only.
    if not os.path.exists('files.csv'):
        list_of_files = os.listdir(DIR_PATH)
        list_of_files.append('files.csv')
        pd.DataFrame({'files': list_of_files}).to_csv('files.csv')

def check_for_new_files():
    create_csv_file()
    files = pd.read_csv('files.csv')
    list_of_files = os.listdir(DIR_PATH)
    if len(files.files) != len(list_of_files):
        print('New file added')
        # do what you want
        # save your excel with the name sample.xlsx
        # append your excel to the list of files, and take the set so you
        # will not have sample.xlsx twice if this runs again
        list_of_files.append('sample.xlsx')
        list_of_files = list(set(list_of_files))
        # save the current list of files again
        pd.DataFrame({'files': list_of_files}).to_csv('files.csv')
    print('Finished for the day!')

ticker = threading.Event()
# Run the program every 86400 seconds = 24 h
while not ticker.wait(86400):
    check_for_new_files()
It basically uses threading to check for new files every 86400 s (24 h). It saves the names of all the current files in the directory into a files.csv next to the .py file, and each day it checks for files that do not yet exist in that CSV and appends them to files.csv.
I have a script which pulls in data from a CSV file, does some manipulation on it, and creates an output Excel file. But it's a tedious process, as I need to do it for multiple files.
Question: Is there a way for me to run this script across multiple CSV files together and create a separate Excel output for each input file?
I'm not sure what to try here. I've read that I need to use a module called glob, but I'm not sure how to go about it.
This script works for a single file:
# Import libraries
import pandas as pd
import numpy as np  # pd.np was removed in newer pandas; use numpy directly
import xlsxwriter

# Set system paths
INPUT_PATH = 'SystemPath//Downloads//'
INPUT_FILE = 'rawData.csv'
OUTPUT_PATH = 'SystemPath//Downloads//Output//'
OUTPUT_FILE = 'rawDataOutput.xlsx'

# Get data
df = pd.read_csv(INPUT_PATH + INPUT_FILE)

# Clean data
cleanedData = df[['State', 'Campaigns', 'Type', 'Start date', 'Impressions', 'Clicks', 'Spend(INR)',
                  'Orders', 'Sales(INR)', 'NTB orders', 'NTB sales']]
cleanedData = cleanedData[cleanedData['Impressions'] != 0].sort_values('Impressions',
                                                                       ascending=False).reset_index()
cleanedData.loc['Total'] = cleanedData.select_dtypes(np.number).sum()
cleanedData['CTR(%)'] = (cleanedData['Clicks'] /
                         cleanedData['Impressions']).astype(float).map("{:.2%}".format)
cleanedData['CPC(INR)'] = (cleanedData['Spend(INR)'] / cleanedData['Clicks'])
cleanedData['ACOS(%)'] = (cleanedData['Spend(INR)'] /
                          cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
cleanedData['% of orders NTB'] = (cleanedData['NTB orders'] /
                                  cleanedData['Orders']).astype(float).map("{:.2%}".format)
cleanedData['% of sales NTB'] = (cleanedData['NTB sales'] /
                                 cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
cleanedData = cleanedData[['State', 'Campaigns', 'Type', 'Start date', 'Impressions', 'Clicks', 'CTR(%)',
                           'Spend(INR)', 'CPC(INR)', 'Orders', 'Sales(INR)', 'ACOS(%)',
                           'NTB orders', '% of orders NTB', 'NTB sales', '% of sales NTB']]

# Create summary
summaryData = cleanedData.groupby(['Type'])[['Spend(INR)', 'Sales(INR)']].agg('sum')
summaryData.loc['Overall Snapshot'] = summaryData.select_dtypes(np.number).sum()
summaryData['ROI'] = summaryData['Sales(INR)'] / summaryData['Spend(INR)']

# Push to excel
writer = pd.ExcelWriter(OUTPUT_PATH + OUTPUT_FILE, engine='xlsxwriter')
summaryData.to_excel(writer, sheet_name='Summary')
cleanedData.to_excel(writer, sheet_name='Overall Report')
writer.save()
I've never tried anything like this before, and I would appreciate your help figuring this out.
You can use Python's glob.glob() to get all of the CSV files from a given folder. For each filename that is returned, you could derive a suitable output filename. The file processing could be moved into a function as follows:
# Import libraries
import pandas as pd
import numpy as np  # pd.np was removed in newer pandas; use numpy directly
import xlsxwriter
import glob
import os

def process_csv(input_filename, output_filename):
    # Get data
    df = pd.read_csv(input_filename)

    # Clean data
    cleanedData = df[['State', 'Campaigns', 'Type', 'Start date', 'Impressions', 'Clicks', 'Spend(INR)',
                      'Orders', 'Sales(INR)', 'NTB orders', 'NTB sales']]
    cleanedData = cleanedData[cleanedData['Impressions'] != 0].sort_values('Impressions',
                                                                           ascending=False).reset_index()
    cleanedData.loc['Total'] = cleanedData.select_dtypes(np.number).sum()
    cleanedData['CTR(%)'] = (cleanedData['Clicks'] /
                             cleanedData['Impressions']).astype(float).map("{:.2%}".format)
    cleanedData['CPC(INR)'] = (cleanedData['Spend(INR)'] / cleanedData['Clicks'])
    cleanedData['ACOS(%)'] = (cleanedData['Spend(INR)'] /
                              cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
    cleanedData['% of orders NTB'] = (cleanedData['NTB orders'] /
                                      cleanedData['Orders']).astype(float).map("{:.2%}".format)
    cleanedData['% of sales NTB'] = (cleanedData['NTB sales'] /
                                     cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
    cleanedData = cleanedData[['State', 'Campaigns', 'Type', 'Start date', 'Impressions', 'Clicks', 'CTR(%)',
                               'Spend(INR)', 'CPC(INR)', 'Orders', 'Sales(INR)', 'ACOS(%)',
                               'NTB orders', '% of orders NTB', 'NTB sales', '% of sales NTB']]

    # Create summary
    summaryData = cleanedData.groupby(['Type'])[['Spend(INR)', 'Sales(INR)']].agg('sum')
    summaryData.loc['Overall Snapshot'] = summaryData.select_dtypes(np.number).sum()
    summaryData['ROI'] = summaryData['Sales(INR)'] / summaryData['Spend(INR)']

    # Push to excel
    writer = pd.ExcelWriter(output_filename, engine='xlsxwriter')
    summaryData.to_excel(writer, sheet_name='Summary')
    cleanedData.to_excel(writer, sheet_name='Overall Report')
    writer.save()

# Set system paths
INPUT_PATH = 'SystemPath//Downloads//'
OUTPUT_PATH = 'SystemPath//Downloads//Output//'

for csv_filename in glob.glob(os.path.join(INPUT_PATH, "*.csv")):
    name, ext = os.path.splitext(os.path.basename(csv_filename))
    # Create an output filename based on the input filename
    output_filename = os.path.join(OUTPUT_PATH, f"{name}Output.xlsx")
    process_csv(csv_filename, output_filename)
os.path.join() can be used as a safer way to join file paths together.
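For instance, a hypothetical path built with it:

import os

# os.path.join inserts the correct separator for the platform
output_filename = os.path.join('SystemPath', 'Downloads', 'Output', 'rawDataOutput.xlsx')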
Something like:
import os
import glob
import pandas as pd

os.chdir(r'path\to\folder')  # changes folder path to working dir
filelist = glob.glob('*.csv')  # creates a list of all csv files
for file in filelist:  # loops through the files
    df = pd.read_csv(file, ...)
    # Do something and create a final_df
    final_df.to_excel(file[:-4] + '_output.xlsx', index=False)  # excel with same name + output
You can run this script inside a for loop:
for file in os.listdir(INPUT_PATH):
    if file.endswith('.csv') or file.endswith('.CSV'):
        INPUT_FILE = INPUT_PATH + '/' + file
        OUTPUT_FILE = INPUT_PATH + '/Outputs/' + file[:-4] + '.xlsx'
try this:
import glob

files = glob.glob(INPUT_PATH + "*.csv")
for file in files:
    # Get data
    df = pd.read_csv(file)

    # Clean data
    # your cleaning code

    # Push to excel
    writer = pd.ExcelWriter(OUTPUT_PATH + file.split("/")[-1].replace(".csv", "_OUTPUT.xlsx"),
                            engine='xlsxwriter')
import pandas as pd
import glob

dataset = pd.read_csv('masterfeedproduction-EURNA_2016-06-27.csv', sep=',',
                      delimiter=None)  # select 1 file in the directory
datasets_cols = ['transactionID', 'gvkey', 'companyName']
df = dataset.transactionID
df.shape
df.loc[df.duplicated()]
This returns the duplicates in the selected file and displays the row number and transactionID, so this is correct.
target_directory = r'C:\Users\nikol\Downloads\fullDailyDeltas\fullDailyDeltas'
file_list = glob.glob(target_directory + "/*.csv")
df_result = df.loc[df.duplicated()]

for file in file_list:
    return(df_result)

Here is where I am stuck.
target_directory = r'C:\Users\nikol\Downloads\fullDailyDeltas\fullDailyDeltas'
file_list = glob.glob(target_directory + "/*.csv")

for file in file_list:
    dataset = pd.read_csv(file)
    df = dataset.transactionID
    duplicated = df.loc[df.duplicated()]
    if not duplicated.empty:
        print(file)
        print(duplicated)
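If you also want the duplicates from all files collected in one place rather than just printed, one option (a sketch, assuming pandas is imported as pd) is to gather them into a single DataFrame with a column recording the source file:

frames = []
for file in file_list:
    dataset = pd.read_csv(file)
    duplicated = dataset.loc[dataset.transactionID.duplicated()]
    if not duplicated.empty:
        # Remember which file each duplicate came from
        frames.append(duplicated.assign(source_file=file))

df_result = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()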
Have a look at the glob module.
import pandas as pd
import glob

def your_function(file):
    # put your df processing logic here
    return df_result
Step 1 - Create list of files in directory
target_directory = r'Path/to/your/dir'
file_list = glob.glob(target_directory + "/*.csv")
# Include slash or it will search in the wrong directory!!
Step 2 - Loop through files in list
for file in file_list:  # Loop files
    df_result = your_function(file)  # Put your logic into a separate function
    new_filename = file.replace('.csv', '_processed.csv')
    df_result.to_csv(new_filename, index=False)
Had you included code showing your own attempts at this, your question would have been answered within seconds.