I have written code that finds target rows in multiple CSV files, generates new CSV files, and writes them to a specific folder. Because of the large number of files in this folder, I would like to use multiprocessing to process the CSV files. Here is the code without multiprocessing:
import os
from pathlib import Path
from collections import defaultdict
import numpy as np
import pandas as pd
import multiprocessing
def Filter_Minute_Data(foldpath, time_FileMap, target_List, target_Type, output_folder):
    columnNum = get_columnNum(target_Type)
    for key in time_FileMap.keys():
        for file in time_FileMap[key]:
            # try to find the filename
            output_fileName = genetate_output_fileName(file)
            output_folder_new = output_folder + "/by_minute" + output_fileName[0]
            if not os.path.exists(output_folder_new):
                os.makedirs(output_folder_new)
            output_path = os.path.join(output_folder_new, output_fileName[1])
            out = os.path.join(multiprocessing.current_process().name, output_path)
            filePath = Path(os.path.join(foldpath, file))
            # read the csv file
            df = pd.read_csv(filePath, compression='gzip', header=None, error_bad_lines=False)
            df_filtered = df[df[columnNum].isin(target_List)]
            df_filtered.to_csv(output_path, index=False, header=False)
I also tried to use multiprocessing, but some data was missing from the result. Can anyone help me with the multiprocessing part? How should I implement the Pool so that the output is correct?
def Filter_Minute_Data(foldpath, time_FileMap, target_List, target_Type, output_folder):
    columnNum = get_columnNum(target_Type)
    pool = multiprocessing.Pool(processes=4)
    for key in time_FileMap.keys():
        for file in time_FileMap[key]:
            output_fileName = genetate_output_fileName(file)
            output_folder_new = output_folder + "/by_minute" + output_fileName[0]
            if not os.path.exists(output_folder_new):
                os.makedirs(output_folder_new)
            output_path = os.path.join(output_folder_new, output_fileName[1])
            out = os.path.join(multiprocessing.current_process().name, output_path)
            filePath = Path(os.path.join(foldpath, file))
            pool.apply_async(multithreading_read_csv, (filePath, columnNum, target_List, out))
    pool.close()
    pool.join()

def multithreading_read_csv(filePath, columnNum, target_List, output_path):
    df = pd.read_csv(filePath, compression='gzip', header=None, error_bad_lines=False)
    df_filtered = df[df[columnNum].isin(target_List)]
    df_filtered.to_csv(output_path, mode='a', index=False, header=False)
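For reference, here is a minimal sketch of one way the Pool could be wired up so that the missing data becomes explainable: in the attempt above, the worker writes to a path prefixed with the parent's process name (a folder that usually does not exist), and any exception raised inside apply_async is silently dropped unless the AsyncResult is checked. The sketch assumes the same helper functions from the question (get_columnNum, genetate_output_fileName) and is not the only possible fix:

import os
from pathlib import Path
import multiprocessing
import pandas as pd

def filter_one_file(filePath, columnNum, target_List, output_path):
    # each task writes its own output file, so a plain write (no append) is enough
    df = pd.read_csv(filePath, compression='gzip', header=None)
    df_filtered = df[df[columnNum].isin(target_List)]
    df_filtered.to_csv(output_path, index=False, header=False)

def Filter_Minute_Data(foldpath, time_FileMap, target_List, target_Type, output_folder):
    columnNum = get_columnNum(target_Type)                    # helper from the question
    pool = multiprocessing.Pool(processes=4)
    jobs = []
    for key in time_FileMap:
        for file in time_FileMap[key]:
            output_fileName = genetate_output_fileName(file)  # helper from the question
            output_folder_new = output_folder + "/by_minute" + output_fileName[0]
            os.makedirs(output_folder_new, exist_ok=True)
            output_path = os.path.join(output_folder_new, output_fileName[1])
            filePath = Path(os.path.join(foldpath, file))
            # pass the real output path, not one prefixed with the process name
            jobs.append(pool.apply_async(filter_one_file,
                                         (filePath, columnNum, target_List, output_path)))
    pool.close()
    pool.join()
    # get() re-raises any exception that happened in a worker instead of hiding it
    for job in jobs:
        job.get()

On Windows, Filter_Minute_Data has to be called from under an if __name__ == '__main__': guard so the worker processes can be spawned safely.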
I have a Python script that reads a CSV file and outputs it to HTML.
I would like to change the order of the columns in the HTML file, but I don't know whether I should do this on read or on write, or which function to use. Here is my code so far:
import pandas as pd
import os
import shutil
import glob

# paths
HTMLPATH = "C:/NMS4/QUE/HTML/"
QUEPATH = "C:/NMS4/QUE/"

# create the directory for holding the HTML files if it doesn't exist
isExist = os.path.exists(HTMLPATH)
if not isExist:
    os.mkdir(HTMLPATH, 0o777)

# convert each .txt file to an html file in the HTML folder
# can't convert an empty file, so only convert if the file size is not 0
for quefile in glob.iglob('C:/NMS4/QUE/*.txt'):
    if os.path.getsize(quefile) != 0:
        csv = pd.read_csv(quefile, header=None, usecols=[0, 3, 4, 15, 34, 43, 44, 129],
                          names=['OrderNo', 'Req Qty', 'Planned Start', 'Resource', 'Op', 'Part', 'Desc', 'Qty Recd'])
        html_table = csv.to_html()
        f = open(quefile + '.html', 'w')
        f.write(html_table)
        f.close()
        shutil.move(quefile + '.html', HTMLPATH)
Any help, greatly appreciated.
Thanks.
I've been looking at https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_html.html, but can't quite find out how to re-order the columns.
Edit: the changes to get the solution are:
import pandas as pd
import os
import shutil
import glob

# paths
HTMLPATH = "C:/NMS4/QUE/HTML/"
QUEPATH = "C:/NMS4/QUE/"

# create the directory for holding the HTML files if it doesn't exist
isExist = os.path.exists(HTMLPATH)
if not isExist:
    os.mkdir(HTMLPATH, 0o777)

# convert each .txt file to an html file in the HTML folder
# python can't convert an empty file, so only convert if the file size is not 0
for quefile in glob.iglob('C:/NMS4/QUE/*.txt'):
    if os.path.getsize(quefile) != 0:
        csv = pd.read_csv(quefile, header=None, usecols=[0, 3, 4, 15, 34, 43, 44, 129],
                          names=['Order No', 'Req Qty', 'Planned Start', 'Resource', 'Op', 'Part', 'Desc', 'Qty Recd'])
        cols = list(csv.columns)
        a, b, c, d, e, f, g = cols.index('Order No'), cols.index('Req Qty'), cols.index('Planned Start'), cols.index('Resource'), cols.index('Op'), cols.index('Part'), cols.index('Desc')
        cols[a], cols[b], cols[c], cols[d], cols[e], cols[f], cols[g] = cols[a], cols[e], cols[f], cols[g], cols[c], cols[b], cols[d]
        df = csv[cols]
        html_table = df.to_html()
        f = open(quefile + '.html', 'w')
        f.write(html_table)
        f.close()
        shutil.move(quefile + '.html', HTMLPATH)
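As a possibly simpler alternative to the index swapping (not part of the original solution), the columns can be selected in the desired order before writing the HTML; DataFrame.to_html also takes a columns argument for the same purpose. A minimal sketch, assuming the intended final order is the one the swap above produces:

# put the columns in the desired output order, then convert to HTML
desired_order = ['Order No', 'Op', 'Part', 'Desc', 'Planned Start', 'Req Qty', 'Resource', 'Qty Recd']
html_table = csv[desired_order].to_html()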
I have a folder on my computer where several files are saved. How can I automatically load only the biggest file (in terms of size in KB)?
Right now I could use:
# Sort it with the help of Windows, biggest file on top, and then:
import pandas as pd
df = pd.read_csv(r'C:\...\FileABC.csv')  # when I know FileABC is listed at the top
Is there a way to do that automatically in Python? Then I could skip the manual sorting in Windows.
Try this:
import os
import pandas as pd
basedir = 'C:/Users/viupadhy/Desktop/Stackoverflow'
names = os.listdir(basedir)
paths = [os.path.join(basedir, name) for name in names]
sizes = [(path, os.stat(path).st_size) for path in paths]
file = max(sizes, key=lambda x: x[1])
print(file)
df = pd.read_csv(file[0])
df
A simple way to do this:
import os

def find_largest_file(path):
    largest = None
    max_size = 0
    for filename in os.listdir(path):
        full_path = os.path.join(path, filename)  # os.listdir returns bare names, so join them with the folder
        if os.path.isfile(full_path):
            size = os.path.getsize(full_path)
            if size > max_size:
                largest = filename
                max_size = size
    return largest

print(find_largest_file(path))
# ... whatever largest file you have in `path`.
This can be further improved by filtering for the .csv extension only, and the like; a sketch of that refinement follows.
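A minimal sketch of that refinement using pathlib (not from the original answer; the folder path is a hypothetical placeholder):

from pathlib import Path
import pandas as pd

def largest_csv(folder):
    # consider only regular files ending in .csv
    csv_files = [p for p in Path(folder).glob('*.csv') if p.is_file()]
    if not csv_files:
        return None
    # pick the file with the largest size in bytes
    return max(csv_files, key=lambda p: p.stat().st_size)

biggest = largest_csv('C:/some/folder')  # hypothetical path
if biggest is not None:
    df = pd.read_csv(biggest)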
You can use something like:
import os
import pandas as pd

folder_path = "C:\\programs\\"
file_list = os.listdir(folder_path)
biggest_file = os.path.join(folder_path, file_list[0])
for file in file_list:
    file_location = os.path.join(folder_path, file)
    size = os.path.getsize(file_location)
    if size > os.path.getsize(biggest_file):
        biggest_file = file_location
df = pd.read_csv(biggest_file)
I have about 5,000 .gzip files (~1MB each). Each of these files contains data in a jsonlines format. Here's what it looks like:
{"category_id":39,"app_id":12731}
{"category_id":45,"app_id":12713}
{"category_id":6014,"app_id":13567}
I want to parse these files and convert them to a pandas dataframe. Is there a way to speed this process up? Here is my code, but it's rather slow (about 0.5 s per file):
import pandas as pd
import jsonlines
import gzip
import os
import io

path = 'data/apps/'
files = os.listdir(path)

result = []
for n, file in enumerate(files):
    print(n, file)
    with open(f'{path}/{file}', 'rb') as f:
        data = f.read()
    unzipped_data = gzip.decompress(data)
    decoded_data = io.BytesIO(unzipped_data)
    reader = jsonlines.Reader(decoded_data)
    for line in reader:
        if line['category_id'] == 6014:
            result.append(line)

df = pd.DataFrame(result)
This should allow you to read each line without loading the whole file.
import pandas as pd
import json
import gzip
import os

path = 'data/apps/'
files = os.listdir(path)

result = []
for n, file in enumerate(files):
    print(n, file)
    with gzip.open(f'{path}/{file}') as f:
        for line in f:
            data = json.loads(line)
            if data['category_id'] == 6014:
                result.append(data)

df = pd.DataFrame(result)
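Since each gzip file is independent, another option (not part of the original answer) is to spread the per-file work across processes. A minimal sketch, assuming the same data/apps/ layout and the category_id == 6014 filter:

import gzip
import json
import os
from multiprocessing import Pool

import pandas as pd

path = 'data/apps/'

def filter_file(filename):
    # parse one gzipped jsonlines file and keep only the matching records
    matches = []
    with gzip.open(os.path.join(path, filename), 'rt') as f:
        for line in f:
            record = json.loads(line)
            if record['category_id'] == 6014:
                matches.append(record)
    return matches

if __name__ == '__main__':
    files = os.listdir(path)
    with Pool() as pool:
        per_file = pool.map(filter_file, files)
    # flatten the per-file lists into one dataframe
    df = pd.DataFrame([record for batch in per_file for record in batch])

Whether this helps depends on how much of the 0.5 s per file is CPU-bound decompression and parsing rather than disk I/O.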
I have a script which pulls in data from a CSV file, does some manipulation to it, and creates an output Excel file. But it's a tedious process, as I need to do it for multiple files.
Question: Is there a way for me to run this script across multiple csv files together and create a separate excel file output for each input file?
I'm not sure what to try out here. I've read that I need to use a module called glob but I'm not sure how to go about it.
This script works for a single file:
# Import libraries
import pandas as pd
import xlsxwriter

# Set system paths
INPUT_PATH = 'SystemPath//Downloads//'
INPUT_FILE = 'rawData.csv'
OUTPUT_PATH = 'SystemPath//Downloads//Output//'
OUTPUT_FILE = 'rawDataOutput.xlsx'

# Get data
df = pd.read_csv(INPUT_PATH + INPUT_FILE)

# Clean data
cleanedData = df[['State', 'Campaigns', 'Type', 'Start date', 'Impressions', 'Clicks', 'Spend(INR)',
                  'Orders', 'Sales(INR)', 'NTB orders', 'NTB sales']]
cleanedData = cleanedData[cleanedData['Impressions'] != 0].sort_values('Impressions',
                                                                       ascending=False).reset_index()
cleanedData.loc['Total'] = cleanedData.select_dtypes(pd.np.number).sum()
cleanedData['CTR(%)'] = (cleanedData['Clicks'] /
                         cleanedData['Impressions']).astype(float).map("{:.2%}".format)
cleanedData['CPC(INR)'] = (cleanedData['Spend(INR)'] / cleanedData['Clicks'])
cleanedData['ACOS(%)'] = (cleanedData['Spend(INR)'] /
                          cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
cleanedData['% of orders NTB'] = (cleanedData['NTB orders'] /
                                  cleanedData['Orders']).astype(float).map("{:.2%}".format)
cleanedData['% of sales NTB'] = (cleanedData['NTB sales'] /
                                 cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
cleanedData = cleanedData[['State', 'Campaigns', 'Type', 'Start date', 'Impressions', 'Clicks', 'CTR(%)',
                           'Spend(INR)', 'CPC(INR)', 'Orders', 'Sales(INR)', 'ACOS(%)',
                           'NTB orders', '% of orders NTB', 'NTB sales', '% of sales NTB']]

# Create summary
summaryData = cleanedData.groupby(['Type'])[['Spend(INR)', 'Sales(INR)']].agg('sum')
summaryData.loc['Overall Snapshot'] = summaryData.select_dtypes(pd.np.number).sum()
summaryData['ROI'] = summaryData['Sales(INR)'] / summaryData['Spend(INR)']

# Push to excel
writer = pd.ExcelWriter(OUTPUT_PATH + OUTPUT_FILE, engine='xlsxwriter')
summaryData.to_excel(writer, sheet_name='Summary')
cleanedData.to_excel(writer, sheet_name='Overall Report')
writer.save()
I've never tried anything like this before and I would appreciate your help in trying to figure this out.
You can use Python's glob.glob() to get all of the CSV files from a given folder. For each filename that is returned, you could derive a suitable output filename. The file processing could be moved into a function as follows:
# Import libraries
import pandas as pd
import xlsxwriter
import glob
import os

def process_csv(input_filename, output_filename):
    # Get data
    df = pd.read_csv(input_filename)

    # Clean data
    cleanedData = df[['State', 'Campaigns', 'Type', 'Start date', 'Impressions', 'Clicks', 'Spend(INR)',
                      'Orders', 'Sales(INR)', 'NTB orders', 'NTB sales']]
    cleanedData = cleanedData[cleanedData['Impressions'] != 0].sort_values('Impressions',
                                                                           ascending=False).reset_index()
    cleanedData.loc['Total'] = cleanedData.select_dtypes(pd.np.number).sum()
    cleanedData['CTR(%)'] = (cleanedData['Clicks'] /
                             cleanedData['Impressions']).astype(float).map("{:.2%}".format)
    cleanedData['CPC(INR)'] = (cleanedData['Spend(INR)'] / cleanedData['Clicks'])
    cleanedData['ACOS(%)'] = (cleanedData['Spend(INR)'] /
                              cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
    cleanedData['% of orders NTB'] = (cleanedData['NTB orders'] /
                                      cleanedData['Orders']).astype(float).map("{:.2%}".format)
    cleanedData['% of sales NTB'] = (cleanedData['NTB sales'] /
                                     cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
    cleanedData = cleanedData[['State', 'Campaigns', 'Type', 'Start date', 'Impressions', 'Clicks', 'CTR(%)',
                               'Spend(INR)', 'CPC(INR)', 'Orders', 'Sales(INR)', 'ACOS(%)',
                               'NTB orders', '% of orders NTB', 'NTB sales', '% of sales NTB']]

    # Create summary
    summaryData = cleanedData.groupby(['Type'])[['Spend(INR)', 'Sales(INR)']].agg('sum')
    summaryData.loc['Overall Snapshot'] = summaryData.select_dtypes(pd.np.number).sum()
    summaryData['ROI'] = summaryData['Sales(INR)'] / summaryData['Spend(INR)']

    # Push to excel
    writer = pd.ExcelWriter(output_filename, engine='xlsxwriter')
    summaryData.to_excel(writer, sheet_name='Summary')
    cleanedData.to_excel(writer, sheet_name='Overall Report')
    writer.save()

# Set system paths
INPUT_PATH = 'SystemPath//Downloads//'
OUTPUT_PATH = 'SystemPath//Downloads//Output//'

for csv_filename in glob.glob(os.path.join(INPUT_PATH, "*.csv")):
    name, ext = os.path.splitext(os.path.basename(csv_filename))
    # Create an output filename based on the input filename
    output_filename = os.path.join(OUTPUT_PATH, f"{name}Output.xlsx")
    process_csv(csv_filename, output_filename)
os.path.join() can be used as a safer way to join file paths together.
Something like:
import os
import glob
import pandas as pd

os.chdir(r'path\to\folder')    # change the working directory to the folder
filelist = glob.glob('*.csv')  # create a list of all csv files
for file in filelist:          # loop through the files
    df = pd.read_csv(file, ...)
    # Do something and create a final_df
    final_df.to_excel(file[:-4] + '_output.xlsx', index=False)  # excel with the same name + _output
You can run this script inside a for loop:
for file in os.listdir(INPUT_PATH):
    if file.endswith('.csv') or file.endswith('.CSV'):
        INPUT_FILE = INPUT_PATH + '/' + file
        OUTPUT_FILE = INPUT_PATH + '/Outputs/' + file[:-4] + '.xlsx'
Try this:
import glob

files = glob.glob(INPUT_PATH + "*.csv")
for file in files:
    # Get data
    df = pd.read_csv(file)
    # Clean data
    # your cleaning code
    # Push to excel
    writer = pd.ExcelWriter(OUTPUT_PATH + file.split("/")[-1].replace(".csv", "_OUTPUT.xlsx"),
                            engine='xlsxwriter')
import pandas as pd
import glob
dataset = pd.read_csv('masterfeedproduction-EURNA_2016-06-27.csv', sep=',',
                      delimiter=None)  # select 1 file in the directory
datasets_cols = ['transactionID', 'gvkey', 'companyName']
df = dataset.transactionID
df.shape
df.loc[df.duplicated()]
This returns the duplicates in the selected file and displays the row number and transactionID, so this is correct.
target_directory = r'C:\Users\nikol\Downloads\fullDailyDeltas\fullDailyDeltas'
file_list = glob.glob(target_directory + "/*.csv")
df_result = df.loc[df.duplicated()]
for file in file_list:
    return(df_result)
Here I am stuck.
target_directory = r'C:\Users\nikol\Downloads\fullDailyDeltas\fullDailyDeltas'
file_list = glob.glob(target_directory + "/*.csv")

for file in file_list:
    dataset = pd.read_csv(file)
    df = dataset.transactionID
    duplicated = df.loc[df.duplicated()]
    if not duplicated.empty:
        print(file)
        print(duplicated)
Have a look at the glob module.
import pandas as pd
import glob

def your_function(file):
    # put your df processing logic here
    return df_result
Step 1 - Create list of files in directory
target_directory = r'Path/to/your/dir'
file_list = glob.glob(target_directory + "/*.csv")
# Include slash or it will search in the wrong directory!!
Step 2 - Loop through files in list
for file in file_list:                                    # Loop files
    df_result = your_function(file)                       # Put your logic into a separate function
    new_filename = file.replace('.csv', '_processed.csv')
    df_result.to_csv(new_filename, index=False)
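For the duplicate-transactionID question above, your_function could be filled in roughly like this (a sketch assuming each CSV has a transactionID column, as in the question):

import pandas as pd

def your_function(file):
    # read one csv and keep only the rows whose transactionID already appeared earlier in the file
    dataset = pd.read_csv(file)
    duplicated_rows = dataset['transactionID'].duplicated()
    return dataset.loc[duplicated_rows]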
Comment: had you included code showing your own attempts to do this, your question would have been answered within seconds.