Load biggest csv file of a folder in python - python

I have a folder on my computer where several files are saved. How can I automatically load only the biggest file (in terms of size in KB) from it?
Right now I could use:
#Sort it with the help of windows, biggest file on top and then:
import pandas as pd
df = pd.read_csv(r'C:\...\FileABC.csv') #when I know FileABC is listed on the top
Is there a way to automatically do that in python? Then I could skip the manual adjustment in windows.

Try this:
import os
import pandas as pd
basedir = 'C:/Users/viupadhy/Desktop/Stackoverflow'

# os.listdir() also returns sub-directories and non-CSV files; keep only
# regular files with a .csv extension so pd.read_csv() gets a real CSV.
paths = [
    os.path.join(basedir, name)
    for name in os.listdir(basedir)
    if name.lower().endswith('.csv')
]
paths = [path for path in paths if os.path.isfile(path)]
if not paths:
    # max() on an empty list raises an unhelpful ValueError; fail clearly.
    raise FileNotFoundError(f'No CSV files found in {basedir}')

# Pair every path with its size in bytes and pick the largest pair.
sizes = [(path, os.stat(path).st_size) for path in paths]
file = max(sizes, key=lambda x: x[1])
print(file)

# file is a (path, size) tuple; load the path.
df = pd.read_csv(file[0])
df
Output:

A simple way to do this:
import os
def find_largest_file(path):
    """Return the name of the largest regular file directly inside *path*.

    Sub-directories are ignored and the search is not recursive.

    Args:
        path: Directory to scan.

    Returns:
        The bare file name (not joined with *path*) of the largest file,
        or ``None`` when the directory contains no regular files. On a
        size tie the first entry returned by ``os.listdir`` wins.
    """
    largest = None
    # Start below zero so that a zero-byte file can still be selected.
    max_size = -1
    for filename in os.listdir(path):
        # os.listdir() yields bare names; join with the directory so the
        # checks do not silently resolve against the current working dir.
        full_path = os.path.join(path, filename)
        if os.path.isfile(full_path):
            size = os.path.getsize(full_path)
            if size > max_size:
                largest = filename
                max_size = size
    return largest
print(find_largest_file(path))
# ... whatever the largest file in `path` is.
This can be further improved by filtering only .csv extension and the like.

You can use something like:
import os
import pandas as pd
folder_path = "C:\\programs\\"

# Build full paths and keep only regular .csv files: os.listdir() also
# returns sub-directories, and pd.read_csv() below expects a CSV.
file_list = [
    os.path.join(folder_path, name)
    for name in os.listdir(folder_path)
    if name.lower().endswith(".csv")
]
file_list = [location for location in file_list if os.path.isfile(location)]
if not file_list:
    # The original indexed file_list[0] and crashed with IndexError here.
    raise FileNotFoundError(f"No CSV files found in {folder_path}")

# max() stats each file once and, like the original strict '>' comparison,
# keeps the first file when several share the largest size.
biggest_file = max(file_list, key=os.path.getsize)
df = pd.read_csv(biggest_file)

Related

How to get the filename in directory with the max number in the filename python?

I have some xml files in a folder as example 'assests/2020/2010.xml', 'assests/2020/20005.xml', 'assests/2020/20999.xml' etc. I want to get the filename with max value in the '2020' folder. For above three files output should be 20999.xml
I am trying as following:
import glob
import os
list_of_files = glob.glob('assets/2020/*')
# latest_file = max(list_of_files, key=os.path.getctime)
# print (latest_file)
I wasn't able to work out the logic to get the required file.
Here is the resource that have best answer to my query but I couldn't build my logic.
You can use pathlib to glob for the xml files and access the Path object attributes like .name and .stem:
from pathlib import Path
# Lazily iterate over every .xml entry in the folder.
list_of_files = Path('assets/2020/').glob('*.xml')
# Compare the files by the integer value of their name without extension,
# then print just the file name of the winner.
highest = max(list_of_files, key=lambda xml_path: int(xml_path.stem))
print(highest.name)
Output:
20999.xml
I can't test it out right now, but you may try this:
import os

files = []
for filename in list_of_files:
    # Entries may be Path objects or strings that include the directory
    # (e.g. 'assets/2020/20999.xml'); drop the folder and the extension
    # before converting, otherwise int() raises ValueError.
    stem, _extension = os.path.splitext(os.path.basename(str(filename)))
    files.append(int(stem))
print(files)
This should get you your filenames in integer format and now you should be able to sort them in descending order and get the first item of the sorted list.
Use re to search for the appropriate endings in your file paths. If found use re again to extract the nr.
import re
list_of_files = [
    'assests/2020/2010.xml',
    'assests/2020/20005.xml',
    'assests/2020/20999.xml',
]

# Track the best candidate seen so far; -1 is below any parsed number.
highest_nr = -1
highest_nr_file = ''
for f in list_of_files:
    # Only paths that end in "<digits>.xml" are considered.
    re_result = re.findall(r'\d+\.xml$', f)
    if re_result:
        # Pull the digits back out of the matched "<digits>.xml" tail.
        nr = int(re.findall(r'\d+', re_result[0])[0])
        if nr > highest_nr:
            highest_nr = nr
            highest_nr_file = f
print(highest_nr_file)
Result
assests/2020/20999.xml
You can also try this way.
import os, re
path = "assests/2020/"
files = [
    "assests/2020/2010.xml",
    "assests/2020/20005.xml",
    "assests/2020/20999.xml",
]
# First isolate the trailing "<digits>.xml" part of every path ...
xml_tails = (re.findall(r'\d+\.xml$', file)[0] for file in files)
# ... then keep the digits in front of the dot as integers.
n = [int(tail.split('.')[0]) for tail in xml_tails]
output = f"{max(n)}.xml"
print("Biggest max file name of .xml file is ", os.path.join(path, output))
Output:
Biggest max file name of .xml file is assests/2020/20999.xml
import glob
xmlFiles = []
# Collect the .xml files in the current directory whose name is a number.
for file in glob.glob("*.xml"):
    stem = file[:-len(".xml")]
    # Keep the whole file name: slicing to the first four characters
    # would cut "20999.xml" down to "2099".
    if stem.isdigit():
        xmlFiles.append(file)

# Compare numerically; plain max() on strings is lexicographic, so
# "9.xml" would beat "20999.xml".
if xmlFiles:
    print(max(xmlFiles, key=lambda name: int(name[:-len(".xml")])))

Loop through multiple CSV files and run a script

I have a script which pulls in data from a csv file, does some manipulations to it and creates an output excel file. But, its a tedious process as I need to do it for multiple files.
Question: Is there a way for me to run this script across multiple csv files together and create a separate excel file output for each input file?
I'm not sure what to try out here. I've read that I need to use a module called glob but I'm not sure how to go about it.
This script works for a single file:
# Import libraries
import pandas as pd
import xlsxwriter
# Set system paths
INPUT_PATH = 'SystemPath//Downloads//'
INPUT_FILE = 'rawData.csv'
OUTPUT_PATH = 'SystemPath//Downloads//Output//'
OUTPUT_FILE = 'rawDataOutput.xlsx'
# Get data
df = pd.read_csv(INPUT_PATH + INPUT_FILE)
# Clean data
cleanedData = df[['State','Campaigns','Type','Start date','Impressions','Clicks','Spend(INR)',
'Orders','Sales(INR)','NTB orders','NTB sales']]
cleanedData = cleanedData[cleanedData['Impressions'] != 0].sort_values('Impressions',
ascending= False).reset_index()
cleanedData.loc['Total'] = cleanedData.select_dtypes(pd.np.number).sum()
cleanedData['CTR(%)'] = (cleanedData['Clicks'] /
cleanedData['Impressions']).astype(float).map("{:.2%}".format)
cleanedData['CPC(INR)'] = (cleanedData['Spend(INR)'] / cleanedData['Clicks'])
cleanedData['ACOS(%)'] = (cleanedData['Spend(INR)'] /
cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
cleanedData['% of orders NTB'] = (cleanedData['NTB orders'] /
cleanedData['Orders']).astype(float).map("{:.2%}".format)
cleanedData['% of sales NTB'] = (cleanedData['NTB sales'] /
cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
cleanedData = cleanedData[['State','Campaigns','Type','Start date','Impressions','Clicks','CTR(%)',
'Spend(INR)','CPC(INR)','Orders','Sales(INR)','ACOS(%)',
'NTB orders','% of orders NTB','NTB sales','% of sales NTB']]
# Create summary
summaryData = cleanedData.groupby(['Type'])[['Spend(INR)','Sales(INR)']].agg('sum')
summaryData.loc['Overall Snapshot'] = summaryData.select_dtypes(pd.np.number).sum()
summaryData['ROI'] = summaryData['Sales(INR)'] / summaryData['Spend(INR)']
# Push to excel
writer = pd.ExcelWriter(OUTPUT_PATH + OUTPUT_FILE, engine='xlsxwriter')
summaryData.to_excel(writer, sheet_name='Summary')
cleanedData.to_excel(writer, sheet_name='Overall Report')
writer.save()
I've never tried anything like this before and I would appreciate your help trying to figure this out
You can use Python's glob.glob() to get all of the CSV files from a given folder. For each filename that is returned, you could derive a suitable output filename. The file processing could be moved into a function as follows:
# Import libraries
import pandas as pd
import xlsxwriter
import glob
import os
def process_csv(input_filename, output_filename):
    """Clean one campaign CSV export and write a two-sheet Excel report.

    The report keeps rows with non-zero impressions (sorted by impressions,
    descending), appends a 'Total' row, adds derived ratio columns, and
    builds a per-'Type' spend/sales summary with an ROI column.

    Args:
        input_filename: Path of the CSV file to read.
        output_filename: Path of the .xlsx workbook to create.
    """
    # Get data
    df = pd.read_csv(input_filename)

    # Clean data
    cleanedData = df[['State', 'Campaigns', 'Type', 'Start date', 'Impressions', 'Clicks', 'Spend(INR)',
                      'Orders', 'Sales(INR)', 'NTB orders', 'NTB sales']]
    cleanedData = cleanedData[cleanedData['Impressions'] != 0].sort_values(
        'Impressions', ascending=False).reset_index()
    # select_dtypes('number') replaces pd.np.number: the pd.np alias was
    # removed in pandas 2.0.
    cleanedData.loc['Total'] = cleanedData.select_dtypes('number').sum()
    cleanedData['CTR(%)'] = (cleanedData['Clicks'] /
                             cleanedData['Impressions']).astype(float).map("{:.2%}".format)
    cleanedData['CPC(INR)'] = (cleanedData['Spend(INR)'] / cleanedData['Clicks'])
    cleanedData['ACOS(%)'] = (cleanedData['Spend(INR)'] /
                              cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
    cleanedData['% of orders NTB'] = (cleanedData['NTB orders'] /
                                      cleanedData['Orders']).astype(float).map("{:.2%}".format)
    cleanedData['% of sales NTB'] = (cleanedData['NTB sales'] /
                                     cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
    # Final column order for the report; this also drops the helper
    # 'index' column created by reset_index() above.
    cleanedData = cleanedData[['State', 'Campaigns', 'Type', 'Start date', 'Impressions', 'Clicks', 'CTR(%)',
                               'Spend(INR)', 'CPC(INR)', 'Orders', 'Sales(INR)', 'ACOS(%)',
                               'NTB orders', '% of orders NTB', 'NTB sales', '% of sales NTB']]

    # Create summary
    summaryData = cleanedData.groupby(['Type'])[['Spend(INR)', 'Sales(INR)']].agg('sum')
    summaryData.loc['Overall Snapshot'] = summaryData.select_dtypes('number').sum()
    summaryData['ROI'] = summaryData['Sales(INR)'] / summaryData['Spend(INR)']

    # Push to excel. The context manager saves and closes the workbook;
    # ExcelWriter.save() was removed in pandas 2.0.
    with pd.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
        summaryData.to_excel(writer, sheet_name='Summary')
        cleanedData.to_excel(writer, sheet_name='Overall Report')
# Set system paths
INPUT_PATH = 'SystemPath//Downloads//'
OUTPUT_PATH = 'SystemPath//Downloads//Output//'

# Make sure the output folder exists so writing a workbook does not fail
# on a missing directory.
os.makedirs(OUTPUT_PATH, exist_ok=True)

for csv_filename in glob.glob(os.path.join(INPUT_PATH, "*.csv")):
    name, ext = os.path.splitext(os.path.basename(csv_filename))
    # Create an output filename based on the input filename
    output_filename = os.path.join(OUTPUT_PATH, f"{name}Output.xlsx")
    process_csv(csv_filename, output_filename)
os.path.join() can be used as a safer way to join file paths together.
Something like:
import os
import glob
import pandas as pd
os.chdir(r'path\to\folder')  # changes folder path to working dir
filelist = glob.glob('*.csv')  # creates a list of all csv files
for file in filelist:  # loops through the files
    df = pd.read_csv(file)
    # Do something and create a final_df
    final_df = df  # placeholder: replace with your own processing
    # file[:-4] drops the ".csv" suffix, giving an excel file with the
    # same name plus "_output".
    final_df.to_excel(file[:-4] + '_output.xlsx', index=False)
You can run this script inside a for loop:
for file in os.listdir(INPUT_PATH):
    # endswith() accepts a tuple, so both spellings are tested in one call.
    if file.endswith(('.csv', '.CSV')):
        INPUT_FILE = INPUT_PATH + '/' + file
        # file[:-4] removes the ".csv" suffix including its dot, so the
        # new extension has to bring its own dot.
        OUTPUT_FILE = INPUT_PATH + '/Outputs/' + file[:-4] + '.xlsx'
        # ... run the rest of the script here with INPUT_FILE / OUTPUT_FILE ...
try this:
import glob
import os

files = glob.glob(INPUT_PATH + "*.csv")
for file in files:
    # Get data
    df = pd.read_csv(file)
    # Clean data
    # your cleaning code
    # Push to excel
    # os.path.basename() handles both "/" and "\\" separators, unlike
    # file.split("/"); the extension must be ".xlsx".
    output_name = os.path.basename(file).replace(".csv", "_OUTPUT.xlsx")
    # The context manager saves and closes the workbook on exit.
    with pd.ExcelWriter(OUTPUT_PATH + output_name, engine='xlsxwriter') as writer:
        df.to_excel(writer)  # write your cleaned frame(s) here

Python , get duplicates in 1st column of all csv files in a directory

import pandas as pd
import glob
dataset = pd.read_csv('masterfeedproduction-EURNA_2016-06-27.csv',sep =
',',delimiter = None) # select 1 file in the directory
datasets_cols = ['transactionID','gvkey','companyName']
df= dataset.transactionID
df.shape
df.loc[df.duplicated()]
returns the duplicates in the selected file. displays row number and transactionID. so this is correct.
target_directory = r'C:\Users\nikol\Downloads\fullDailyDeltas\fullDailyDeltas'
file_list = glob.glob(target_directory + "/*.csv")
df_result = df.loc[df.duplicated()]
for file in file_list:
return(df_result)
here I am stuck.
target_directory = r'C:\Users\nikol\Downloads\fullDailyDeltas\fullDailyDeltas'
file_list = glob.glob(target_directory + "/*.csv")
for file in file_list:
    dataset = pd.read_csv(file)
    df = dataset.transactionID
    # duplicated() flags every repeat after the first occurrence.
    duplicated = df.loc[df.duplicated()]
    # Report only the files that actually contain duplicates.
    if not duplicated.empty:
        print(file)
        print(duplicated)
Have a look at the glob module.
import pandas as pd
import glob
def your_function(file):
    """Load one CSV file and return the processed DataFrame.

    Args:
        file: Path of the CSV file to process.

    Returns:
        The resulting DataFrame (the unmodified input until the
        placeholder below is replaced with real processing).
    """
    df = pd.read_csv(file)
    # put your df processing logic here
    df_result = df  # placeholder so the template runs as-is
    return df_result
Step 1 - Create list of files in directory
target_directory = r'Path/to/your/dir'
file_list = glob.glob(target_directory + "/*.csv")
# Include slash or it will search in the wrong directory!!
Step 2 - Loop through files in list
for file in file_list:  # Loop files
    # Outputs land in the same folder and also match "*.csv"; skip them so
    # a second run does not reprocess earlier results.
    if file.endswith('_processed.csv'):
        continue
    df_result = your_function(file)  # Put your logic into a separate function
    # Swap only the trailing ".csv"; str.replace() would also rewrite a
    # ".csv" that happens to appear earlier in the path.
    new_filename = file[:-len('.csv')] + '_processed.csv'
    df_result.to_csv(new_filename, index=False)
Comment
If you had included code showing your own attempts, your question would have been answered within seconds.

Finding text files with specific matching numbers or string in the filename

I have imported a file with the column head serial numbers Head serial #
This is a list of serial numbers:
Head serial #
UG0013
UG0025
UG0043
UG0053
UG5214
UG5246
UT5324
UT0244
TH7035
TH7106
TH7212
TH7218
TH7362
C499277BT433
D499241BD221
D499227BQ004
B500438BZ921
B500425BZ933
I need to find all the text files on a network folder that have these numbers in the filename. Please help!
Here is my code so far which is currently returning ALL .txt files but I only want the files with the above serial numbers in the name Thanks in advance!
import matplotlib.pyplot as plot
import pandas as pd
import xlrd
""" This is the master file for reading the lifetest lasers """
masterfile_location = 'C:/Users/gallachj/Documents/Lifetest_Master.xlsx'
#df = pd.read_excel(masterfile_location)
from pandas import ExcelWriter
from pandas import ExcelFile
df = pd.read_excel(masterfile_location, sheet_name='Sheet1')
#print("Column headings:")
#print(df.columns)
#print(df['Head serial #'])
sns = df['Head serial #']
headtypes = df['Head type']
colors = df['Wavelength (nm)']
powers = df['Power rating (W)']
import fnmatch
import os
os.chdir('C:/Users/gallachj/Documents/')
for file in os.listdir('.'):
if fnmatch.fnmatch(file, '*.txt'):
# if fnmatch.filter(sns, '*.txt')
(print(file))`
based on this answer from Noufal Ibrahim, you can try something like this:
import os
def find_filenames(d, s):
    """Recursively collect files under *d* whose name contains a search term.

    Args:
        d: Directory to start searching from.
        s: Iterable of substrings; a file matches when any of them occurs
            in its file name (the directory part is not checked).

    Returns:
        A list of resolved (``os.path.realpath``) paths of the matching
        files. Directories that cannot be listed because of missing
        permissions contribute nothing.
    """
    try:
        files = os.listdir(d)
    except PermissionError:  # network drives usually have a lot inaccessible folders
        return []
    matched_files = []
    for f in files:  # loop over elements in folder
        full_name = os.path.join(d, f)  # path of the entry including its folder
        if os.path.isdir(full_name):  # recursive call
            matched_files += find_filenames(full_name, s)
        elif os.path.isfile(full_name):
            if any(serial in f for serial in s):  # check the filename
                # Resolve full_name, not the bare name: realpath(f) would be
                # resolved against the current working directory.
                matched_files.append(os.path.realpath(full_name))
    return matched_files  # Return a list of matched files
find_filenames(r'\\40i2039p-0\d\_Mer_', ['search', 'words'])

Creating Multiple featureclasses from data in .txt Files

I am trying to create multiple feature classes from data with .txt extension. My code runs, but only produces one .shp file. The variable xyTable when checked does contain all the file extensions. These then should individually run through both Arcpy functions and produce the relevant featureclass files named in accordance with their .txt files.
import arcpy
import os
import tempfile
import shutil
shpFileArray = []
print "\n"
arcpy.env.overwriteOutput = True
newFolder = "destinationpath"
if os.path.exists(newFolder):
tmp = tempfile.mktemp(dir=os.path.dirname(newFolder))
shutil.move(newFolder, tmp)
shutil.rmtree(tmp)
os.makedirs(newFolder)
arcpy.env.workspace = newFolder
for file in os.listdir("sourcepath"):
layerName = file[:-4]
fileSHP = layerName+".shp"
for file in os.listdir("sourcepath"):
if file.endswith(".txt"):
xyTable = (os.path.join("destinationpath", file))
arcpy.MakeXYEventLayer_management(table= xyTable, in_x_field="EastingM", in_y_field="NorthingM", out_layer="layerName",...continues...
arcpy.FeatureClassToFeatureClass_conversion(in_features="layerName", out_path="destinationpath", out_name= fileSHP,....continues....
Looks like you are not giving the FeatureClassToFeatureClass tool unique shapefile names. After the first For loop finishes, fileSHP doesn't change. Looks like you have the shpFileArray set up to hold the list of fileSHPs. Perhaps try something like this to save your set of fileSHPs in the first For loop and refer to them in the second For loop. My python might not be exactly right, but I think the idea is.
import arcpy
import os
import tempfile
import shutil
shpFileArray = []
print "\n"
arcpy.env.overwriteOutput = True
newFolder = "destinationpath"
if os.path.exists(newFolder):
tmp = tempfile.mktemp(dir=os.path.dirname(newFolder))
shutil.move(newFolder, tmp)
shutil.rmtree(tmp)
os.makedirs(newFolder)
arcpy.env.workspace = newFolder
for file in os.listdir("sourcepath"):
layerName = file[:-4]
fileSHP = layerName+".shp"
shpFileArray.append(fileSHP)
for idx, file in enumerate(os.listdir("sourcepath")):
if file.endswith(".txt"):
xyTable = (os.path.join("destinationpath", file))
outShape = shapeFileArray[idx]
arcpy.MakeXYEventLayer_management(table= xyTable, in_x_field="EastingM", in_y_field="NorthingM", out_layer="layerName",...continues...
arcpy.FeatureClassToFeatureClass_conversion(in_features="layerName", out_path="destinationpath", out_name= outShape,....continues....

Categories