I am reading multiple CSV files and reformatting them. I have developed this code which reads a single file. However, I am wondering if I can loop the process to read multiple files into separate dataframes and then work on those dataframes to format and rewrite a csv file.
import pandas as pd
station_id = 'id.csv'
input_file = 'filename.txt'
unformatted = 'C:/Users/....../Unformatted/'
formatted = 'C:/....../Formatted/'
print(f'\nReading data file: {input_file}.')
fields = {
'Timestamp': 'timestamp',
# 'Sample Point Name': 'station_name',
# 'Sample Point Name Description': 'station_description',
# 'Start Date':'state_date',
'PM10 (1h) Validated': 'PM_1h_10_ug_m3',
'PM10 Validated' :'PM_10_ug_m3',
# 'PM2.5 (1h) Final': 'pm_25',
# 'PM2.5 Final': 'pm2.5_ug_m3'
}
df = pd.read_table(unformatted+input_file, usecols=fields.keys(), sep='\t', encoding = 'utf-16')
df.rename(columns=fields, inplace=True)
df.loc[:, 'timestamp'] = pd.to_datetime(df['timestamp'], dayfirst=True)
df['date'] = df['timestamp']
df['time'] = df['timestamp']
df['date'] = df['date'].dt.strftime('%d/%m/%Y')
df['time'] = df['time'].apply(lambda z: z.strftime('%H%M'))
df['Date_Time'] = df['date'] +' '+ df['time']
df.drop(['timestamp', 'date', 'time'], axis=1, inplace=True)
df = df[['Date_Time', 'PM_1h_10_ug_m3', 'PM_10_ug_m3']]
availability_PM_1h = df['PM_1h_10_ug_m3'].count()/df['Date_Time'].count()*100
availability_PM_10_min = df['PM_10_ug_m3'].count()/df['Date_Time'].count()*100
#Check for nan values
PM10_nan = df['PM_10_ug_m3'].isnull().sum()
PM10_1h_nan = df['PM_1h_10_ug_m3'].isnull().sum()
print('Count of PM10 NaN: ' + str(PM10_nan))
print('Count of PM10_1h NaN: ' + str(PM10_1h_nan))
df.to_csv(formatted+station_id, index=False)
Say you wrap the whole code for whatever you want to do with a single file in a function: read_single_df(filepath). Then your code for multiple files will look like:
# filepaths: this is the variable to store the filepaths to all files as a list
import os
import pandas as pd
from typing import List
def read_csv(filepath: str, *args, **kwargs) -> pd.DataFrame:
"""Reads a single csv file and processes it before returning
a `pandas.DataFrame`.
"""
# your logic for a single file goes here
df = pd.read_csv(os.path.abspath(filepath))
# further processing steps for a single file...
# ...
return df
# define a list to store dataframes
dfs: List[pd.DataFrame] = []
# run loop to read and store dataframes in the list: dfs
for filepath in filepaths:
dfs.append(read_csv(filepath))
Now you can call each dataframe from the list dfs as dfs[0], dfs[1], etc. and apply further processing downstream.
Some suggested improvements to your code:
The following line is all you need instead of those six lines.
df['Date_Time'] = df['timestamp'].dt.strftime('%d/%m/%Y %H%M')
Related
I want to read a list of CSV files, for example exon_kipan.00001.csv, exon_kipan.00002.csv, exon_kipan.00003.csv, and exon_kipan.00004.csv (24 files in total), and then perform a series of operations using pandas before concatenating the dataframes.
For a single file, I would do:
df= pd.read_csv("exon_kipan.csv", sep="\t", index_col=0, low_memory=False)
df= df[df.columns[::3]]
df= df.T
del df[df.columns[0]]
df.index = df.index.str.upper()
df= df.sort_index()
df.index = ['-'.join( s.split('-')[:4]) for s in df.index.tolist() ]
df.rename_axis(None, axis=1, inplace=True)
However, now I want to read, manipulate, and concatenate multiple files.
filename = '/work/exon_kipan.{}.csv'
df_dict = {}
exon_clin_list = []
for i in range(1, 25):
df_dict[i] = pd.read_csv(filename, sep="\t", index_col=0, low_memory=False)
df_dict[i] = df_dict[i][df_dict[i].columns[::3]]
df_dict[i] = df_dict[i].T
del df_dict[i][df_dict[i].columns[0]]
df_dict[i].index = df_dict[i].index.str.upper()
df_dict[i] = df_dict[i].sort_index()
df_dict[i].index = ['-'.join( s.split('-')[:4]) for s in df_dict[i].index.tolist() ]
df_dict[i].rename_axis(None, axis=1, inplace=True)
exon_clin_list.append(df_dict[i])
exon_clin = pd.concat(df_list)
My code raised:
FileNotFoundError: [Errno 2] No such file or directory: '/work/exon_kipan.{}.csv'
You have to use format method of str:
filename = '/work/exon_kipan.{:05}.csv' # <- don't forget to modify here
...
for i in range(1, 25):
df_dict[i] = pd.read_csv(filename.format(i), ...)
Test:
filename = '/work/exon_kipan.{:05}.csv'
for i in range(1, 25):
print(filename.format(i))
# Output
/work/exon_kipan.00001.csv
/work/exon_kipan.00002.csv
/work/exon_kipan.00003.csv
/work/exon_kipan.00004.csv
/work/exon_kipan.00005.csv
/work/exon_kipan.00006.csv
/work/exon_kipan.00007.csv
/work/exon_kipan.00008.csv
/work/exon_kipan.00009.csv
/work/exon_kipan.00010.csv
/work/exon_kipan.00011.csv
/work/exon_kipan.00012.csv
/work/exon_kipan.00013.csv
/work/exon_kipan.00014.csv
/work/exon_kipan.00015.csv
/work/exon_kipan.00016.csv
/work/exon_kipan.00017.csv
/work/exon_kipan.00018.csv
/work/exon_kipan.00019.csv
/work/exon_kipan.00020.csv
/work/exon_kipan.00021.csv
/work/exon_kipan.00022.csv
/work/exon_kipan.00023.csv
/work/exon_kipan.00024.csv
may be something like this will work
#write a function to read file do some processing and return a dataframe
def read_file_and_do_some_actions(filename):
df = pd.read_csv(filename, index_col=None, header=0)
#############################
#do some processing
#############################
return df
path = r'/home/tester/inputdata/exon_kipan'
all_files = glob.glob(os.path.join(path, "/work/exon_kipan.*.csv"))
#for each file in all_files list, call function read_file_and_do_some_actions and then concatenate all the dataframes into one dataframe
df = pd.concat((read_file_and_do_some_actions(f) for f in all_files), ignore_index=True)
I have a code that merges all txt files from a directory into a dataframe
follow the code below
import pandas as pd
import os
import glob
diretorio = "F:\PROJETOS\LOTE45\ARQUIVOS\RISK\RISK_CUSTOM_FUND_N1"
files = []
files = [pd.read_csv(file, delimiter='\t')
for file in glob.glob(os.path.join(diretorio ,"*.txt"))]
df = pd.concat(files, ignore_index=True)
df
that gives result to this table
I needed to add a date column to this table, but I only have the date available at the end of the filename.
How can I get the date at the end of the filename and put it inside the dataframe.
I have no idea how to do this
Assuming the file naming pattern is constant, you can parse the end of the filename for every iteration of the loop this way :-
from datetime import datetime
files = []
for file in glob.glob(os.path.join(diretorio ,"*.txt")):
df_f = pd.read_csv(file, delimiter='\t')
df_f['date'] = datetime.strptime(file[-11:-4], "%d%m%Y")
files.append(df_f)
df = pd.concat(files, ignore_index=True)
import pandas as pd
import os
diretorio = "F:/PROJETOS/LOTE45/ARQUIVOS/RISK/RISK_CUSTOM_FUND_N1/"
files = []
for filename in os.listdir(diretorio):
if filename.endswith(".csv"):
df = pd.read_csv(diretorio + filename, sep=";")
df['Date'] = filename.split('.')[0].split("_")[-1]
files.append(df)
df = pd.concat(files, ignore_index=True)
print(df)
I have over 100 CSV files, whose date is saved in the following format: 08/03/2020 to 08/03/2020
I have the following code, but before append (or after) option I would like to fix date format (I only need first part), delete duplicate dates and sort dates chronologically. This is excel equivalent: left(cell,10)
import pandas as pd
import glob
import os
path = 'C:\\Users\\test\\Desktop\\AA2020\\files\\'
# Change directory
os.chdir(path)
# Define dataframe that will append the data to
df = pd.DataFrame()
for file in glob.glob('*.csv'):
df_tmp = pd.read_csv(path + file, skiprows=range(1, 10))
df_tmp.columns = ['value']
date = df_tmp.loc['Date', 'value']
count = df_tmp.loc['Results Found', 'value']
df = df.append(pd.DataFrame([[date, count]], columns=['date', 'count']))
df.to_excel(path + "results\\count.xlsx")
I am scanning a directory of text files and adding them to a Pandas dataframe:
text_path = "/home/tdun0002/stash/cloud_scripts/aws_scripts/output_files/memory_stats/text/"
filelist = os.listdir(text_path)
final_df = pd.DataFrame()
for filename in filelist:
my_file = text_path + filename
try:
df = pd.read_csv(my_file, delim_whitespace=True, header=None)
final_df = final_df.append(df)
pd.options.display.max_rows
print(f"\n***Full Data Frame: {df}\n***")
Each file in the directory holds the memory of a server:
bastion001-memory.txt
permissions001-memory.txt
haproxy001-memory.txt
The contents of the files look something like this:
cat haproxy001-memory.txt
7706172
On each pass of adding the file, it reports this:
Data Frame: Empty DataFrame
Columns: [7706172]
Index: []
And when I print out the full data frame it only has the last entry:
***Full Data Frame:
Empty DataFrame
Columns: [7706172]
Index: []
***
Why is it reporting that the dataframe is empty? Why is it only showing the last file that was input? I think I may need to append the data.
2 things:
You need to provide header=None in pd.read_csv command to consider the value in text file as data. This is because by default, pandas assumes the first row to be header.
Since you are reading multiple files, you need to append each dataframe into another. Currently you are overwriting df on each iteration.
Code should be like:
text_path = "/home/tdun0002/stash/cloud_scripts/aws_scripts/output_files/memory_stats/text/"
filelist = os.listdir(text_path)
final_df = pd.DataFrame()
for filename in filelist:
my_file = text_path + filename
try:
df = pd.read_csv(my_file, delim_whitespace=True, header=None)
final_df = final_df.append(df)
print(f"Data Frame: {df}")
pd.options.display.max_rows
print(f"\n***Full Data Frame: {df}\n***")
I'm trying to read a bunch of CSV-files into a single pandas dataframe. Some of the CSVs have data for multiple dates. I want only the data from each CSV that has a date equal to the modification date of each file.
Here is my current attempt:
import os
import datetime
import pandas as pd
from pandas import Series, DataFrame
import glob as glob
path =r'C:xxx'
allFiles = glob.glob(path + "/*.csv")
frame = pd.DataFrame()
def modification_date(filename):
t = os.path.getmtime(filename)
return datetime.datetime.fromtimestamp(t).strftime('%Y-%m-%d')
list_ = []
for file_ in allFiles:
df = pd.read_csv(file_,index_col=None, header=0)
df["DATE"] = pd.to_datetime(df["DATE"], format='%Y-%m-%d')
filedate = modification_date(allFiles)
df = df[(df["DATE"] == filedate)]
list_.append(df)
frame = pd.concat(list_)
frame.reset_index(inplace=True, drop=True)
This fails because the loop here creates a list of modification dates (since the folder contains many CSV's) that the function modification_date can't handle. Error is: "TypeError: coercing to Unicode: need string or buffer, list found"
I'm trying to wrap my head around how to modify this so each CSV is evaluated separately but can't seem to get far.
I would do it this way:
import os
import glob
import pandas as pd
fmask = 'C:/Temp/.data/aaa/*.csv'
all_files = glob.glob(fmask)
# returns file's modification date (the time part will be truncated)
def get_mdate(filename):
return (pd.to_datetime(os.path.getmtime(filename), unit='s')
.replace(hour=0, minute=0, second=0, microsecond=0))
df = pd.concat([pd.read_csv(f, parse_dates=['DATE'])
.query('DATE == #get_mdate(#f)')
for f in all_files
],
ignore_index=True)
Test:
1.csv: # modification date: 2016-07-07
DATE,VAL
2016-07-06,10
2016-07-06,10
2016-07-05,10
2016-07-07,110
2016-07-07,110
2.csv: # modification date: 2016-07-05
DATE,VAL
2016-07-06,1
2016-07-06,1
2016-07-05,1
2016-07-07,11
2016-07-07,11
Result:
In [208]: %paste
df = pd.concat([pd.read_csv(f, parse_dates=['DATE'])
.query('DATE == #get_mdate(#f)')
for f in all_files
],
ignore_index=True)
## -- End pasted text --
In [209]: df
Out[209]:
DATE VAL
0 2016-07-07 110
1 2016-07-07 110
2 2016-07-05 1