Format Custom DateTime for Entire Column - python

I'm looking to concatenate a bunch of csv files in the same directory that this code is run in. I need the entire 'Date Time' column of these sheets to be in the format 'm/d/yyyy h:mm:ss.0', and I believe I just about have it.
Here is my current code (the format changing is at the very bottom):
import os
import pandas as pd
import glob

# returns all data after the header in a pandas DataFrame
def skip_to(fle, line):
    if os.stat(fle).st_size == 0:
        raise ValueError("File is empty")
    with open(fle, 'r') as f:
        if check(fle, line):
            pos = 0
            cur_line = f.readline()
            while not cur_line.startswith(line):
                pos = f.tell()
                # remember the offset of the line about to be read
                cur_line = f.readline()
            f.seek(pos)
            return pd.read_csv(f, parse_dates=[line], na_values=['Unknown'])
        else:
            return ""
else:
return ""
def check(myfile, myline):
    with open(myfile, 'r') as f:
        datafile = f.readlines()
    for line in datafile:
        if myline in line:
            return True
    return False  # finished the search without finding the header
# getting all csv files for concatenation
dir = os.getcwd()
files = [fn for fn in glob.glob(dir + '\\**\\cdlog*.csv', recursive=True)]

df = pd.DataFrame()
for file in files:
    if file.endswith('.csv'):
        dp_data = skip_to(file, "Date Time")
        if type(dp_data) != str:
            dp_data.drop(0, axis=0, inplace=True)
            df = pd.concat([df, dp_data], ignore_index=True, axis=0)

df['Date Time'] = pd.to_datetime(df['Date Time'])
df['Date Time'] = df['Date Time'].dt.strftime('%m/%d/%Y %H:%M:%S.0')
print(df['Date Time'])

# export to .csv
df.to_csv("test_output.csv")
With the print statement, I can see the column in the exact format I'm looking for. When I check the newly created file, though, it shows the format as 'mm:ss.0' instead. If I remove the '.0' from the end of the format string, the new sheet displays it correctly, but only up to the minutes; the seconds are cut off entirely and I can't figure out why.
Example with the '.0' at the end of the format string: (screenshot omitted)
Example without the '.0' at the end of the format string: (screenshot omitted)

pd.to_datetime() outputs a datetime object, with which you can work (to filter, to select ranges, and so on). If you want just a string-formatted field, you can apply dt.strftime() to the datetime object.
For example, once you have a datetime column via pd.to_datetime(), you can apply:
df['Date Time'] = df['Date Time'].dt.strftime('%m/%d/%Y %H:%M:%S.0')
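As a quick sanity check, here is a minimal self-contained sketch (with made-up timestamps, not the asker's real data) showing that after dt.strftime() the column holds plain strings, which to_csv() writes verbatim; reading the raw file back confirms the full format is in the file:

import pandas as pd

# Minimal sketch with made-up timestamps, not the asker's real data.
df = pd.DataFrame({'Date Time': ['2021-03-01 09:15:30', '2021-03-01 09:16:45']})
df['Date Time'] = pd.to_datetime(df['Date Time'])
df['Date Time'] = df['Date Time'].dt.strftime('%m/%d/%Y %H:%M:%S.0')
df.to_csv('test_output.csv')

# Read the raw text back: the full 'm/d/yyyy h:mm:ss.0' strings are there.
# If a spreadsheet shows 'mm:ss.0', that is the spreadsheet's own display
# formatting, not the contents of the file.
with open('test_output.csv') as f:
    print(f.read())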

Related

Delete CSV file if missing specific column using python

Currently my code looks into CSV files in a folder and replaces strings based on whether the file has the column 'PROD_NAME' in its data. If it doesn't have the column 'PROD_NAME', I'm trying to delete those files from the folder. With a little debugging I can get my code to print which csv files do not have the column, but I can't figure out how to actually delete or remove them from their folder. I have tried an if statement that calls os.remove(), and still nothing happens. No errors or anything; it just finishes the script with all the files still in the folder. Here is my code. Any help is appreciated. Thanks!
def worker():
    filenames = glob.glob(dest_dir + '\\*.csv')
    print("Finding all files with column PROD_NAME")
    time.sleep(3)
    print("Changing names of products in these tables...")
    for filename in filenames:
        my_file = Path(os.path.join(dest_dir, filename))
        try:
            with open(filename):
                # read column headers only, to get the list of columns
                df1 = pd.read_csv(filename, skiprows=1, encoding='ISO-8859-1')
                dtypes = {}
                for col in df1.columns:  # make all columns text, to avoid formatting errors
                    dtypes[col] = 'str'
                df1 = pd.read_csv(filename, dtype=dtypes, skiprows=1, encoding='ISO-8859-1')
                if 'PROD_NAME' not in df1.columns:
                    os.remove(filename)
                # replaces text in files
                if 'PROD_NAME' in df1.columns:
                    df1 = df1.replace("NABVCI", "CLEAR_BV")
                    df1 = df1.replace("NAMVCI", "CLEAR_MV")
                    df1 = df1.replace("NA_NRF", "FA_GUAR")
                    df1 = df1.replace("N_FPFA", "FA_FLEX")
                    df1 = df1.replace("NAMRFT", "FA_SECURE_MVA")
                    df1 = df1.replace("NA_RFT", "FA_SECURE")
                    df1 = df1.replace("NSPFA7", "FA_PREFERRED")
                    df1 = df1.replace("N_ENHA", "FA_ENHANCE")
                    df1 = df1.replace("N_FPRA", "FA_FLEX_RETIRE")
                    df1 = df1.replace("N_SELF", "FA_SELECT")
                    df1 = df1.replace("N_SFAA", "FA_ADVANTAGE")
                    df1 = df1.replace("N_SPD1", "FA_SPD1")
                    df1 = df1.replace("N_SPD2", "FA_SPD2")
                    df1 = df1.replace("N_SPFA", "FA_LIFESTAGES")
                    df1 = df1.replace("N_SPPF", "FA_PLUS")
                    df1 = df1.replace("N__CFA", "FA_CHOICE")
                    df1 = df1.replace("N__OFA", "FA_OPTIMAL")
                    df1 = df1.replace("N_SCNI", "FA_SCNI")
                    df1 = df1.replace("NASCI_", "FA_SCI")
                    df1 = df1.replace("NASSCA", "FA_SSC")
                    df1.to_csv(filename, index=False, quotechar="'")
        except:
            if 'PROD_NAME' in df1.columns:
                print("Could not find string to replace in this file: " + filename)

worker()
Written below is a block of code that reads the raw csv data. It extracts the first row of data (containing the column names) and looks for the column name PROD_NAME: if it finds it, it sets found to True, otherwise False. To avoid trying to delete the file while it is open, the removal is done outside of the open() block.
import os

filename = "test.csv"
with open(filename) as f:  # any code executed in here runs while the file is open
    # .strip() removes the trailing newline so the last column name matches too;
    # replace "PROD_NAME" with the string you are looking for
    if "PROD_NAME" in f.readlines()[0].strip().split(","):
        print("found")
        found = True
    else:
        print("not found")
        found = False

if not found:
    os.remove(filename)
else:
    pass  # carry out replacements here / load it in pandas
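To apply the same check to every file in a folder, as the question does, the snippet can sit inside a glob loop. A minimal sketch, assuming dest_dir is the question's directory variable:

import glob
import os

dest_dir = r'C:\path\to\csvs'  # placeholder path, not the real directory
for filename in glob.glob(os.path.join(dest_dir, '*.csv')):
    with open(filename) as f:
        header = f.readline().strip().split(',')
    if 'PROD_NAME' not in header:
        os.remove(filename)  # the file is closed by now, so removal succeeds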

Read multiple csv files into separate dataframes loop

I am reading multiple CSV files and reformatting them. I have developed this code which reads a single file. However, I am wondering if I can loop the process to read multiple files into separate dataframes and then work on those dataframes to format and rewrite a csv file.
import pandas as pd

station_id = 'id.csv'
input_file = 'filename.txt'
unformatted = 'C:/Users/....../Unformatted/'
formatted = 'C:/....../Formatted/'

print(f'\nReading data file: {input_file}.')

fields = {
    'Timestamp': 'timestamp',
    # 'Sample Point Name': 'station_name',
    # 'Sample Point Name Description': 'station_description',
    # 'Start Date': 'state_date',
    'PM10 (1h) Validated': 'PM_1h_10_ug_m3',
    'PM10 Validated': 'PM_10_ug_m3',
    # 'PM2.5 (1h) Final': 'pm_25',
    # 'PM2.5 Final': 'pm2.5_ug_m3'
}

df = pd.read_table(unformatted + input_file, usecols=fields.keys(), sep='\t', encoding='utf-16')
df.rename(columns=fields, inplace=True)
df.loc[:, 'timestamp'] = pd.to_datetime(df['timestamp'], dayfirst=True)
df['date'] = df['timestamp']
df['time'] = df['timestamp']
df['date'] = df['date'].dt.strftime('%d/%m/%Y')
df['time'] = df['time'].apply(lambda z: z.strftime('%H%M'))
df['Date_Time'] = df['date'] + ' ' + df['time']
df.drop(['timestamp', 'date', 'time'], axis=1, inplace=True)
df = df[['Date_Time', 'PM_1h_10_ug_m3', 'PM_10_ug_m3']]

availability_PM_1h = df['PM_1h_10_ug_m3'].count() / df['Date_Time'].count() * 100
availability_PM_10_min = df['PM_10_ug_m3'].count() / df['Date_Time'].count() * 100

# check for NaN values
PM10_nan = df['PM_10_ug_m3'].isnull().sum()
PM10_1h_nan = df['PM_1h_10_ug_m3'].isnull().sum()
print('Count of PM10 NaN: ' + str(PM10_nan))
print('Count of PM10_1h NaN: ' + str(PM10_1h_nan))

df.to_csv(formatted + station_id, index=False)
Say you wrap the whole code for whatever you want to do with a single file in a function, like read_csv(filepath) below. Then your code for multiple files will look like:
import os
import pandas as pd
from typing import List

def read_csv(filepath: str, *args, **kwargs) -> pd.DataFrame:
    """Reads a single csv file and processes it before returning
    a `pandas.DataFrame`.
    """
    # your logic for a single file goes here
    df = pd.read_csv(os.path.abspath(filepath))
    # further processing steps for a single file...
    # ...
    return df

# filepaths: the variable storing the paths to all files, as a list
# define a list to store the dataframes
dfs: List[pd.DataFrame] = []

# run a loop to read and store the dataframes in the list: dfs
for filepath in filepaths:
    dfs.append(read_csv(filepath))
Now you can call each dataframe from the list dfs as dfs[0], dfs[1], etc. and apply further processing downstream.
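The filepaths list itself can be built with glob. A short sketch, reusing the read_csv helper above; the folder and pattern are placeholders, not the question's real paths:

import glob
from typing import List
import pandas as pd

# Collect every unformatted input file; adjust the folder and extension
# to match your 'Unformatted' directory (placeholder below).
filepaths = glob.glob('C:/Users/me/Unformatted/*.txt')

# Read each file with the helper defined above.
dfs: List[pd.DataFrame] = [read_csv(fp) for fp in filepaths]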
Some suggested improvements to your code:
The following line is all you need instead of the six lines that build the Date_Time column:
df['Date_Time'] = df['timestamp'].dt.strftime('%d/%m/%Y %H%M')

How to filter a csv file based on datetime?

I have a csv file which looks like this (obviously in reality it's much bigger):
1,$1,AA,GG,DD,2020-01-01T00:01:10.740+02:00
2,$2,A1,FD,HH,2020-01-01T00:02:00.240+02:00
3,$3,1A,PP,LL,2020-01-01T00:03:30.460+02:00
4,$4,S1,LL,SS,2020-02-01T00:01:11.190+02:00
5,$5,2G,PP,FF,2020-01-01T00:04:20.320+02:00
6,$6,5S,LL,TT,2020-02-01T01:02:15.180+02:00
I need to take the first row, take that date, and check whether the rest of the rows fall on the same day, i.e. between 0:00:00.000 and 23:59:59.999 of it. Put simply: I want all the rows that share the date of the first row.
This is my desired outcome:
1,$1,AA,GG,DD,2020-01-01T00:01:10.740+02:00
2,$2,A1,FD,HH,2020-01-01T00:02:00.240+02:00
3,$3,1A,PP,LL,2020-01-01T00:03:30.460+02:00
5,$5,2G,PP,FF,2020-01-01T00:04:20.320+02:00
This is my code:
import csv
from datetime import datetime

root = r'c:\data\FF\Desktop\my_files\file01.txt'
with open(root, 'r') as my_file:
    reader = csv.reader(my_file)

def filter_row():
    for row in reader:
        date_time = row[5]  # <--- extract the datetime
        fdate_time = datetime.strptime(date_time, '%Y-%m-%dT%H:%M:%S.%f%z')  # <--- make a datetime object of it
        x = fdate_time.date()  # <--- extract the y/m/d
        begin_time = datetime.strptime(x + '00:00.00+02:00', '%Y-%m-%dT%H:%M:%S.%f%z')  # <--- fix the start time of a day
        end_time = datetime.strptime(x + '23:59:59.999+02:00', '%Y-%m-%dT%H:%M:%S.%f%z')  # <--- fix the end time of a day
        filtered_records = fdate_time >= begin_time and fdate_time <= end_time  # <--- filter everything between the start and end time
    return filtered_records

filter_row()
When I run the above code I receive:
File "C:\data\FF\Desktop\Python\My_python\Filter_csv.py", line 82, in filter_row
for row in reader:
ValueError: I/O operation on closed file.
I'm really at a loss, as I don't know how to fix this. I looked for multiple solutions but couldn't find any. I hope someone can tell and show me how it works. Thank you all.
The context management provided by with ensures that the resources are freed at the end of the block. That means everything should be read inside the with block.
A simple way would be to parameterize the function:
root = r'c:\data\FF\Desktop\my_files\file01.txt'

def filter_row(reader):
    for row in reader:
        ...
    return filtered_records

with open(root, 'r') as my_file:
    reader = csv.reader(my_file)
    filter_row(reader)
But:
you should use the datetime.replace method to compute the start and end of the day instead of using strings
if you want to write the lines to a new file, you should change filter_row into a generator:
import csv
from datetime import datetime

root = r'c:\data\FF\Desktop\my_files\file01.txt'
newf = r'c:\data\FF\Desktop\my_files\file01.csv'

def filter_row(reader):
    first = True
    for row in reader:
        date_time = row[5]  # <--- extract the datetime
        fdate_time = datetime.strptime(date_time, '%Y-%m-%dT%H:%M:%S.%f%z')  # <--- make a datetime object of it
        if first:  # special processing for the first line
            first = False
            begin_time = fdate_time.replace(hour=0, minute=0, second=0, microsecond=0)  # <--- the start time of the day
            end_time = fdate_time.replace(hour=23, minute=59, second=59, microsecond=999999)  # <--- the end time of the day
            yield row  # yield the first row
        elif begin_time <= fdate_time <= end_time:  # <--- filter everything between the start and end time
            yield row  # and rows of the same date

# newline='' (rather than newline=None) keeps csv.writer from inserting
# blank lines on Windows
with open(root) as my_file, open(newf, 'w', newline='') as new_file:
    reader = csv.reader(my_file)
    writer = csv.writer(new_file)
    writer.writerows(filter_row(reader))
I can suggest doing it with pandas:
Read the file into a dataframe using pandas.
Then restrict the rows to the date of the first row (filter the records into a different dataframe).
The new dataframe will have the desired output.
Pandas will also give you easy scalability in case your file size increases in the future. A sketch of this approach follows.
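A minimal sketch of that pandas approach, assuming the sample layout above (no header row, the timestamp in column 5; the filename is a placeholder):

import pandas as pd

# No header row in the sample data, hence header=None.
df = pd.read_csv('file01.txt', header=None)

# Parse the ISO-8601 timestamps in column 5 into a separate series, so the
# original strings are preserved on output.
ts = pd.to_datetime(df[5])

# Keep only the rows whose date matches the date of the first row.
same_day = df[ts.dt.date == ts.dt.date.iloc[0]]
same_day.to_csv('file01_filtered.csv', header=False, index=False)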

Read and concat csv files from past 14 days in python pandas

I need to write a script that reads all csv files with specific names from the past 14 days (every day in the morning), but when I do the concat, all I get in jupyter-notebook is a little empty square, i.e. the result contains nothing.
def get_local_file(pdate, hour, path='/data/'):
    """Get date+hour processing file from local drive

    :param pdate: str Processing date
    :param hour: str Processing hour
    :param path: str Path to file location
    :return: Pandas DF Retrieved DataFrame
    """
    sdate = pdate + '-' + str(hour)
    for p_file in os.listdir(path):
        if fnmatch.fnmatch(p_file, 'ABC_*' + sdate + '*.csv'):
            return path + p_file

def get_files(pdate, path='/data/'):
    hours = [time(i).strftime('%H') for i in range(24)]
    fileList = []
    for hour in hours:
        fileList.append(get_local_file(pdate, hour))
    return fileList

end_datetime = datetime.combine(date.today(), time(0, 0, 0))
proc_datetime = end_datetime - timedelta(days=14)

while proc_datetime <= end_datetime:
    proc_datetime += timedelta(days=1)
    a = get_files(str(proc_datetime.date()).replace('-', '_'))
    frame = pd.DataFrame()
    list_ = []
    for file_ in a:
        if file_ != None:
            df = pd.read_csv(file_, index_col=None, header=0, delimiter=';')
            list_.append(df)
    frame = pd.concat(list_)
I'm pretty sure that is possible to make code from while loop and below much simpler, but have no idea how to do it.
If you want to create a single dataframe from a bunch of .csv files you can do it this way:
initialize an empty list before the loop
loop over files, for every one read in a dataframe and append it to the list
concatenate the list into a single dataframe after the loop
I did not check if your handling of dates and filenames is correct but here are the relevant changes to your code regarding the concatenation part:
end_datetime = datetime.combine(date.today(), time(0, 0, 0))
proc_datetime = end_datetime - timedelta(days=14)

list_ = []
while proc_datetime <= end_datetime:
    proc_datetime += timedelta(days=1)
    a = get_files(str(proc_datetime.date()).replace('-', '_'))
    for file_ in a:
        if file_ != None:
            df = pd.read_csv(file_, index_col=None, header=0, delimiter=';')
            list_.append(df)

frame = pd.concat(list_)
import glob
from datetime import datetime, date, time

import pandas

csvFiles = glob.glob(path + "/data/*.csv")  # path: base directory, as in the question
list_ = []
for file in csvFiles:
    # parse the date embedded in the filename (yyyy_mm_dd after the hyphen)
    if (datetime.combine(date.today(), time(0, 0, 0)) - datetime(*map(int, file.split("-")[1].split("_")))).days < 14:
        df = pandas.read_csv(file, index_col=None, header=0, delimiter=';')
        list_.append(df)
frame = pandas.concat(list_, ignore_index=True)
frame.to_csv("Appended File.csv")
This assumes the file path does not contain any other hyphen (-) characters.
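If parsing dates out of filenames proves brittle, an alternative not taken from the answers above is to select files by their modification time. A short sketch; the folder pattern is a placeholder:

import glob
import os
import time

import pandas as pd

# Keep csv files modified within the last 14 days, judged by file mtime
# rather than by a date embedded in the filename.
cutoff = time.time() - 14 * 24 * 3600
recent = [f for f in glob.glob('/data/*.csv') if os.path.getmtime(f) >= cutoff]

frame = pd.concat((pd.read_csv(f, delimiter=';') for f in recent), ignore_index=True)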

Dataframe to .csv - is only writing last value - Python/Pandas

I'm trying to write a dataframe to a .csv using df.to_csv(). For some reason, it's only writing the last value (the data for the last ticker). It reads through a list of tickers (turtle.csv, all tickers are in the first column) and spits out price data for each ticker. I can print all the data without a problem but can't seem to write it to .csv. Any idea why? Thanks
input_file = pd.read_csv("turtle.csv", header=None)
for ticker in input_file.iloc[:, 0].tolist():
    data = web.DataReader(ticker, "yahoo", datetime(2011, 6, 1), datetime(2016, 5, 31))
    data['ymd'] = data.index
    year_month = data.index.to_period('M')
    data['year_month'] = year_month
    first_day_of_months = data.groupby(["year_month"])["ymd"].min()
    first_day_of_months = first_day_of_months.to_frame().reset_index(level=0)
    last_day_of_months = data.groupby(["year_month"])["ymd"].max()
    last_day_of_months = last_day_of_months.to_frame().reset_index(level=0)
    fday_open = data.merge(first_day_of_months, on=['ymd'])
    fday_open = fday_open[['year_month_x', 'Open']]
    lday_open = data.merge(last_day_of_months, on=['ymd'])
    lday_open = lday_open[['year_month_x', 'Open']]
    fday_lday = fday_open.merge(lday_open, on=['year_month_x'])
    monthly_changes = {i: MonthlyChange(i) for i in range(1, 13)}
    for index, ym, openf, openl in fday_lday.itertuples():
        month = ym.strftime('%m')
        month = int(month)
        diff = (openf - openl) / openf
        monthly_changes[month].add_change(diff)
    changes_df = pd.DataFrame([monthly_changes[i].get_data() for i in monthly_changes],
                              columns=["Month", "Avg Inc.", "Inc", "Avg.Dec", "Dec"])
    CSVdir = r"C:\Users\..."
    realCSVdir = os.path.realpath(CSVdir)
    if not os.path.exists(CSVdir):
        os.makedirs(CSVdir)
    new_file_name = os.path.join(realCSVdir, 'PriceData.csv')
    new_file = open(new_file_name, 'wb')
    new_file.write(ticker)
    changes_df.to_csv(new_file)
Use 'a' (append) instead of 'wb', because 'wb' overwrites the data on every iteration of the loop. For the different modes of opening a file, see the open() documentation.
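A minimal sketch of that fix (the tickers and frames below are placeholders, not the question's real data): opening the output in append mode adds each ticker's block to the file instead of replacing it.

import pandas as pd

# Placeholder data standing in for the per-ticker changes_df frames.
frames = {
    'AAPL': pd.DataFrame({'Month': [1], 'Avg Inc.': [0.02]}),
    'MSFT': pd.DataFrame({'Month': [1], 'Avg Inc.': [0.01]}),
}

# 'a' appends on every write; 'wb' would truncate the file each time.
with open('PriceData.csv', 'a', newline='') as new_file:
    for ticker, changes_df in frames.items():
        new_file.write(ticker + '\n')  # label the block with its ticker
        changes_df.to_csv(new_file, index=False)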
