I need to write a script that read all csv files with specific names from past 14 days (every day in the morning), but when I do concat this gives me a little cube (in jupyter-notebook), and that sign there is nothing.
def get_local_file(pdate, hour, path='/data/'):
"""Get date+hour processing file from local drive
:param pdate: str Processing date
:param hour: str Processing hour
:param path: str Path to file location
:return: Pandas DF Retrieved DataFrame
"""
sdate = pdate + '-' + str(hour)
for p_file in os.listdir(path):
if fnmatch.fnmatch(p_file, 'ABC_*'+sdate+'*.csv'):
return path+p_file
def get_files(pdate, path='/data/'):
hours = [time(i).strftime('%H') for i in range(24)]
fileList=[]
for hour in hours:
fileList.append(get_local_file(pdate, hour))
return fileList
end_datetime = datetime.combine(date.today(), time(0, 0, 0))
proc_datetime = end_datetime - timedelta(days=14)
while proc_datetime <= end_datetime:
proc_datetime += timedelta(days=1)
a = get_files(str(proc_datetime.date()).replace('-', '_'))
frame = pd.DataFrame()
list_ = []
for file_ in a:
if file_ != None:
df = pd.read_csv(file_,index_col=None, header=0, delimiter=';')
list_.append(df)
frame = pd.concat(list_)
I'm pretty sure that is possible to make code from while loop and below much simpler, but have no idea how to do it.
If you want to create a single dataframe from a bunch of .csv files you can do it this way:
initialize an empty list before the loop
loop over files, for every one read in a dataframe and append it to the list
concatenate the list into a single dataframe after the loop
I did not check if your handling of dates and filenames is correct but here are the relevant changes to your code regarding the concatenation part:
end_datetime = datetime.combine(date.today(), time(0, 0, 0))
proc_datetime = end_datetime - timedelta(days=14)
list_ = []
while proc_datetime <= end_datetime:
proc_datetime += timedelta(days=1)
a = get_files(str(proc_datetime.date()).replace('-', '_'))
for file_ in a:
if file_ != None:
df = pd.read_csv(file_, index_col=None, header=0, delimiter=';')
list_.append(df)
frame = pd.concat(list_)
import pandas
import glob
csvFiles = glob.glob(path + "/data/*.csv")
list_ = []
for file in csvFiles:
if ((datetime.combine(date.today(), time(0, 0, 0)) - datetime(*map(int, file.split("-")[1].split("_")))).days < 14)
df = pandas.read_csv(file, index_col=None, header=0, delimiter=';')
list_.append(df_f)
frame = pandas.concat(list_, ignore_index=True)
frame.to_csv("Appended File.csv")
Assuming the file path doesnot have any hyphen(-) characters in it.
Related
I'm looking to concatenate a bunch of csv files in the same directory that this code is ran in. I need the entire 'Date Time' column of these sheets to be in the format 'm/d/yyyy h:mm:ss.0' and I believe I just about have it.
Here is my current code (the format changing is at the very bottom):
import os
import pandas as pd
import glob
# returns all data after header in panda DataFrame
def skip_to(fle, line):
if os.stat(fle).st_size == 0:
raise ValueError("File is empty")
with open(fle, 'r') as f:
if check(fle, line):
pos = 0
cur_line = f.readline()
while not cur_line.startswith(line):
pos = f.tell()
# add current line to header dataframe
cur_line = f.readline()
f.seek(pos)
return pd.read_csv(f, parse_dates=[line], na_values=['Unknown'])
else:
return ""
def check(myfile, myline):
with open(myfile, 'r') as f:
datafile = f.readlines()
for line in datafile:
if myline in line:
return True
return False # finished the search without finding
# getting all csv files for concatenation
dir = os.getcwd()
files = [fn for fn in glob.glob(dir + '\**\\' + 'cdlog*.csv', recursive=True)]
df = pd.DataFrame()
for file in files:
if file.endswith('.csv'):
dp_data = skip_to(file, "Date Time")
if type(dp_data) != str:
dp_data.drop(0, axis=0, inplace=True)
df = pd.concat([df, dp_data], ignore_index=True, axis=0)
df['Date Time'] = pd.to_datetime(df['Date Time'])
df['Date Time'] = df['Date Time'].dt.strftime('%m/%d/%Y %H:%M:%S.0')
print(df['Date Time'])
# export to .csv
df.to_csv("test_output.csv")
With the print statement, I can see that it has it in the exact format that I'm looking for. When I check the newly created file, it is setting the format to 'mm:ss.0' instead. If I remove the '.0' from the end of the formatting, it sets it correctly in the new sheet, but it's only recording up to the minutes - it completely cuts off the seconds and I can't figure out why.
Example with having the '.0' at the end of the formatting:
Example without the '.0' at the end of the formatting:
pd.to_datetime() is outputting a datetime object, with witch you can work (to filter, to select ranges and so). If you want just a string formatted field, maybe you can apply dt.strftime() on the datetime object.
Example, once you got a datetime column by pd.to_datetime() you can apply:
df['Date Time'] = df['Date Time'].dt.strftime('%m/%d/%Y %H:%M:%S.0')
Original spreadsheets have 2 columns. I want to pick the rows by given criteria (according to months), and put them into new files.
The original files looked like:
The codes I am using:
import os
import pandas as pd
working_folder = "C:\\My Documents\\"
file_list = ["Jan.xlsx", "Feb.xlsx", "Mar.xlsx"]
with open(working_folder + '201703-1.csv', 'a') as f03:
for fl in file_list:
df = pd.read_excel(working_folder + fl)
df_201703 = df[df.ARRIVAL.between(20170301, 20170331)]
df_201703.to_csv(f03, header = True)
with open(working_folder + '201702-1.csv', 'a') as f02:
for fl in file_list:
df = pd.read_excel(working_folder + fl)
df_201702 = df[df.ARRIVAL.between(20170201, 20170231)]
df_201702.to_csv(f02, header = True)
with open(working_folder + '201701-1.csv', 'a') as f01:
for fl in file_list:
df = pd.read_excel(working_folder + fl)
df_201701 = df[df.ARRIVAL.between(20170101, 20170131)]
df_201701.to_csv(f01, header = True)
The results are like:
Improvements I want to make:
Save them as xlsx files instead of .csv
Not to have the first index columns
Keeping only 1 row (top) headers (now each csv has 3 rows of headers)
How can I do that? Thank you.
I think need create list of DataFrames, concat together and then write to file:
dfs1 = []
for fl in file_list:
df = pd.read_excel(working_folder + fl)
dfs1.append(df[df.ARRIVAL.between(20170101, 20170131)] )
pd.concat(dfs1).to_excel('201701-1.xlsx', index = False)
What should be simplify by list comprehension:
file_list = ["Jan.xlsx", "Feb.xlsx", "Mar.xlsx"]
dfs1 = [pd.read_excel(working_folder + fl).query('20170101 >= ARRIVAL >=20170131') for fl in file_list]
pd.concat(dfs1).to_excel('201701-1.xlsx', index = False)
I started off pulling all files in the folder and concatenating them, this one works:
warranty_list = []
warranty_files = glob.glob(os.path.join(qms, '*.csv'))
for file_ in warranty_files:
df = pd.read_csv(file_,index_col=None, header=0)
warranty_list.append(df)
warranty = pd.concat(warranty_list)
Then I had to write a function so I would only grab certain files and concatenate them, but this one is not working. I do not get an error but the last line is not being used, so I am not concatenating the files.
def get_warranty(years=5):
warranty_list = [] #list for glob.glob()
current_year = datetime.datetime.today().year #current year
last_n_years = [str(current_year-i) for i in range(0,years+1)]
for year in last_n_years:
warranty = glob.glob(os.path.join(qms, "Warranty Detail%s.csv" % year))
if warranty:
for file_ in warranty:
df = pd.read_csv(file_,index_col=None, header=0)
warranty_list.append(df)
warranty_df = pd.concat(warranty_list)
The last line isn't working presumably because the pd.concat() is getting a list as an input and it won't do anything with that. O don't understand why it worked in the first set of code and not this one.
I don't know how to change the function to get a data frame or how to change what I get at the end into a data frame.
Any suggestions?
I would suggest to use directly append because it do same thing as concat
So basically you start with an empty dataframe
warranty_df = pd.Dataframe()
And then append the the others dataframe to this while reading the file
So your function should remain the same but you need to delete the following line
warranty_df = pd.concat(warranty_list)
And after the loop, you return the warranty_df!
def get_warranty(years=5):
warranty_df = pd.Dataframe()
current_year = datetime.datetime.today().year #current year
last_n_years = [str(current_year-i) for i in range(0,years+1)]
for year in last_n_years:
warranty = glob.glob(os.path.join(qms, "Warranty Detail%s.csv" % year))
if warranty:
for file_ in warranty:
df = pd.read_csv(file_,index_col=None, header=0)
waranty_df = warranty_df.append(df)
return warranty_df
Here is my code:
path = 'C:\\Users\\Daniil\\Desktop\\dw_payments'
#list of all df:
all_files = glob.glob(path + '/*.csv')
all_payments_data = pd.DataFrame()
dfs = []
for file in all_files:
df = pd.read_csv(file,index_col = None,chunksize = 200000)
df_f = df[df['CUSTOMER_NO'] == 20069675]
df_f = pd.concat(df_f,ignore_index = True)
dfs.append(df_f)
all_payments_data = pd.concat(dfs)
As you see in the line df_f = df[df['CUSTOMER_NO'] == 20069675] i want to select the specific customer in one chunk and then merge it to the empty data frame. And I want to repeat the process many times(there are a lot of files).
But it throws me an error:
TypeError: 'TextFileReader' object is not subscriptable
How can i fix it?
I think you need iterate by TextFileReader, filter and append to df_s. Last only once concat.
Notice - Structure of all files has to be same (same columns names in same order)
df_s = []
for file in all_files:
txt = pd.read_csv(file,index_col = None,chunksize = 200000)
for df in txt:
df_s.append(df[df['CUSTOMER_NO'] == 20069675])
df_f = pd.concat(df_s,ignore_index = True)
I'm trying to write a dataframe to a .csv using df.to_csv(). For some reason, its only writing the last value (data for the last ticker). It reads through a list of tickers (turtle, all tickers are in first column) and spits out price data for each ticker. I can print all the data without a problem but can't seem to write to .csv. Any idea why? Thanks
input_file = pd.read_csv("turtle.csv", header=None)
for ticker in input_file.iloc[:,0].tolist():
data = web.DataReader(ticker, "yahoo", datetime(2011,06,1), datetime(2016,05,31))
data['ymd'] = data.index
year_month = data.index.to_period('M')
data['year_month'] = year_month
first_day_of_months = data.groupby(["year_month"])["ymd"].min()
first_day_of_months = first_day_of_months.to_frame().reset_index(level=0)
last_day_of_months = data.groupby(["year_month"])["ymd"].max()
last_day_of_months = last_day_of_months.to_frame().reset_index(level=0)
fday_open = data.merge(first_day_of_months,on=['ymd'])
fday_open = fday_open[['year_month_x','Open']]
lday_open = data.merge(last_day_of_months,on=['ymd'])
lday_open = lday_open[['year_month_x','Open']]
fday_lday = fday_open.merge(lday_open,on=['year_month_x'])
monthly_changes = {i:MonthlyChange(i) for i in range(1,13)}
for index,ym, openf,openl in fday_lday.itertuples():
month = ym.strftime('%m')
month = int(month)
diff = (openf-openl)/openf
monthly_changes[month].add_change(diff)
changes_df = pd.DataFrame([monthly_changes[i].get_data() for i in monthly_changes],columns=["Month","Avg Inc.","Inc","Avg.Dec","Dec"])
CSVdir = r"C:\Users\..."
realCSVdir = os.path.realpath(CSVdir)
if not os.path.exists(CSVdir):
os.makedirs(CSVdir)
new_file_name = os.path.join(realCSVdir,'PriceData.csv')
new_file = open(new_file_name, 'wb')
new_file.write(ticker)
changes_df.to_csv(new_file)
Use a for appending instead of wb because it overwrites the data in every iteration of loop.For different modes of opening a file see here.