Column appended to dataframe coming up empty - python

I have the following code:
import glob
import os

import pandas as pd

myList = []
path = "/home/reallymemorable/Documents/git/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports_us/*.csv"

for fname in glob.glob(path):
    df = pd.read_csv(fname)
    row = df.loc[df['Province_State'] == 'Pennsylvania']
    dateFromFilename = os.path.basename(fname).replace('.csv', '')
    fileDate = pd.DataFrame({'Date': [dateFromFilename]})
    myList.append(row.join(fileDate))

concatList = pd.concat(myList, sort=True)
print(concatList)
concatList.to_csv('/home/reallymemorable/Documents/test.csv', index=False, header=True)
It goes through a folder of CSVs, grabs a specific row from each, and writes them all to a single CSV. The files themselves have names like 10-10-2020.csv. I have some code in there that gets the filename and removes the file extension, so I am left with the date alone.
I am trying to add another column called "Date" that contains the filename for each file.
The script almost works: it gives me a CSV of all the rows I pulled out of the various CSVs, but the Date column itself is empty.
If I do print(dateFromFilename), the date/filename prints as expected (e.g. 10-10-2020).
What am I doing wrong?

join uses how='left' by default, and your fileDate dataframe has a different index than row (fileDate is indexed 0, while row keeps its original row label), so the join finds no matching label and the Date comes out empty. Instead, do an assignment:
for fname in glob.glob(path):
    df = pd.read_csv(fname)
    row = df.loc[df['Province_State'] == 'Pennsylvania']
    dateFromFilename = os.path.basename(fname).replace('.csv', '')
    myList.append(row.assign(Date=dateFromFilename))

concatList = pd.concat(myList, sort=True)
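As a side note, pathlib can do the extension stripping for you; a minimal equivalent for the dateFromFilename line:

from pathlib import Path

dateFromFilename = Path(fname).stem  # '10-10-2020.csv' -> '10-10-2020'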
Another way is to store the dataframes as a dictionary, then concat:
myList = dict()
for fname in glob.glob(path):
    df = pd.read_csv(fname)
    row = df.loc[df['Province_State'] == 'Pennsylvania']
    dateFromFilename = os.path.basename(fname).replace('.csv', '')
    myList[dateFromFilename] = row

concatList = pd.concat(myList, sort=True)
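One caveat with the dict approach: pd.concat puts the dict keys into the row index (the outer level of a MultiIndex), not into a column. If you want an actual Date column, a small follow-up sketch using concat's names parameter to label that index level:

concatList = pd.concat(myList, sort=True, names=['Date'])
concatList = concatList.reset_index(level='Date')  # promote the key level to a column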

Related

Delete CSV file if missing specific column using python

Currently my code looks at the CSV files in a folder and replaces strings if the file has the column 'PROD_NAME' in its data. If a file doesn't have the column 'PROD_NAME', I'm trying to delete it from the folder. With a little debugging I can get my code to print which CSV files do not have the column, but I can't figure out how to actually delete or remove them from the folder they are in. I have tried an if statement that calls os.remove(), and still nothing happens. No errors or anything; the script just finishes with all the files still in the folder. Here is my code. Any help is appreciated. Thanks!
import glob
import os
import time
from pathlib import Path

import pandas as pd

def worker():
    filenames = glob.glob(dest_dir + '\\*.csv')
    print("Finding all files with column PROD_NAME")
    time.sleep(3)
    print("Changing names of products in these tables...")
    for filename in filenames:
        my_file = Path(os.path.join(dest_dir, filename))
        try:
            with open(filename):
                # read column header only - to get the list of columns
                df1 = pd.read_csv(filename, skiprows=1, encoding='ISO-8859-1')
                dtypes = {}
                for col in df1.columns:  # make all columns text, to avoid formatting errors
                    dtypes[col] = 'str'
                df1 = pd.read_csv(filename, dtype=dtypes, skiprows=1, encoding='ISO-8859-1')
                if 'PROD_NAME' not in df1.columns:
                    os.remove(filename)
                # Replaces text in files
                if 'PROD_NAME' in df1.columns:
                    df1 = df1.replace("NABVCI", "CLEAR_BV")
                    df1 = df1.replace("NAMVCI", "CLEAR_MV")
                    df1 = df1.replace("NA_NRF", "FA_GUAR")
                    df1 = df1.replace("N_FPFA", "FA_FLEX")
                    df1 = df1.replace("NAMRFT", "FA_SECURE_MVA")
                    df1 = df1.replace("NA_RFT", "FA_SECURE")
                    df1 = df1.replace("NSPFA7", "FA_PREFERRED")
                    df1 = df1.replace("N_ENHA", "FA_ENHANCE")
                    df1 = df1.replace("N_FPRA", "FA_FLEX_RETIRE")
                    df1 = df1.replace("N_SELF", "FA_SELECT")
                    df1 = df1.replace("N_SFAA", "FA_ADVANTAGE")
                    df1 = df1.replace("N_SPD1", "FA_SPD1")
                    df1 = df1.replace("N_SPD2", "FA_SPD2")
                    df1 = df1.replace("N_SPFA", "FA_LIFESTAGES")
                    df1 = df1.replace("N_SPPF", "FA_PLUS")
                    df1 = df1.replace("N__CFA", "FA_CHOICE")
                    df1 = df1.replace("N__OFA", "FA_OPTIMAL")
                    df1 = df1.replace("N_SCNI", "FA_SCNI")
                    df1 = df1.replace("NASCI_", "FA_SCI")
                    df1 = df1.replace("NASSCA", "FA_SSC")
                    df1.to_csv(filename, index=False, quotechar="'")
        except:
            if 'PROD_NAME' in df1.columns:
                print("Could not find string to replace in this file: " + filename)

worker()
Written below is a block of code that reads the raw csv data. It extracts the first row of data (containing the column names) and looks for the column name PROD_NAME. If it finds it, it sets found to True. Else, it sets found to False. To prevent trying to delete the files whilst open, the removal is done outside of the open().
import os

filename = "test.csv"

with open(filename) as f:  # Any code executed in here runs while the file is open
    # readline() grabs the header row; strip() drops the trailing newline so the last
    # column name matches too. Replace "PROD_NAME" with the string you are looking for.
    if "PROD_NAME" in f.readline().strip().split(","):
        print("found")
        found = True
    else:
        print("not found")
        found = False

if not found:
    os.remove(filename)
else:
    pass  # Carry out replacements here / load it in pandas
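To run the same check over a whole folder, a minimal sketch (assuming comma-delimited files whose column names are on the first line; the folder path here is a placeholder):

import glob
import os

for filename in glob.glob(os.path.join("your_folder", "*.csv")):  # placeholder path
    with open(filename) as f:
        header = f.readline().strip().split(",")
    if "PROD_NAME" not in header:
        os.remove(filename)  # the with-block has exited, so the file is already closed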

Column with end of file name python

I have code that merges all the txt files from a directory into a dataframe. Here it is:
import glob
import os

import pandas as pd

diretorio = r"F:\PROJETOS\LOTE45\ARQUIVOS\RISK\RISK_CUSTOM_FUND_N1"

files = [pd.read_csv(file, delimiter='\t')
         for file in glob.glob(os.path.join(diretorio, "*.txt"))]
df = pd.concat(files, ignore_index=True)
df
That gives me the merged table I want. I also need to add a date column to it, but the date is only available at the end of each filename. How can I get the date from the end of the filename and put it into the dataframe? I have no idea how to do this.
Assuming the file naming pattern is constant, you can parse the end of the filename on every iteration of the loop this way:
from datetime import datetime

files = []
for file in glob.glob(os.path.join(diretorio, "*.txt")):
    df_f = pd.read_csv(file, delimiter='\t')
    # assumes names ending in DDMMYYYY.txt: the 8 characters before ".txt" are the date
    df_f['date'] = datetime.strptime(file[-12:-4], "%d%m%Y")
    files.append(df_f)

df = pd.concat(files, ignore_index=True)
Another way, splitting the date out of the filename as a plain string:

import os

import pandas as pd

diretorio = "F:/PROJETOS/LOTE45/ARQUIVOS/RISK/RISK_CUSTOM_FUND_N1/"

files = []
for filename in os.listdir(diretorio):
    if filename.endswith(".csv"):
        df = pd.read_csv(diretorio + filename, sep=";")
        df['Date'] = filename.split('.')[0].split("_")[-1]  # text after the last "_", minus the extension
        files.append(df)

df = pd.concat(files, ignore_index=True)
print(df)

Python adding to Pandas dataframe replaces info

I am scanning a directory of text files and adding them to a Pandas dataframe:
import os

import pandas as pd

text_path = "/home/tdun0002/stash/cloud_scripts/aws_scripts/output_files/memory_stats/text/"
filelist = os.listdir(text_path)
final_df = pd.DataFrame()

for filename in filelist:
    my_file = text_path + filename
    try:
        df = pd.read_csv(my_file, delim_whitespace=True, header=None)
        final_df = final_df.append(df)
    except Exception:
        pass  # except clause not shown in the original snippet

pd.options.display.max_rows
print(f"\n***Full Data Frame: {df}\n***")
Each file in the directory holds the memory of a server:
bastion001-memory.txt
permissions001-memory.txt
haproxy001-memory.txt
The contents of the files look something like this:
cat haproxy001-memory.txt
7706172
On each pass of adding the file, it reports this:
Data Frame: Empty DataFrame
Columns: [7706172]
Index: []
And when I print out the full data frame it only has the last entry:
***Full Data Frame:
Empty DataFrame
Columns: [7706172]
Index: []
***
Why is it reporting that the dataframe is empty? Why is it only showing the last file that was input? I think I may need to append the data.
Two things:
1. You need to provide header=None in the pd.read_csv call so the value in the text file is treated as data; by default, pandas assumes the first row is the header.
2. Since you are reading multiple files, you need to append each dataframe onto the accumulated one. Currently you are overwriting df on each iteration.
Code should be like:
text_path = "/home/tdun0002/stash/cloud_scripts/aws_scripts/output_files/memory_stats/text/"
filelist = os.listdir(text_path)
final_df = pd.DataFrame()

for filename in filelist:
    my_file = text_path + filename
    try:
        df = pd.read_csv(my_file, delim_whitespace=True, header=None)
        final_df = final_df.append(df)
        print(f"Data Frame: {df}")
    except Exception:
        pass  # except clause not shown in the original snippet

pd.options.display.max_rows
print(f"\n***Full Data Frame: {final_df}\n***")  # print the accumulated frame, not the last df
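Note that DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0. On current pandas, the usual pattern is to collect the frames in a list and concatenate once; a sketch of the same loop in that style:

import os

import pandas as pd

text_path = "/home/tdun0002/stash/cloud_scripts/aws_scripts/output_files/memory_stats/text/"
frames = []
for filename in os.listdir(text_path):
    try:
        # header=None keeps the single value as data rather than a header
        frames.append(pd.read_csv(text_path + filename,
                                  delim_whitespace=True, header=None))
    except Exception:
        pass  # skip unreadable files

final_df = pd.concat(frames, ignore_index=True)
print(f"\n***Full Data Frame: {final_df}\n***")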

Adding file name in a Column while merging multiple csv files to pandas - Python

I have multiple csv files in the same folder with all the same data columns,
20100104 080100;5369;5378.5;5365;5378;2368
20100104 080200;5378;5385;5377;5384.5;652
20100104 080300;5384.5;5391.5;5383;5390;457
20100104 080400;5390.5;5391;5387;5389.5;392
I want to merge the csv files into pandas and add a column with the file name to each line, so I can track where it came from later. There seem to be similar threads, but I haven't been able to adapt any of the solutions. This is what I have so far. The merging into one data frame works, but I'm stuck on adding the file-name column:
import glob
import os

import pandas as pd

path = r'/filepath/'
all_files = glob.glob(os.path.join(path, "*.csv"))
names = [os.path.basename(x) for x in all_files]
list_ = []

for file_ in all_files:
    list_.append(pd.read_csv(file_, sep=';', parse_dates=[0],
                             infer_datetime_format=True, header=None))
df = pd.concat(list_)
Instead of using a list just use DataFrame's append.
df = pd.DataFrame()
for file_ in all_files:
    file_df = pd.read_csv(file_, sep=';', parse_dates=[0],
                          infer_datetime_format=True, header=None)
    file_df['file_name'] = file_
    df = df.append(file_df)
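A caveat: DataFrame.append was removed in pandas 2.0, so on current versions the same idea can be written with a list and a single concat; assign attaches the filename column (a sketch, reusing all_files from above; basename keeps just the file name, use file_ itself if you want the full path):

import os

import pandas as pd

frames = [pd.read_csv(file_, sep=';', parse_dates=[0],
                      infer_datetime_format=True, header=None)
            .assign(file_name=os.path.basename(file_))
          for file_ in all_files]
df = pd.concat(frames, ignore_index=True)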

Reading in a list of files into a list of DataFrames

I'm trying to read a list of files into a list of Pandas DataFrames in Python. However, the code below doesn't work.
files = [file1, file2, file3]

df1 = pd.DataFrame()
df2 = pd.DataFrame()
df3 = pd.DataFrame()
dfs = [df1, df2, df3]

# Read in data files
for file, df in zip(files, dfs):
    if file_exists(file):
        with open(file, 'rb') as in_file:
            df = pd.read_csv(in_file, low_memory=False)
            print df  # the file is getting read properly

print df1  # empty
print df2  # empty
print df3  # empty
How do I get the original DataFrames to update if I pass them into a for-loop as a list of DataFrames?
Try this:

dfs = [pd.read_csv(f, low_memory=False) for f in files]

If you want to check whether a file exists:

import os

dfs = [pd.read_csv(f, low_memory=False) for f in files if os.path.isfile(f)]

And if you want to concatenate all of them into one data frame:

df = pd.concat([pd.read_csv(f, low_memory=False)
                for f in files if os.path.isfile(f)],
               ignore_index=True)
When you iterate that way, you are only rebinding the loop variable df on each pass; you never operate on the list itself.
You need to assign the elements back into the list (or append them). One possibility:
files = [file1, file2, file3]
dfs = [None] * 3  # just a placeholder

# Read in data files
for i, file in enumerate(files):  # enumeration instead of zip
    if file_exists(file):
        with open(file, 'rb') as in_file:
            dfs[i] = pd.read_csv(in_file, low_memory=False)  # setting the list element
            print dfs[i]  # the file is getting read properly

This updates the list elements and should work.
Your code seems overcomplicated; you can just do:

files = [file1, file2, file3]
dfs = []

# Read in data files
for file in files:
    if file_exists(file):
        dfs.append(pd.read_csv(file, low_memory=False))

You will end up with a list of dfs, as desired.
You can try list comprehension:
files = [file1, file2, file3]
dfs = [pd.read_csv(x, low_memory=False) for x in files if file_exists(x)]
Here is a custom-written Python function that handles both CSV and JSON files.

def generate_list_of_dfs(incoming_files):
    """
    Accepts a list of csv and json file/path names.
    Returns a list of DataFrames.
    """
    outgoing_files = []
    for filename in incoming_files:
        file_extension = filename.split('.')[-1]  # last suffix, so dotted paths still work
        if file_extension == 'json':
            with open(filename, mode='r') as incoming_file:
                outgoing_json = pd.DataFrame(json.load(incoming_file))
                outgoing_files.append(outgoing_json)
        if file_extension == 'csv':
            outgoing_csv = pd.read_csv(filename)
            outgoing_files.append(outgoing_csv)
    return outgoing_files
How to Call this Function
import pandas as pd
import json
files_to_be_read = ['filename1.json', 'filename2.csv', 'filename3.json', 'filename4.csv']
dataframes_list = generate_list_of_dfs(files_to_be_read)
Here is a simple solution that avoids using a list to hold all the data frames, if you don't need them in a list.
import fnmatch
import os

# get the CSV files only
files = fnmatch.filter(os.listdir('.'), '*.csv')
files
The output is now a list of the names:
['Feedback Form Submissions 1.21-1.25.22.csv',
'Feedback Form Submissions 1.21.22.csv',
'Feedback Form Submissions 1.25-1.31.22.csv']
Now create a simple list of new names to make working with them easier:
# use a simple format
names = []
for i in range(0, len(files)):
    names.append('data' + str(i))
names
['data0', 'data1', 'data2']
You can use any list of names that you want. The next step walks the file names and the list of new names together and assigns each file's dataframe to its name.
# i is the index into the list of names
i = 0
# iterate through the file names
for file in files:
    # load the file into a dataframe
    df = pd.read_csv(file, low_memory=False)
    # get the matching name from the list; this is a string
    new_name = names[i]
    # bind a copy of the dataframe to a variable with that name
    locals()[new_name] = df.copy()
    # move to the next name
    i = i + 1
You now have 3 separate dataframes named data0, data1, and data2, and can run commands like
data2.info()
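If you would rather not inject variables through locals(), a dictionary keyed by the same names is a common alternative; a minimal sketch reusing the files and names lists built above:

data = {name: pd.read_csv(file, low_memory=False)
        for name, file in zip(names, files)}

data['data2'].info()  # same frame as data2.info() above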
