Delete CSV file if missing specific column using Python

Currently my code looks at CSV files in a folder and replaces strings if the file has a column named 'PROD_NAME' in the data. If a file doesn't have a 'PROD_NAME' column, I'm trying to delete it from the folder. With a little debugging I can get my code to print which CSV files do not have the column, but I can't figure out how to actually delete or remove them from the folder they are in. I have tried an if statement that calls os.remove(), and still nothing happens. No errors or anything; the script just finishes with all the files still in the folder. Here is my code. Any help is appreciated. Thanks!
def worker():
    filenames = glob.glob(dest_dir + '\\*.csv')
    print("Finding all files with column PROD_NAME")
    time.sleep(3)
    print("Changing names of products in these tables...")
    for filename in filenames:
        my_file = Path(os.path.join(dest_dir, filename))
        try:
            with open(filename):
                # read data
                df1 = pd.read_csv(filename, skiprows=1, encoding='ISO-8859-1')  # read column header only - to get the list of columns
                dtypes = {}
                for col in df1.columns:  # make all columns text, to avoid formatting errors
                    dtypes[col] = 'str'
                df1 = pd.read_csv(filename, dtype=dtypes, skiprows=1, encoding='ISO-8859-1')
                if 'PROD_NAME' not in df1.columns:
                    os.remove(filename)
                # Replaces text in files
                if 'PROD_NAME' in df1.columns:
                    df1 = df1.replace("NABVCI", "CLEAR_BV")
                    df1 = df1.replace("NAMVCI", "CLEAR_MV")
                    df1 = df1.replace("NA_NRF", "FA_GUAR")
                    df1 = df1.replace("N_FPFA", "FA_FLEX")
                    df1 = df1.replace("NAMRFT", "FA_SECURE_MVA")
                    df1 = df1.replace("NA_RFT", "FA_SECURE")
                    df1 = df1.replace("NSPFA7", "FA_PREFERRED")
                    df1 = df1.replace("N_ENHA", "FA_ENHANCE")
                    df1 = df1.replace("N_FPRA", "FA_FLEX_RETIRE")
                    df1 = df1.replace("N_SELF", "FA_SELECT")
                    df1 = df1.replace("N_SFAA", "FA_ADVANTAGE")
                    df1 = df1.replace("N_SPD1", "FA_SPD1")
                    df1 = df1.replace("N_SPD2", "FA_SPD2")
                    df1 = df1.replace("N_SPFA", "FA_LIFESTAGES")
                    df1 = df1.replace("N_SPPF", "FA_PLUS")
                    df1 = df1.replace("N__CFA", "FA_CHOICE")
                    df1 = df1.replace("N__OFA", "FA_OPTIMAL")
                    df1 = df1.replace("N_SCNI", "FA_SCNI")
                    df1 = df1.replace("NASCI_", "FA_SCI")
                    df1 = df1.replace("NASSCA", "FA_SSC")
                    df1.to_csv(filename, index=False, quotechar="'")
        except:
            if 'PROD_NAME' in df1.columns:
                print("Could not find string to replace in this file: " + filename)

worker()

Written below is a block of code that reads the raw CSV data. It extracts the first row (containing the column names) and looks for the column name PROD_NAME. If it finds it, it sets found to True; otherwise it sets found to False. To avoid trying to delete the file while it is still open, the removal is done outside of the open().
import os

filename = "test.csv"
with open(filename) as f:  # any code executed here runs while the file is open
    # read just the header line; strip() removes the trailing newline so the
    # last column name can match too
    if "PROD_NAME" in f.readline().strip().split(","):  # replace "PROD_NAME" with the string you are looking for
        print("found")
        found = True
    else:
        print("not found")
        found = False

if not found:
    os.remove(filename)
else:
    pass  # carry out replacements here / load it in pandas
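For completeness, here is a minimal sketch of how this check could drive the question's whole delete-or-replace loop. It assumes dest_dir and the replacement table from the question, and that the header really is the first line of each file (the question's skiprows=1 would skip it):

import glob
import os

import pandas as pd

# hypothetical two-entry mapping; extend with the full list from the question
replacements = {"NABVCI": "CLEAR_BV", "NAMVCI": "CLEAR_MV"}

for filename in glob.glob(os.path.join(dest_dir, "*.csv")):
    with open(filename, encoding="ISO-8859-1") as f:
        has_column = "PROD_NAME" in f.readline().strip().split(",")
    if not has_column:
        os.remove(filename)  # the file is already closed here, so Windows allows the delete
        continue
    df = pd.read_csv(filename, dtype=str, encoding="ISO-8859-1")
    df = df.replace(replacements)  # dict form replaces whole cell values, like the chained .replace() calls
    df.to_csv(filename, index=False)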

Related

How to create a data frame from different Text files

I was working on a project where I had to scrape some text files from a source. I completed this task and now have 140 text files.
Here is one of the text files I scraped.
I am trying to create a dataframe with one row for each text file, so I wrote the code below:
import pandas as pd
import os

txtfolder = r'/home/spx072/Black_coffer_assignment/'  # change to your folder path

# find the text files
textfiles = []
for root, folder, files in os.walk(txtfolder):
    for file in files:
        if file.endswith('.txt'):
            fullname = os.path.join(root, file)
            textfiles.append(fullname)
# textfiles.sort()  # sort the filenames

# read each of them into a dataframe
for filenum, file in enumerate(textfiles, 1):
    if filenum == 1:
        df = pd.read_csv(file, names=['data'], sep='delimiter', header=None)
        df['Samplename'] = os.path.basename(file)
    else:
        tempdf = pd.read_csv(file, names=['data'], sep='delimiter', header=None)
        tempdf['Samplename'] = os.path.basename(file)
        df = pd.concat([df, tempdf], ignore_index=True)
df = df[['Samplename', 'data']]
The code runs fine, but in the resulting dataframe each file is split across several rows, one row per line of the file. I want each text file to sit in a single row, like:
1.txt should be in df['data'][0],
2.txt should be in df['data'][1], and so on.
I tried different approaches and checked several questions, but I am still unable to get the desired result. Can anyone help?
I'm not sure why you need pd.read_csv() for this. Try it with pure Python:
result = pd.DataFrame(columns=['Samplename', 'data'])
for file in textfiles:
    with open(file) as f:
        data = f.read()
    result = pd.concat([result, pd.DataFrame({'Samplename': file, 'data': data}, index=[0])], axis=0, ignore_index=True)
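Since pd.concat() copies the whole accumulated frame on every iteration, it can be worth collecting the rows in a list first and building the frame once. A minimal sketch of the same idea, assuming the textfiles list from the question:

import os

import pandas as pd

rows = []
for file in textfiles:
    with open(file) as f:
        rows.append({'Samplename': os.path.basename(file), 'data': f.read()})

# build the frame in one go: one row per text file
result = pd.DataFrame(rows, columns=['Samplename', 'data'])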

Column appended to dataframe coming up empty

I have the following code:
import glob
import pandas as pd
import os
import csv

myList = []
path = "/home/reallymemorable/Documents/git/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports_us/*.csv"
for fname in glob.glob(path):
    df = pd.read_csv(fname)
    row = df.loc[df['Province_State'] == 'Pennsylvania']
    dateFromFilename = os.path.basename(fname).replace('.csv', '')
    fileDate = pd.DataFrame({'Date': [dateFromFilename]})
    myList.append(row.join(fileDate))

concatList = pd.concat(myList, sort=True)
print(concatList)
concatList.to_csv('/home/reallymemorable/Documents/test.csv', index=False, header=True)
It goes through a folder of CSVs and grabs a specific row and puts it all in a CSV. The files themselves have names like 10-10-2020.csv. I have some code in there that gets the filename and removes the file extension, so I am left with the date alone.
I am trying to add another column called "Date" that contains the filename for each file.
The script almost works: it gives me a CSV of all the rows I pulled out of the various CSVs, but the Date column itself is empty.
If I do print(dateFromFilename), the date/filename prints as expected (e.g. 10-10-2020).
What am I doing wrong?
I believe join uses how='left' by default, and it aligns on the index. Your fileDate dataframe has a different index than row, so the join finds no matching rows and you don't get the date. Instead, do an assignment:
for fname in glob.glob(path):
    df = pd.read_csv(fname)
    row = df.loc[df['Province_State'] == 'Pennsylvania']
    dateFromFilename = os.path.basename(fname).replace('.csv', '')
    myList.append(row.assign(Date=dateFromFilename))

concatList = pd.concat(myList, sort=True)
Another way is to store the dataframes as a dictionary, then concat:
myList = dict()
for fname in glob.glob(path):
    df = pd.read_csv(fname)
    row = df.loc[df['Province_State'] == 'Pennsylvania']
    dateFromFilename = os.path.basename(fname).replace('.csv', '')
    myList[dateFromFilename] = row

concatList = pd.concat(myList, sort=True)
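One note on the dict approach: the dict keys end up as the outer level of the index, not as a column. If you still want an actual Date column, a small follow-up (assuming the myList dict from above) could be:

concatList = pd.concat(myList, sort=True)                 # dict keys become the outer index level
concatList.index = concatList.index.set_names('Date', level=0)
concatList = concatList.reset_index(level='Date')         # turn that level into a regular column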

Reading Columns without headers

I have some code that reads all the CSV files in a certain folder and concatenates them into one Excel file. This code works as long as the CSVs have headers, but I'm wondering how to alter my code if my CSVs didn't have any headers.
Here is what works:
path = r'C:\Users\Desktop\workspace\folder'
all_files = glob.glob(path + "/*.csv")

li = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    df = df[~df['Ran'].isin(['Active'])]
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)
frame.drop_duplicates(subset=None, inplace=True)
What this is doing is deleting any row in my CSVs with the word "Active" under the "Ran" column. But if I didn't have a "Ran" header for this column, is there another way to read that column and do the same thing?
Thanks in advance!
df = df[~df['Ran'].isin(['Active'])]
Instead of selecting the column by name, select it by index. If the 'Ran' column is the third column in the CSV, use...
df = df[~df.iloc[:,2].isin(['Active'])]
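Note that for a headerless file the read itself also needs header=None, otherwise the first data row is consumed as column names. A minimal sketch, assuming 'Ran' is the third column:

import pandas as pd

df = pd.read_csv(filename, index_col=None, header=None)  # auto-numbered columns 0, 1, 2, ...
df = df[~df.iloc[:, 2].isin(['Active'])]                 # column 2 is the headerless 'Ran' column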
If some of your files have headers and some don't then you probably should look at the first line of each file before you make a DataFrame with it.
for filename in all_files:
    with open(filename) as f:
        first = next(f).strip().split(',')  # strip the newline so the last header can match
    if first == ['my', 'list', 'of', 'headers']:
        header = 0
        names = None
    else:
        header = None
        names = ['my', 'list', 'of', 'headers']
    df = pd.read_csv(filename, index_col=None, header=header, names=names)
    df = df[~df['Ran'].isin(['Active'])]
If I understood your question correctly ...
If the header is missing but you know the data format, you can pass the desired column labels as a list, such as ['id', 'thing1', 'ran', 'other_stuff'], via the names parameter of read_csv.
Per the pandas docs:
names : array-like, optional
List of column names to use. If the file contains a header row, then you should explicitly pass header=0 to override the column names. Duplicates in this list are not allowed.
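As a concrete illustration (the file name here is a placeholder, and the labels are just the hypothetical list from above):

import pandas as pd

# headerless file: supply the labels yourself
df = pd.read_csv('data.csv', header=None, names=['id', 'thing1', 'ran', 'other_stuff'])

# file that does have a header row you want to override
df = pd.read_csv('data.csv', header=0, names=['id', 'thing1', 'ran', 'other_stuff'])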

Python adding to Pandas dataframe replaces info

I am scanning a directory of text files and adding them to a Pandas dataframe:
text_path = "/home/tdun0002/stash/cloud_scripts/aws_scripts/output_files/memory_stats/text/"
filelist = os.listdir(text_path)
final_df = pd.DataFrame()
for filename in filelist:
    my_file = text_path + filename
    try:
        df = pd.read_csv(my_file, delim_whitespace=True, header=None)
        final_df = final_df.append(df)
        pd.options.display.max_rows
        print(f"\n***Full Data Frame: {df}\n***")
    except Exception as e:  # handler missing from the posted snippet; minimal version so the code runs
        print(e)
Each file in the directory holds the memory of a server:
bastion001-memory.txt
permissions001-memory.txt
haproxy001-memory.txt
The contents of the files look something like this:
cat haproxy001-memory.txt
7706172
On each pass of adding the file, it reports this:
Data Frame: Empty DataFrame
Columns: [7706172]
Index: []
And when I print out the full data frame it only has the last entry:
***Full Data Frame:
Empty DataFrame
Columns: [7706172]
Index: []
***
Why is it reporting that the dataframe is empty? Why is it only showing the last file that was input? I think I may need to append the data.
Two things:
You need to pass header=None in the pd.read_csv call so the value in the text file is treated as data; by default, pandas assumes the first row is the header.
Since you are reading multiple files, you need to append each dataframe to an accumulated one; at the moment the final print shows df, which is overwritten on each iteration, so you only ever see the last file.
The code should look like this:
text_path = "/home/tdun0002/stash/cloud_scripts/aws_scripts/output_files/memory_stats/text/"
filelist = os.listdir(text_path)
final_df = pd.DataFrame()
for filename in filelist:
    my_file = text_path + filename
    try:
        df = pd.read_csv(my_file, delim_whitespace=True, header=None)
        final_df = final_df.append(df)
        print(f"Data Frame: {df}")
    except Exception as e:  # handler missing from the posted snippet; minimal version so the code runs
        print(e)

pd.options.display.max_rows
print(f"\n***Full Data Frame: {final_df}\n***")  # print the accumulated frame, not df

Elegant way to read multiple files but perform summary on one in python

I have multiple files as shown below. My task is to read all those files, merge them, and create one final dataframe. However, one file (Measurement_table_sep_13th.csv) is too large to merge directly, so it has to be summarized first and then merged.
filenames = sorted(glob.glob('*.csv'))
filenames  # lists the CSV files in the folder

for f in filenames:
    print(f)
    if f == 'Measurement_table_sep_13th.csv':
        df = spark.read.csv(f, sep=",", inferSchema=True, header=True)
        df = (df.groupby("person_id", "visit_occurrence_id")
                .pivot("measurement_concept_id")
                .agg(F.mean(F.col("value_as_number")), F.min(F.col("value_as_number")),
                     F.max(F.col("value_as_number")), F.count(F.col("value_as_number")),
                     F.stddev(F.col("value_as_number")),
                     F.expr('percentile_approx(value_as_number, 0.25)').alias("25_pc"),
                     F.expr('percentile_approx(value_as_number, 0.75)').alias("75_pc")))
    else:
        df = spark.read.csv(f, sep=",", inferSchema=True, header=True)
    try:
        JKeys = ['person_id', 'visit_occurrence_id'] if 'visit_occurrence_id' in df.columns else ['person_id']
        print(JKeys)
        df_final = df_final.join(df, on=JKeys, how='left')
        print("success in try")
    except:
        df_final = df
        print("success in except")
As you can see, I am summarizing the Measurement_table_sep_13th.csv file before merging, but is there a more elegant and efficient way to write this?
If you do not want to save the one file in a different folder, you can also exclude it directly with glob (following this post: glob exclude pattern):
files = glob.glob('files_path/[!_]*')
You can use this to glob all the files except your measurement file and then join the result, which lets you avoid the long if-block. It would look like this (following this post: Loading multiple csv files of a folder into one dataframe):
files = glob.glob("[!M]*.csv")
dfs = [pd.read_csv(f, header=True, sep=";", inferShema=True) for f in files]
df2 = pd.concat(dfs,ignore_index=True)
df = spark.read.csv(f, sep=",",inferSchema=True, header=True)
df = df.groupby("person_id","visit_occurrence_id").pivot("measurement_concept_id").agg(F.mean(F.col("value_as_number")), F.min(F.col("value_as_number")), F.max(F.col("value_as_number")),
F.count(F.col("value_as_number")),F.stddev(F.col("value_as_number")),
F.expr('percentile_approx(value_as_number, 0.25)').alias("25_pc"),
F.expr('percentile_approx(value_as_number, 0.75)').alias("75_pc"))
JKeys = ['person_id', 'visit_occurrence_id'] if 'visit_occurrence_id' in df.columns else ['person_id']
df_final = df(df2, on=JKeys, how='left')
