How to create a data frame from different Text files - python

I was working on a project where I had to scrape some text files from a source. I completed this task and now have 140 text files.
This is one of the text file I have scraped.
I am trying to create a dataframe where I should have one row for each text file. So I wrote the below code:-
import pandas as pd
import os
txtfolder = r'/home/spx072/Black_coffer_assignment/'  # Change to your folder path

# Walk the folder tree and keep the full path of every *.txt file found.
textfiles = [
    os.path.join(dirpath, name)
    for dirpath, _subdirs, names in os.walk(txtfolder)
    for name in names
    if name.endswith('.txt')
]
# textfiles.sort()  # Sort the filenames

# Parse each file with read_csv and stack the per-file frames, tagging every
# parsed row with the base name of the file it came from.
df = None
for txtpath in textfiles:
    parsed = pd.read_csv(txtpath, names=['data'], sep='delimiter', header=None)
    parsed['Samplename'] = os.path.basename(txtpath)
    df = parsed if df is None else pd.concat([df, parsed], ignore_index=True)
df = df[['Samplename', 'data']]  # file name column first, then the parsed text
The code runs fine, but the dataframe I am getting looks something like this:
I want that each text file should be inside a single row like:-
1.txt should be in df['data'][0],
2.txt should be in df['data'][1], and so on.
I tried different codes and also check several questions but still unable to get the desired result. Can anyone help.

I'm not sure why you need pd.read_csv() for this. Try it with pure Python:
# Read every file completely and keep one record (= one row) per file.
# The records are collected in a plain list and turned into a DataFrame once
# at the end: calling pd.concat inside the loop would copy the growing frame
# on every iteration.
records = []
for file in textfiles:
    with open(file) as f:
        records.append({'Samplename': file, 'data': f.read()})

# Passing `columns` keeps the two expected columns even when no file was found.
result = pd.DataFrame(records, columns=['Samplename', 'data'])

Related

Analyze multiple csv files in a folder and write ONLY the results to a single csv file using pandas

I can read and analyze a single csv file and add new columns to the same data frame. However, I cannot do that for multiple files in a folder and save ONLY the results to a single csv file.
I have tried like the following for a single csv file
# Load one CSV, compute three summary figures and store each of them as a
# constant column next to the original data.
df1 = pd.read_csv('file.csv')
row_total = len(df1)
unique_total = df1['column1'].nunique()
a_total = df1['column2'].value_counts()['A']
df1['Number of rows'] = row_total
df1['Number of unique data'] = unique_total
df1['Number of A type in Column2'] = a_total
df1.to_csv('df1_results.csv', index=False)
But, I need the result like the following image in a csv file for multiple files in a folder:Need result like this
You can iterate over the different csv files in your input folder, process the corresponding data and append the output dataframes to a list of dataframes:
import pandas as pd
import os
list_of_dataframes = []
path = "path to your csv files"
list_of_files = os.listdir(path)
for file in list_of_files:
    if not file.endswith('.csv'):
        continue
    # os.listdir() returns bare file names, so the folder has to be joined
    # back on; otherwise the file is looked up in the working directory.
    df1 = pd.read_csv(os.path.join(path, file))
    df1['Number of rows'] = len(df1)
    df1['Number of unique data'] = df1['column1'].nunique()
    # .get() yields 0 instead of raising KeyError when a file has no 'A' rows.
    df1['Number of A type in Column2'] = df1['column2'].value_counts().get('A', 0)
    list_of_dataframes.append(df1)
The only thing left to do is concatenate the list of dataframes into a single dataframe that you can then output to csv:
# Stack the per-file frames collected in list_of_dataframes into one frame.
df = pd.concat(list_of_dataframes)
# index=False keeps the row index out of the written file.
df.to_csv('df_results.csv' , index = False)

Delete CSV file if missing specific column using python

Currently my code looks into CSV files in a folder and replaces strings if the file has the column 'PROD_NAME' in its data. If a file doesn't have the column 'PROD_NAME', I'm trying to delete it from the folder. With a little debugging I can get my code to print which CSV files do not have the column, but I can't figure out how to actually delete or remove them from the folder they are in. I have tried an if statement that calls os.remove() and still nothing happens. No errors or anything: the script just finishes with all the files still in the folder. Here is my code. Any help is appreciated. Thanks!
# Legacy product code -> new product name; applied to every cell of a table.
_PROD_NAME_REPLACEMENTS = {
    "NABVCI": "CLEAR_BV",
    "NAMVCI": "CLEAR_MV",
    "NA_NRF": "FA_GUAR",
    "N_FPFA": "FA_FLEX",
    "NAMRFT": "FA_SECURE_MVA",
    "NA_RFT": "FA_SECURE",
    "NSPFA7": "FA_PREFERRED",
    "N_ENHA": "FA_ENHANCE",
    "N_FPRA": "FA_FLEX_RETIRE",
    "N_SELF": "FA_SELECT",
    "N_SFAA": "FA_ADVANTAGE",
    "N_SPD1": "FA_SPD1",
    "N_SPD2": "FA_SPD2",
    "N_SPFA": "FA_LIFESTAGES",
    "N_SPPF": "FA_PLUS",
    "N__CFA": "FA_CHOICE",
    "N__OFA": "FA_OPTIMAL",
    "N_SCNI": "FA_SCNI",
    "NASCI_": "FA_SCI",
    "NASSCA": "FA_SSC",
}


def worker():
    """Rename product codes in every CSV of ``dest_dir``; delete the others.

    For each ``*.csv`` file directly inside the module-level ``dest_dir``:

    * if the table has no ``PROD_NAME`` column, the file is deleted;
    * otherwise every cell equal to a legacy product code is replaced by its
      new name and the table is written back to the same path.

    The first line of each file is skipped (``skiprows=1``), so the column
    names are taken from the second line. A file that cannot be read or
    deleted is reported on stdout and the loop moves on to the next file.
    """
    filenames = glob.glob(dest_dir + '\\*.csv')
    print("Finding all files with column PROD_NAME")
    time.sleep(3)
    print("Changing names of products in these tables...")
    for filename in filenames:
        try:
            # Header-only read (nrows=0): enough to know the column names.
            # No file handle is kept open here, so the os.remove() below is
            # not blocked by our own open() as it was before.
            columns = pd.read_csv(
                filename, skiprows=1, encoding='ISO-8859-1', nrows=0
            ).columns
            if 'PROD_NAME' not in columns:
                os.remove(filename)
                continue
            # Read every column as text to avoid formatting errors.
            df1 = pd.read_csv(
                filename, dtype=str, skiprows=1, encoding='ISO-8859-1'
            )
            # Replaces text in files
            df1 = df1.replace(_PROD_NAME_REPLACEMENTS)
            df1.to_csv(filename, index=False, quotechar="'")
        except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
            # Report the failure instead of swallowing it silently.
            print("Could not process this file: " + filename + " (" + str(exc) + ")")


worker()
Written below is a block of code that reads the raw csv data. It extracts the first row of data (containing the column names) and looks for the column name PROD_NAME. If it finds it, it sets found to True. Else, it sets found to False. To prevent trying to delete the files whilst open, the removal is done outside of the open().
import os
filename = "test.csv"
with open(filename) as f:  # Any code executed in here is while the file is open
    # Only the first line is needed. readline() does not load the whole file
    # and returns '' for an empty file instead of raising IndexError.
    # strip() removes the trailing newline, which would otherwise prevent a
    # match when the wanted column is the last one in the header.
    header = [name.strip() for name in f.readline().split(",")]

found = "PROD_NAME" in header  # Replace "PROD_NAME" with the string you are looking for
print("found" if found else "not found")

# The file is closed at this point, so it can be removed safely.
if not found:
    os.remove(filename)
else:
    pass  # Carry out replacements here/load it in pandas

Extract single column from multiple CSVs and save to new CSV

I would like to read out a specific column from over 100 CSV files to create a new CSV file. The source column's header will be renamed with the filename the column is extracted from.
I can get the individual columns, but I have been unable to rename each column's header without the ".csv" extension:
import os
import pandas as pd
folder = "C:/Users/Doc/Data"
E2080 = []
# A single scandir() call inside `with` is enough; the extra call that used
# to precede it opened a directory iterator that was never used or closed.
with os.scandir(folder) as files:
    for file in files:
        df = pd.read_csv(file, index_col=None)
        # The DirEntry object itself is used as the column label here.
        dist = {file: (df['lnt_dist'])}
        E = pd.DataFrame(dist)
        E2080.append(E)
# Put the extracted columns side by side, one column per source file.
dist = pd.concat(E2080, ignore_index=False, axis=1)
dist.head()
dist.to_csv('E2080', index=False)
This is the final code that worked for me (see output 1):
E2080 = []
with os.scandir(folder) as files:
    for file in files:
        df = pd.read_csv(file, index_col=None)
        # file is an os.DirEntry; file.name is the plain file name, and the
        # part before the first dot drops the ".csv" extension.
        column_name = file.name.split('.')[0]
        # Renaming the Series gives the column its header directly.
        E2080.append(df['lnt_dist'].rename(column_name))
# Concatenate the list that was actually filled above (the previous name
# `E_28` was never defined and raised NameError).
dist = pd.concat(E2080, ignore_index=False, axis=1)
dist.to_csv('E2080.csv', index=False)
You should use file.name instead of file to get string with name.
And with string you can use .split(".") to get name without extension.
# Show each entry's file name next to the part that precedes its first dot.
for entry in os.scandir(folder):
    print(entry.name, '=>', entry.name.split(".")[0])
Or you could use pathlib.Path instead of os.scandir() to have more functions.
# Same listing with pathlib: Path.stem is the name without its last suffix.
for item in pathlib.Path('test').iterdir():
    print(item.name, '=>', item.stem)

Creating one unique file from many others saved in a folder

I've a list of csv files (approx. 100) that I'd like to include in one single csv file.
The list is found using
# NOTE(review): the original indentation of this snippet was lost; the
# placement of print(df) after the loop is an assumption.
PATH_DATA_FOLDER = 'mypath/'
# os.listdir() returns every entry of the folder, hidden files included.
list_files = os.listdir(PATH_DATA_FOLDER)
for f in list_files:
    # Read the file only to obtain its column names.
    list_columns = list(pd.read_csv(os.path.join(PATH_DATA_FOLDER, f)).columns)
    # df is rebound to a new, empty frame on every iteration.
    df = pd.DataFrame(columns=list_columns)
print(df)
Which returns the files (it is just a sample, since I have 100 and more files):
['file1.csv', 'name2.csv', 'example.csv', '.DS_Store']
This, unfortunately, includes also hidden files, that I'd like to exclude.
Each file has the same columns:
Columns: [Name, Surname, Country]
I'd like to find a way to create one unique file with all these fields, plus information of the original file (e.g., adding a new column with the file name).
I've tried with
# Fragment: `f`, `df` and PATH_DATA_FOLDER come from the code shown earlier.
df1 = pd.read_csv(os.path.join(PATH_DATA_FOLDER, f))
df1['File'] = f # file name
# NOTE(review): DataFrame.append was removed in pandas 2.0; pd.concat is the
# replacement on current versions.
df = df.append(df1)
df = df.reset_index(drop=True).drop_duplicates() # I'd like to drop duplicates in both Name and Surname
but it returns a dataframe with the last entry, so I guess the problem is in the for loop.
I hope you can provide some help.
extension = 'csv'
# glob.glob() already returns a list of the matching names.
# NOTE: the pattern is relative to the working directory, which is also where
# combined_csv.csv is written, so a second run would pick up the output too.
all_filenames = glob.glob('*.{}'.format(extension))
# combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
# drop duplicates and reset index; both calls return a new frame, so the
# result has to be assigned back or it is silently discarded
combined_csv = combined_csv.drop_duplicates().reset_index(drop=True)
# Save the combined file
combined_csv.to_csv("combined_csv.csv", index=False, encoding='utf-8-sig')
Have you tried using glob?
filenames = glob.glob("mypath/*.csv")  # list of all your csv files.
# Read everything first and concatenate once: DataFrame.append no longer
# exists in pandas 2.x, and appending inside a loop re-copied the data on
# every iteration.
frames = [pd.read_csv(filename) for filename in filenames]
if frames:
    df = pd.concat(frames)
else:
    # No file matched: keep the empty frame with the expected columns.
    df = pd.DataFrame(columns=["Name", "Surname", "Country"])
df = df.drop_duplicates().reset_index(drop=True)
Another way would be concatenating the csv files using the cat command after removing the headers and then read the concatenated csv file using pd.read_csv.

Pandas - Trying to store multiple .txt files in a .csv

I have a folder with about 500 .txt files. I would like to store the content in a csv file, with 2 columns, column 1 being the name of the file and column 2 being the file content in string. So I'd end up with a CSV file with 501 rows.
I've snooped around SO and tried to find similar questions, and came up with the following code:
import pandas as pd
from pandas.io.common import EmptyDataError
import os
def Aggregate_txt_csv(path):
    # Question code. NOTE(review): the original indentation was lost; placing
    # the return after the loop is an assumption.
    for files in os.listdir(path):
        # os.listdir() yields bare names, so open() resolves them against the
        # working directory rather than against `path`.
        with open(files, 'r') as file:
            try:
                df = pd.read_csv(file, header=None, delim_whitespace=True)
            except EmptyDataError:
                # An empty file is replaced by an empty frame.
                df = pd.DataFrame()
    # df is rebound on every iteration, so only the frame of the last file
    # listed is written.
    return df.to_csv('file.csv', index=False)
However it returns an empty .csv file. Am I doing something wrong?
There are several problems in your code. One of them is that pd.read_csv cannot open the file because you are not passing the full path to it. I think you should try starting from this code:
import os
import pandas as pd
from pandas.io.common import EmptyDataError
def Aggregate_txt_csv(path):
    """Parse every file in *path* and write the stacked result to 'file.csv'.

    Each regular file directly inside *path* is read as a whitespace-separated
    table without a header row and gets an extra ``file`` column holding its
    name. An empty file contributes a single row that carries only its name.
    Sub-directories are skipped. Files are processed in sorted name order so
    the output is deterministic.

    Args:
        path: Folder containing the text files.

    Returns:
        None. The combined table is written to 'file.csv' in the current
        working directory, without the row index.
    """
    frames = []
    for file in sorted(os.listdir(path)):
        full_path = os.path.join(path, file)
        if not os.path.isfile(full_path):
            continue
        try:
            # sep=r"\s+" is the replacement for the deprecated
            # delim_whitespace=True.
            d = pd.read_csv(full_path, header=None, sep=r"\s+")
            d["file"] = file
        except pd.errors.EmptyDataError:
            # pd.errors is the public home of this exception, so the legacy
            # pandas.io.common import is not needed here.
            d = pd.DataFrame({"file": [file]})
        frames.append(d)

    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        # pd.concat([]) raises ValueError; write a header-only file instead.
        df = pd.DataFrame(columns=["file"])
    df.to_csv('file.csv', index=False)
Use pathlib
Path.glob() to find all the files
When using path objects, file.stem returns the file name from the path.
Use pandas.concat to combine the dataframes in df_list
from pathlib import Path
import pandas as pd
p = Path('e:/PythonProjects/stack_overflow')  # path to files
files = p.glob('*.txt')  # get all txt files
df_list = []  # create an empty list for the dataframes
for file in files:  # iterate through each file
    with file.open('r') as f:
        # strip every line and join them into a single string separated with \n
        text = '\n'.join(line.strip() for line in f)
    # one single-row dataframe per file
    df_list.append(pd.DataFrame({'filename': [file.stem], 'contents': [text]}))

if df_list:
    # ignore_index gives the rows a running index instead of all zeros
    df_all = pd.concat(df_list, ignore_index=True)
else:
    # pd.concat([]) raises ValueError; keep an empty frame with both columns
    df_all = pd.DataFrame(columns=['filename', 'contents'])

# Save as .csv: the previous name 'files.txt' matched the '*.txt' input
# pattern, so the output could be read back in as input on a later run.
df_all.to_csv('files.csv', index=False)  # save to csv
I noticed there's already an answer, but I've gotten it to work with a relatively simple piece of code. I've only edited the file read-in a little bit, and the dataframe is outputting successfully.
Link here
import pandas as pd
from pandas.io.common import EmptyDataError
import os
def Aggregate_txt_csv(path):
    """Return a DataFrame with one row per regular file found in *path*.

    Args:
        path: Folder whose files are read. Sub-directories are skipped.

    Returns:
        A DataFrame with the columns ``title`` (the file name) and ``body``
        (the complete text of the file; ``''`` for an empty file). Bytes that
        cannot be decoded are replaced rather than raising an error.
    """
    result = []
    for files in os.listdir(path):
        fullpath = os.path.join(path, files)
        if not os.path.isfile(fullpath):
            continue
        with open(fullpath, 'r', errors='replace') as file:
            # file.read() returns the text unchanged. The previous
            # '\n'.join(file.readlines()) doubled every line break, because
            # readlines() keeps the trailing '\n' of each line.
            result.append({'title': files, 'body': file.read()})
    # A list of dicts is converted in one go, which avoids repeated
    # concatenation; `columns` keeps both columns for an empty folder.
    return pd.DataFrame(result, columns=['title', 'body'])
# Build the table from the folder 'files' (relative to the working directory).
df = Aggregate_txt_csv('files')
print(df)
# No index=False here, so the row index is written as the first column.
df.to_csv('result.csv')
Most importantly here, I am appending to an array so as not to run pandas' concatenate function too much, as that would be pretty bad for performance. Additionally, reading in the file should not need read_csv, as there isn't a set format for the file. So using '\n'.join(file.readlines()) allows you to read in the file plainly and take out all lines into a string.
At the end, I convert the array of dictionaries into a final dataframe, and it returns the result.
EDIT: for paths that aren't the current directory, I updated it to append the path so that it could find the necessary files, apologies for the confusion

Categories