I am trying to write a code which takes all the .csv files in a directory, which are semi colon delimited, and formats the .csv file into columns. This is my code:
import pandas as pd
import glob
path = r'C:...'
csv_file = path + "\*.csv"
allFiles = glob.glob(path + "\*.csv")
for file in allFiles:
dataframe = pd.read_csv(file, delimiter=';')
dataframe.to_csv(file, encoding='utf-8', index=False)
I have tested the dataframe = part of this code, it works as desired for one .csv file, but I cannot get this to repeat for all files in the folder. Any ideas? Thanks.
If all you want to do is change ; to , in the files, something like this would work:
for root, dirs, files in os.walk("/dirname/"):
csv_files = [ff for ff in files if ff.endswith('.csv')]
for f in csv_files:
with open(f) as tf:
s = f.read()
with open(f, "w") as tf:
f.write(s.replace(";", ","))
You can use pandas and do something like this:
import pandas as pd
df1 = pd.read_csv("csv_semicolon.csv", delimiter=";")
df1.to_csv("csv_tab.csv", sep="\t", index=False)
Related
I am trying to read data from multiple xls files and write it to one single file.
My code below is writing only the first file. Not sure what I am missing.
import glob import os import pandas as pd
def list_files(dir):
r = []
for root, dirs, files in os.walk(dir):
for name in files:
r.append(os.path.join(root, name))
return r
files = list_files("C:\\Users\\12345\\BOFS")
for file in files:
df = pd.read_excel(file)
new_header = df.iloc[1]
df = df[2:]
df.columns = new_header
with pd.ExcelWriter("C:\\Users\\12345\\Test\\Test.xls", mode='a') as writer:
df.to_excel(writer,index=False, header=True,)
Documentation says:
ExcelWriter can also be used to append to an existing Excel file:
with pd.ExcelWriter('output.xlsx',
mode='a') as writer:
df.to_excel(writer, sheet_name='Sheet_name_3')
And that probably replaces given sheet
But you could use pd.concat(<dataframes>) to concatenate dataframes and write all data at once in a single sheet.
I tested this piece of code, hopefully its work in your case.
import glob, os
os.chdir("D:/Data Science/stackoverflow")
for file in glob.glob("*.xlsx"):
df = pd.read_excel(file)
all_data = all_data.append(df,ignore_index=True)
# now save the data frame
writer = pd.ExcelWriter('output.xlsx')
all_data.to_excel(writer,'sheet1')
writer.save()
I have imported a few thousand txt files from a folder into pandas dataframe. Is there any way I can create a column adding a sub-string from the filenames of the imported txt files in it? This is to identify each text file in the dataframe by a unique name.
Text files are named as 1001example.txt, 1002example.txt, 1003example.txt and son on. I want something like this:
filename text
1001 this is an example text
1002 this is another example text
1003 this is the last example text
....
The code I have used to import the data is below. However, I do not know how to create a column by a sub-string of filenames. Any help would be appreciated. Thanks.
import glob
import os
import pandas as pd
file_list = glob.glob(os.path.join(os.getcwd(), "K:\\text_all", "*.txt"))
corpus = []
for file_path in file_list:
with open(file_path, encoding="latin-1") as f_input:
corpus.append(f_input.read())
df = pd.DataFrame({'text':corpus})
This should work. It takes numbers from file name.
import glob
import os
import pandas as pd
file_list = glob.glob(os.path.join(os.getcwd(), "K:\\text_all", "*.txt"))
corpus = []
files = []
for file_path in file_list:
with open(file_path, encoding="latin-1") as f_input:
corpus.append(f_input.read())
files.append(''.join([n for n in os.path.basename(file_path) if n.isdigit()]))
df = pd.DataFrame({'file':files, 'text':corpus})
There is a one-liner:
df = pd.concat([pd.read_csv(f, encoding='latin-1').
assign(Filename=os.path.basename(f)) for f in glob.glob('K:\\text_all*.txt')])
df['Filename'] = df['Filename'].str.extract('(\d+)').astype(int)
I have a folder with about 500 .txt files. I would like to store the content in a csv file, with 2 columns, column 1 being the name of the file and column 2 being the file content in string. So I'd end up with a CSV file with 501 rows.
I've snooped around SO and tried to find similar questions, and came up with the following code:
import pandas as pd
from pandas.io.common import EmptyDataError
import os
def Aggregate_txt_csv(path):
for files in os.listdir(path):
with open(files, 'r') as file:
try:
df = pd.read_csv(file, header=None, delim_whitespace=True)
except EmptyDataError:
df = pd.DataFrame()
return df.to_csv('file.csv', index=False)
However it returns an empty .csv file. Am I doing something wrong?
There are several problems on your code. One of them is that pd.read_csv is not opening file because you're not passing the path to the given file. I think you should try to play from this code
import os
import pandas as pd
from pandas.io.common import EmptyDataError
def Aggregate_txt_csv(path):
files = os.listdir(path)
df = []
for file in files:
try:
d = pd.read_csv(os.path.join(path, file), header=None, delim_whitespace=True)
d["file"] = file
except EmptyDataError:
d = pd.DataFrame({"file":[file]})
df.append(d)
df = pd.concat(df, ignore_index=True)
df.to_csv('file.csv', index=False)
Use pathlib
Path.glob() to find all the files
When using path objects, file.stem returns the file name from the path.
Use pandas.concat to combine the dataframes in df_list
from pathlib import Path
import pandas as pd
p = Path('e:/PythonProjects/stack_overflow') # path to files
files = p.glob('*.txt') # get all txt files
df_list = list() # create an empty list for the dataframes
for file in files: # iterate through each file
with file.open('r') as f:
text = '\n'.join([line.strip() for line in f.readlines()]) # join all rows in list as a single string separated with \n
df_list.append(pd.DataFrame({'filename': [file.stem], 'contents': [text]})) # create and append a dataframe
df_all = pd.concat(df_list) # concat all the dataframes
df_all.to_csv('files.txt', index=False) # save to csv
I noticed there's already an answer, but I've gotten it to work with a relatively simple piece of code. I've only edited the file read-in a little bit, and the dataframe is outputting successfully.
Link here
import pandas as pd
from pandas.io.common import EmptyDataError
import os
def Aggregate_txt_csv(path):
result = []
print(os.listdir(path))
for files in os.listdir(path):
fullpath = os.path.join(path, files)
if not os.path.isfile(fullpath):
continue
with open(fullpath, 'r', errors='replace') as file:
try:
content = '\n'.join(file.readlines())
result.append({'title': files, 'body': content})
except EmptyDataError:
result.append({'title': files, 'body': None})
df = pd.DataFrame(result)
return df
df = Aggregate_txt_csv('files')
print(df)
df.to_csv('result.csv')
Most importantly here, I am appending to an array so as not to run pandas' concatenate function too much, as that would be pretty bad for performance. Additionally, reading in the file should not need read_csv, as there isn't a set format for the file. So using '\n'.join(file.readlines()) allows you to read in the file plainly and take out all lines into a string.
At the end, I convert the array of dictionaries into a final dataframe, and it returns the result.
EDIT: for paths that aren't the current directory, I updated it to append the path so that it could find the necessary files, apologies for the confusion
I am trying to convert 200 text files into csv files. I am using below code I am able to run it but it does not produce csv files. Could anyone tell any easy and fast way to do? Many Thanks
dirpath = 'C:\Files\Code\Analysis\Input\qobs_RR1\\'
output = 'C:\Files\Code\Analysis\output\qobs_CSV.csv'
csvout = pd.DataFrame()
files = os.listdir(dirpath)
for filename in files:
data = pd.read_csv(filename, sep=':', index_col=0, header=None)
csvout = csvout.append(data)
csvout.to_csv(output)
The problem is that your os.listdir gives you the list of filenames inside dirpath, not the full path to these files. You can get the full path by prepending the dirpath to filenames with os.path.join function.
import os
import pandas as pd
dirpath = 'C:\Files\Code\Analysis\Input\qobs_RR1\\'
output = 'C:\Files\Code\Analysis\output\qobs_CSV.csv'
csvout_lst = []
files = [os.path.join(dirpath, fname) for fname in os.listdir(dirpath)]
for filename in sorted(files):
data = pd.read_csv(filename, sep=':', index_col=0, header=None)
csvout_lst.append(data)
pd.concat(csvout_lst).to_csv(output)
Edit: this can be done with a one-liner:
pd.concat(
pd.read_csv(os.path.join(dirpath, fname), sep=':', index_col=0, header=None)
for fname in sorted(os.listdir(dirpath))
).to_csv(output)
Edit 2: updated the answer, so the list of files is sorted alphabetically.
I have multiple .txt files in a directory and I want to merge them into one by importing in python. The catch here is that after the merge I want to convert it into one csv file on which the whole program is based.
So far I only had to input one .txt file and converted it into csv file by this code:
import io
bytes = open('XYZ.txt', 'rb').read()
df=pd.read_csv(io.StringIO(bytes.decode('utf-8')), sep='\t', parse_dates=['Time'] )
df.head()
Now I need to input multiple .txt files, merge them and then convert them into csv files. Any workaround?
If the headers are same then it should be as easy as this
import os
import io
merged_df = pd.DataFrame()
for file in os.listdir("PATH_OF_DIRECTORY"):
if file.endswith(".txt"):
bytes = open(file, 'rb').read()
merged_df = merged_df.append(pd.read_csv(io.StringIO(
bytes.decode('utf-8')), sep='\t', parse_dates=['Time']))
print(len(merged_df))
import glob
path="location/of/folder"
allFiles = glob.glob(path + "\\*.txt")
list_ = []
for file in allFiles:
print(file)
df = pd.read_csv(io.StringIO(file.decode('utf-8')), sep='\t', parse_dates=['Time'])
list_.append(df)
combined_files = pd.concat(list_)