Opening multiple files in pandas - some of which are 'owner' files - python

I am trying to open multiple files with pandas into a dataframe.
Only the files with the prefix ~$ raise an error:
XLRDError: Unsupported format, or corrupt file: Expected BOF record; found b'\x15Microso'
Here are two of the file paths from my list:
'bulk_uploads /~$0730-0731.xlsx',
'bulk_uploads /0701-0702.xlsx'
The one without the prefix opens perfectly fine, and I am not sure why the other one throws an error.
Here is the code I am trying:
import pandas as pd
import glob

path = 'bulk_uploads '  # use your path
all_files = glob.glob(path + "/*.xlsx")

li = []
for filename in all_files:
    df = pd.read_excel(filename, sheet_name=1)
    df['Date'] = str(filename)[:-4]
    li.append(df)

# frame = pd.concat(li, axis=0, ignore_index=True)
Is there either a way to change the files that have this prefix so they lose it, or another way around the error?
These look like files I have previously opened (I have no files currently open).

import pandas as pd
import glob
import os
import re

path = 'bulk_uploads '  # use your path
all_files = glob.glob(path + "/*.xlsx")

li = []
special = re.compile(r'~\$')  # add more special characters if any
for filename in all_files:
    if special.search(filename):
        os.remove(filename)  # delete the leftover owner file
    else:
        df = pd.read_excel(filename, sheet_name=1)
        df['Date'] = str(filename)[:-4]
        li.append(df)
Can you give this a try and see if it works? It seems your folder contains temporary "owner" files, which Excel creates while a workbook is open and sometimes leaves behind.
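If you'd rather not delete anything, a safer variant (a sketch, reusing the same path assumption as above) simply skips the owner files, which always start with ~$:
import os
import glob
import pandas as pd

path = 'bulk_uploads '  # use your path
li = []
for filename in glob.glob(path + "/*.xlsx"):
    # Excel owner files start with "~$"; skip them instead of deleting
    if os.path.basename(filename).startswith('~$'):
        continue
    df = pd.read_excel(filename, sheet_name=1)
    df['Date'] = str(filename)[:-4]
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)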

Related

How to upload all csv files that have specific name inside filename in python

I want to concatenate all CSV files that have the specific word 'tables' in the filename.
The code below loads all CSV files without filtering for the word I want.
# importing the required modules
import glob
import pandas as pd

# specifying the path to csv files
#path = "csvfoldergfg"
path = "folder_directory"

# csv files in the path
files = glob.glob(path + "/*.csv")

# defining an empty list to store the content
content = []

# checking all the csv files in the specified path
for filename in files:
    # reading content of csv file
    df = pd.read_csv(filename, index_col=None)
    content.append(df)

# converting content to data frame
data_frame = pd.concat(content)
print(data_frame)
Example filenames are:
abcd-tables.csv
abcd-text.csv
abcd-forms.csv
defg-tables.csv
defg-text.csv
defg-forms.csv
From these example filenames, the expected output is to concatenate
abcd-tables.csv
defg-tables.csv
into a single dataframe, assuming the headers are the same.
I'd really appreciate it if you guys could solve this.
You can use:
import pandas as pd
import pathlib

path = 'folder_directory'

content = []
for filename in pathlib.Path(path).glob('*-tables.csv'):
    df = pd.read_csv(filename, index_col=None)
    content.append(df)

df = pd.concat(content, ignore_index=True)
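If you prefer to stay with glob, the same filter can be expressed directly in the pattern (a sketch assuming the same folder_directory):
import glob
import pandas as pd

path = 'folder_directory'
# the pattern itself restricts the matches to *-tables.csv
content = [pd.read_csv(f, index_col=None) for f in glob.glob(path + '/*-tables.csv')]
df = pd.concat(content, ignore_index=True)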

How to keep the top 500 rows of each csv in a loop (python) and overwrite each file

I am trying to read more than 100 CSV files in Python and keep only the TOP 500 rows of each (they each have more than 550,000 rows). So far I know how to read them, but I need to save each modified file in the loop under its own filename in CSV format. Normally I would output the concatenated dataframe to one big CSV file, but this time I need to truncate each CSV file to its top 500 rows and save each one separately.
this is the code I have had so far:
import pandas as pd
import glob

FolderName = str(input("What's the name of the folder are you comparing? "))
path = str(input('Enter full path of the folder: '))
#r'C:\Users\si\Documents\UST\AST' # use your path
all_files = glob.glob(path + "/*.csv")

#list1 = []
d = {}
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0, nrows=500)
    #list1.append(df)
    d[filename] = df.columns

#frame = pd.concat(list1, axis=0, ignore_index=True)
frame = pd.DataFrame.from_dict(d, orient='index')
output_path = r'C:\Users\si\Downloads\New\{}_header.xlsx'.format(FolderName)
frame.to_excel(output_path)
Dataframes can write CSVs as well as read them, so just read each file with nrows=500 and call to_csv with the same filename.
import pandas as pd
import glob

FolderName = str(input("What's the name of the folder are you comparing? "))
path = input('Enter full path of the folder: ')
all_files = glob.glob(path + "/*.csv")
for filename in all_files:
    # index=False keeps the round trip from adding an extra index column
    pd.read_csv(filename, index_col=None, header=0, nrows=500).to_csv(filename, index=False)
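If the files only need truncating and never need parsing, a plain-Python sketch (reusing the path variable above) avoids the CSV round trip entirely, so pandas never rewrites quoting or dtypes:
import glob
from itertools import islice

for filename in glob.glob(path + "/*.csv"):
    with open(filename, newline='') as f:
        head = list(islice(f, 501))  # header line + 500 data rows
    with open(filename, 'w', newline='') as f:
        f.writelines(head)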

How to merge multiple text files into one csv file in Python

I am trying to combine 200 text files into one CSV file. I am using the code below; it runs, but it does not produce the CSV file. Could anyone suggest an easy and fast way to do this? Many thanks.
import os
import pandas as pd

dirpath = 'C:\Files\Code\Analysis\Input\qobs_RR1\\'
output = 'C:\Files\Code\Analysis\output\qobs_CSV.csv'

csvout = pd.DataFrame()
files = os.listdir(dirpath)
for filename in files:
    data = pd.read_csv(filename, sep=':', index_col=0, header=None)
    csvout = csvout.append(data)
csvout.to_csv(output)
The problem is that your os.listdir gives you the list of filenames inside dirpath, not the full path to these files. You can get the full path by prepending the dirpath to filenames with os.path.join function.
import os
import pandas as pd

dirpath = 'C:\Files\Code\Analysis\Input\qobs_RR1\\'
output = 'C:\Files\Code\Analysis\output\qobs_CSV.csv'

csvout_lst = []
files = [os.path.join(dirpath, fname) for fname in os.listdir(dirpath)]
for filename in sorted(files):
    data = pd.read_csv(filename, sep=':', index_col=0, header=None)
    csvout_lst.append(data)

pd.concat(csvout_lst).to_csv(output)
Edit: this can be done with a one-liner:
pd.concat(
    pd.read_csv(os.path.join(dirpath, fname), sep=':', index_col=0, header=None)
    for fname in sorted(os.listdir(dirpath))
).to_csv(output)
Edit 2: updated the answer, so the list of files is sorted alphabetically.
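Equivalently, a pathlib sketch (same dirpath and output, assuming dirpath contains only the input text files) sorts and joins in one pass:
import pathlib
import pandas as pd

pd.concat(
    pd.read_csv(p, sep=':', index_col=0, header=None)
    for p in sorted(pathlib.Path(dirpath).glob('*'))
).to_csv(output)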

Pandas raising an error when setting the output directory

I get an OSError when I try to set the output directory and write a prefix in front of i, such as 'cal_' or 'edit_'. If I add a suffix, like df.to_csv(i + '_edit.csv'), the result is named "filename.csv_edit".
So the files were saved to the input directory, and I couldn't add any prefix or suffix. How can I fix this error?
import pandas as pd
import glob

PathIn = r'C:\Users\input'
PathOut = r'C:\Users\output'

filenames = glob.glob(PathIn + "/*.csv")
file_list = []
for i in filenames:
    df = pd.read_csv(i)
    file_list.append(df)
    df.columns = df.columns.str.replace('[', '')
    df.columns = df.columns.str.replace(']', '')
    df.to_csv(i + '.csv')
Try this one; it should work and has the full code you want.
import os
import pandas as pd

PathIn = r'C:\Users\input'
PathOut = r'C:\Users\output'

file_list = []
for name in os.listdir(PathIn):
    if name.endswith(".csv"):
        #print(name)
        df = pd.read_csv(os.path.join(PathIn, name))
        file_list.append(df)
        # regex=False treats '[' and ']' as literal characters, not regex syntax
        df.columns = df.columns.str.replace('[', '', regex=False)
        df.columns = df.columns.str.replace(']', '', regex=False)
        df.to_csv(os.path.join(PathOut, name))
The value of i in filenames is the absolute path of the CSV file you are reading.
So if you have 3 CSV files in your input directory, your filenames list will look like this:
['C:\Users\input\file1.csv',
'C:\Users\input\file2.csv',
'C:\Users\input\file3.csv']
Now you are trying to add a prefix in front of the elements of the above list, which does not produce a valid path.
You need to take just the filename of each input file and join it with PathOut so that a valid path results.
You can fetch the filenames in any directory as below :
filenames = []
for entry in os.listdir(PathIn):
    if os.path.isfile(os.path.join(PathIn, entry)) and ".csv" in entry:
        filenames.append(entry)
Now you can iterate over this list and perform the operations you were doing. To save the final df to a file in the output directory, join each filename with PathOut.
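Putting it together, a minimal sketch (reusing PathIn, PathOut, and the filenames list built above; the 'edit_' prefix is just an example):
import os
import pandas as pd

for entry in filenames:
    df = pd.read_csv(os.path.join(PathIn, entry))
    df.columns = df.columns.str.replace('[', '', regex=False)
    df.columns = df.columns.str.replace(']', '', regex=False)
    # join with the output directory first, then prefix only the file name
    df.to_csv(os.path.join(PathOut, 'edit_' + entry), index=False)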

How to merge 2000 CSV files saved in different subfolders within the same main folder

Hey people, I would like to merge 2000 CSV files, one from each of 2000 subfolders. Each subfolder contains three CSV files with different names, so I need to select only one CSV from each folder.
I know the code for merging a bunch of CSV files when they are all in the same folder:
import pandas as pd
import glob

path = r'Total_csvs'
all_files = glob.glob(path + "/*.csv")

li = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)
frame.to_csv('Total.csv', index=False)
But my problem with 2000 CSV files looks totally different.
The folder structure is: a main folder containing 2000 subfolders; within each subfolder there are multiple CSV files, and I need to select only one CSV file from each, finally concatenating all 2000 CSV files.
As for naming conventions: all the subfolders have different names, but the subfolder name and the name of the CSV inside it are the same.
Any suggestions or sample code (how to read 2000 CSVs from subfolders) would be helpful.
Thanks in advance.
You can loop through all the subfolders using os.listdir.
Since the CSV filename is the same as the subfolder name, simply use the subfolder name to construct the full path name.
import os
import pandas as pd

folders = os.listdir("Total_csvs")

li = []
for folder in folders:
    # The CSV has the same name as its subfolder
    selected_csv = folder
    filename = os.path.join("Total_csvs", folder, selected_csv + ".csv")
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)
frame.to_csv('Total.csv', index=False)
We can iterate over every subfolder, build expected_csv_path, and check whether it exists. If it does, we read it and add the dataframe to our li list.
Try the following:
import pandas as pd
import os

path = r'Total_csvs'

li = []
for f in os.listdir(path):
    expected_csv_path = os.path.join(path, f, f + '.csv')
    csv_exists = os.path.isfile(expected_csv_path)
    if csv_exists:
        df = pd.read_csv(expected_csv_path, index_col=None, header=0)
        li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True, sort=False)
frame.to_csv('Total.csv', index=False)
If you are using Python 3.5 or newer, you can use glob.glob in a recursive manner as follows:
import glob

path = r'Total_csvs'
all_csv = glob.glob(path + "/**/*.csv", recursive=True)
Now all_csv is a list of relative paths to every *.csv inside Total_csvs, inside the subdirectories of Total_csvs, inside the subdirectories of those subdirectories, and so on.
For example purposes, let's assume that all_csv is now:
all_csv = ['Total_csvs/abc/abc.csv', 'Total_csvs/abc/another.csv']
So we need to keep only the files whose names correspond to the directory they reside in, which can be done the following way:
import os

def check(x):
    directory, filename = x.split(os.path.sep)[-2:]
    return directory + '.csv' == filename

all_csv = [i for i in all_csv if check(i)]
print(all_csv)  # prints ['Total_csvs/abc/abc.csv']
Now all_csv is a list of paths to all the .csv files you are seeking, and you can use it the same way you used all_files in the "flat" (non-recursive) case.
You can do it without joining paths:
import pathlib
import pandas as pd

lastparent = None
for ff in pathlib.Path("Total_csvs").rglob("*.csv"):  # recursive glob
    print(ff)
    if ff.parent != lastparent:  # process only the 1st file in each dir
        lastparent = ff.parent
        df = pd.read_csv(str(ff), ...)
        # ...etc.
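And if you do want the name-matching rule combined with pathlib, a compact sketch (assuming the same Total_csvs layout described in the question) could look like:
import pathlib
import pandas as pd

base = pathlib.Path("Total_csvs")
li = [
    pd.read_csv(p, index_col=None, header=0)
    for p in sorted(base.rglob("*.csv"))
    if p.stem == p.parent.name  # keep only the CSV named after its subfolder
]
frame = pd.concat(li, axis=0, ignore_index=True)
frame.to_csv('Total.csv', index=False)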
