When ever I run i try to add a column in pandas dataframe by iterating through a directory like this
import pandas as pd
import os
import sys
########generate file path
'getcwd: ', os.getcwd()
osfile, maindir =('__file__: ', __file__)
filename = os.path.basename(sys.argv[0])
inpath = maindir.replace(filename,"Excels")
outpath = maindir.replace(filename,"BulkFile.xlsx")
#########pandas script
def add_column():
for root, dirs, files in os.walk(inpath):
print(files)
for f in files:
path = os.path.join(root, f)
excelframe = pd.read_excel(path)
excelframe['full_name'] = excelframe['first_name'] + " " + excelframe['last_name']
dataframe = [excelframe]
compactframe = pd.concat(dataframe)
compactframe.to_excel(outpath)
it does nothing, no error codes or anything, it just does nothing.
But if I don't iterate through a directory and replace that script with this
import pandas as pd
import sys
import os
#####generate file path
osfile, maindir =('__file__: ', __file__)
filename = os.path.basename(sys.argv[0])
inpath = maindir.replace(filename,"BulkFile.xlsx")
outpath = maindir.replace(filename,"newfile.xlsx")
########pandas script
excelframe = pd.read_excel(inpath)
excelframe['full_name'] = excelframe['first_name'] + " " + excelframe['last_name']
dataframe = [excelframe]
compactframe = pd.concat(dataframe)
compactframe.to_excel(outpath)
It works just fine.
Does anybody know why this is or how to iterate through a directory and add columns to a dataframe?
You have defined the function add_column but you haven't ran it. Add the line add_column() to the end of your first script to execute.
Related
This question already has answers here:
How do I check whether a file exists without exceptions?
(40 answers)
Closed 2 months ago.
Seeking for your assistance again on how I can add an Error Handling Message when the .csv file is not in the directory. How do I this? Sorry, still learning stuff in Python but thank you very much in advance.
Full code:
import os
import pandas as pd
import openpyxl
import tkinter
from tkinter import messagebox
root = tkinter.Tk()
root.withdraw()
directory = 'C:/Path1'
ext = ('.csv')
for filename in os.listdir(directory):
f = os.path.join(directory, filename)
if f.endswith(ext):
head_tail = os.path.split(f)
head_tail1 = 'C:/Path2'
k =head_tail[1]
r=k.split(".")[0]
p=head_tail1 + "/" + r + " - .csv"
mydata = pd.read_csv(f, engine='python')
# to pull columns and values
new = mydata[["A","B","C","D"]]
new = new.rename(columns={'D': 'F'})
new['F'] = 1
print(new.columns)
new["B"] = (pd.to_datetime(new["B"], format="%d-%b", errors="coerce").dt.strftime("%#m-%#d").fillna(new["B"]))
new.to_csv(p ,index=False)
#to merge columns and values
merge_columns = ['A', 'B', 'C']
merged_col = ''.join(merge_columns).replace('ABC', 'G')
new[merged_col] = new[merge_columns].astype(str).apply(lambda x: '.'.join(x), axis=1)
new.drop(merge_columns, axis=1, inplace=True)
new = new.groupby(merged_col).count().reset_index()
new.to_csv(p, index=False)
messagebox.showinfo("Done.")
os.chdir("C:/Path2")
for file in os.listdir():
if file.endswith(".xlsx"):
if os.path.exists("Master.xlsx"):
os.rename("Master.xlsx", "Old_Master.xlsx")
os.rename(file, "Master.xlsx")
I tried adding this on the script but it is still doing its function even if the file is not in the directory.
import os
import pandas as pd
import openpyxl
import tkinter
from tkinter import messagebox
root = tkinter.Tk()
root.withdraw()
directory = 'C:/Path1'
ext = ('.csv')
for filename in os.listdir(directory):
f = os.path.join(directory, filename)
if f.endswith(ext):
#added line###############
if os.path.isfile(directory):
print("File does exist at this time")
else:
print("No such file exists at this time")
head_tail = os.path.split(f)
head_tail1 = 'C:/Path2'
k =head_tail[1]
r=k.split(".")[0]
p=head_tail1 + "/" + r + " - .csv"
mydata = pd.read_csv(f, engine='python')
There is easier than what you doing:
import os
file_found = False
for file in os.listdir("C:/Folder"):
if file.endswith(".csv"):
file_found = True
break
if file_found:
print("There's .csv file in C:/Folder.")
else:
print("Error! There's no .csv file in C:/Folder.")
My initial code is here:
import pandas as pd
import os
directory_in_str = input('\n\nEnter the name of the folder you would like to use. If there are spaces, replace with underscores: ')
directory_in_str.strip()
directory = os.fsencode(directory_in_str)
user = input('\nEnter your first initial and last name as one word (ex: username): ')
user.strip()
path1 = '/Users/'
path2 = '/Desktop/DataScience/'
dspath = path1 + user + path2
slash = '/'
for file in os.listdir(directory):
filename = os.fsdecode(file)
if filename.endswith(".xls") or filename.endswith(".xlsx"):
print(directory)
pathname = dspath + directory_in_str + slash + filename
print(filename)
#Global = pd.read_excel(pathname, sheet_name=0)
Stats = pd.read_excel(pathname, sheet_name=1)
listorder = ['1', '2', '3']
Stats = Stats.reindex(columns=listorder)
Stats.to_excel(filename, sheet_name='Statistics', index=False)
continue
else:
continue
I've included the filename print statement to insure that the correct path is being used. However, the print statement happens twice.
These are the statements printed.
b'testrearrange'
Testname.xlsx
b'testrearrange'
~$Testname.xlsx
Why are the two characters '~$' added? The error originates from the line
Stats = pd.read_excel(pathname, sheet_name=1)
with the error
ValueError: File is not a recognized excel file
Does anyone know how to fix this?
I think the files starting with "~$# are temporary excel files that are created when you open the file in excel. One option is to close the file, in which case the temporary file is deleted. Other option is to change the logic by which you list the files to be read so that it ignores files that start with ~. I like to use glob for this
from glob import glob
path = "C:/Users/Wolf/[!~]*.xls*"
files = glob(path)
for file in files:
print("Do your thing here")
I have about 500 Excel files in the format: data_1, data_2 ... data_500
However, not all file are there. File like data_3 is not in the folder.
I want to import all available data into dataframe.
However, the my code below will stop when it hit a name of file not in the list, say data_3
Can you please help me to skip these record?
Thank you,
HN
for i in range(500):
filename='data_'+ str(i) + 'xlsx'
output = pd.read_excel('PATH' + filename)
THE KEY IS CHECK IN FULL PATH IN glob.glob
import glob
for i in xlx_file_list:
filename = 'Excel_Sample' + str(i) + '.xlsx' #; print(filename)
full_path = 'D:\Python...\\' + filename #; print(full_path)
if full_path not in glob.glob('D:\Python...\*'):
print(filename, ' not in folder')
continue
outfile = pd.read_excel(full_path, sheet_name='data_sheet')
print(outfile)
Hi in your sample probably PATH is a variable, not a string, 'PATH'+filename cannot work.
i suggest to use os.path.join() to compose file path, don't use string composition for this.
There are two way to solve this problem:
Generate all names and see if the file exists:
import os
for i in range(500):
filename='data_'+ str(i) + 'xlsx'
if os.path.exists(filename)
output = pd.read_excel(filename)
or generate only the correct filename list:
import glob
for filename in glob.glob('data_*.xlsx'):
output = pd.read_excel(filename)
In my path /volume1/xx/ are several files with this character A_test1.pdf, B_test2.pdf, ...I want to seperate the test1 part without path and .pdf.
Im newbie so I tried first with full name
but I got only the "*.pdf" as a text.
What is wrong with the path oder placeholder * ?
splitname = os.path.basename('/volume1/xx/*.pdf')
Edit
I got 2019-01-18_RG-Telekom[] from orign ReT_march - I want 2019-01-18_RG-Telekom_march (text after underlining) xx is a folder
here is the whole code:
#!/usr/bin/env python3
import datetime
import glob
import os
import os.path
SOURCE_PATH = '/volume1/xx'
TARGET_PATH = os.path.join(SOURCE_PATH, 'DMS')
def main():
today = datetime.date.today()
splitnames = [os.path.basename(fpath) for fpath in glob.glob("./xx/*.pdf")]
for prefix, name_part in [
('ReA', 'RG-Amazon'),
('GsA', 'GS-Amazon'),
('ReT', 'RG-Telekom'),
('NoE', 'Notiz-EDV'),
]:
filenames = glob.iglob(os.path.join(SOURCE_PATH, prefix + '*.pdf'))
for old_filename in filenames:
new_filename = os.path.join(TARGET_PATH, '{}_{}_{}.pdf'.format(today, name_part, splitnames))
os.rename(old_filename, new_filename)
if __name__ == '__main__':
main()
Use glob, os.path don't know how to process masks, but glob.glob works:
splitnames = [os.path.basename(fpath) for fpath in glob.glob("./**/*.txt")]
splitnames
Out:
['A_test1.pdf', 'B_test2.pdf']
Output of the glob:
glob.glob("./**/*.txt")
Out:
['./some_folder/A_test1.pdf', './another_folder/B_test2.pdf']
Apply os.path.basename to this list and extract basenames, as it shown above.
Edit
If xx in the path volume1/xx/ is just a folder name, not a mask, you should use following expression:
splitnames = [os.path.basename(fpath) for fpath in glob.glob("./xx/*.txt")]
because ./**/ is expression which masks a folder name and it's unnecessary that case.
I am trying to create multiple feature classes from data with .txt extension. My code runs, but only produces one .shp file. The variable xyTable when checked does contain all the file extensions. These then should individually run through both Arcpy functions and produce the relevant featureclass files named in accordance with their .txt files.
import arcpy
import os
import tempfile
import shutil
shpFileArray = []
print "\n"
arcpy.env.overwriteOutput = True
newFolder = "destinationpath"
if os.path.exists(newFolder):
tmp = tempfile.mktemp(dir=os.path.dirname(newFolder))
shutil.move(newFolder, tmp)
shutil.rmtree(tmp)
os.makedirs(newFolder)
arcpy.env.workspace = newFolder
for file in os.listdir("sourcepath"):
layerName = file[:-4]
fileSHP = layerName+".shp"
for file in os.listdir("sourcepath"):
if file.endswith(".txt"):
xyTable = (os.path.join("destinationpath", file))
arcpy.MakeXYEventLayer_management(table= xyTable, in_x_field="EastingM", in_y_field="NorthingM", out_layer="layerName",...continues...
arcpy.FeatureClassToFeatureClass_conversion(in_features="layerName", out_path="destinationpath", out_name= fileSHP,....continues....
Looks like you are not giving the FeatureClassToFeatureClass tool unique shapefile names. After the first For loop finishes, fileSHP doesn't change. Looks like you have the shpFileArray set up to hold the list of fileSHPs. Perhaps try something like this to save your set of fileSHPs in the first For loop and refer to them in the second For loop. My python might not be exactly right, but I think the idea is.
import arcpy
import os
import tempfile
import shutil
shpFileArray = []
print "\n"
arcpy.env.overwriteOutput = True
newFolder = "destinationpath"
if os.path.exists(newFolder):
tmp = tempfile.mktemp(dir=os.path.dirname(newFolder))
shutil.move(newFolder, tmp)
shutil.rmtree(tmp)
os.makedirs(newFolder)
arcpy.env.workspace = newFolder
for file in os.listdir("sourcepath"):
layerName = file[:-4]
fileSHP = layerName+".shp"
shpFileArray.append(fileSHP)
for idx, file in enumerate(os.listdir("sourcepath")):
if file.endswith(".txt"):
xyTable = (os.path.join("destinationpath", file))
outShape = shapeFileArray[idx]
arcpy.MakeXYEventLayer_management(table= xyTable, in_x_field="EastingM", in_y_field="NorthingM", out_layer="layerName",...continues...
arcpy.FeatureClassToFeatureClass_conversion(in_features="layerName", out_path="destinationpath", out_name= outShape,....continues....