Looping in python: NameError: name 'df2' is not defined - python

I am tring to make changes to the multiple excel files. I have below program to loop through multiple files. I strangle get this error NameError: name 'df2' is not defined. Can someone help me to understand what is the issue? Thank you in advance!
import os
import glob
import pandas as pd
from pathlib import Path
folder = (r"C:\Users\Documents\Extracted")
for file_name in Path(folder).glob('*.xlsx'):
df = pd.read_excel(file_name)
drop_list = ['BarrierFreeAttributes.BarrierFreeAttribute','ConsultationHours.ConsultationHoursTimeSpan', 'Location.Coordinates.Latitude_right', 'Location.Coordinates.Longitude_right']
if all(item in drop_list for item in list(df.columns)):
df2 = df.drop(columns=drop_list, axis=1)
else:
print(file_name)
df2.to_excel(file_name.with_suffix('.xlsx'),index = False)
Data:

Reason is in else is not defined df2, simpliest is use df only and overwrite it in if statement, also writing to file is removed from else statement:
for file_name in Path(folder).glob('*.xlsx'):
df = pd.read_excel(file_name)
drop_list = ['BarrierFreeAttributes.BarrierFreeAttribute','ConsultationHours.ConsultationHoursTimeSpan', 'Location.Coordinates.Latitude_right', 'Location.Coordinates.Longitude_right']
if all(item in drop_list for item in list(df.columns)):
df = df.drop(columns=drop_list, axis=1)
else:
print(file_name)
df.to_excel(file_name.with_suffix('.xlsx'),index = False)

Related

How to set a looped pd.read_excel to skip if an error returns

I have some code set up to read specific data from every single xlsx file in a folder. However, the excels use two different naming conventions for the sheet I want (for example: Titlepage and Title Page). My current code is this:
import pandas as pd
import string
import glob
import os
directory = 'file path'
files = os.listdir(directory)
list_of_dfs = []
Start of loop
os.chdir('file path')
for file in files:
df = pd.read_excel (file)
df = df.T
df = df.iloc[[1],:24]
list_of_dfs.append(df)
End of loop
data_combined = pd.concat(list_of_dfs)
data_combined.to_excel ('file path/output.xlsx', index=False)
I am thinking of having the loop above twice, once for Title Page, and once for Titlepage. However, the code will error if it cannot find the sheet name specific. Is there anyway to tell python to move to the next xlsx file if no such sheet name is found?
Edit:
I used the code below to make this work.
os.chdir("file path")
for file in files:
try:
df = pd.read_excel (file, sheet_name= 'Titlepage')
df = df.T
df = df.iloc[[1],:24]
list_of_dfs.append(df)
except:
pass
for file in files:
try:
df = pd.read_excel (file, sheet_name= 'Title Page')
df = df.T
df = df.iloc[[1],:24]
list_of_dfs.append(df)
except:
pass

Change Colume Name in dataframe and melt it

I am have a code to merge few excel together using Python, but i cant realy rename any thing in that dataframe using df.rename(). could someone explain why? Thanks!
import os
import xlrd
import pandas as pd
def file_name(file_dir):
list=[]
for file in os.listdir(file_dir):
if os.path.splitext(file)[1] == '.xlsx':
list.append(file)
return list
path = r'E:\Sync\External\Test'
wks = file_name(path)
data = []
for i in range(len(wks)):
read_xlsx = xlrd.open_workbook(path + '\\' + wks[i])
sheet1 = read_xlsx.sheets()[1]
nrow = sheet1.nrows
title = sheet1.row_values(0)
location = os.path.splitext(wks[i])[0]
for j in range(6,nrow):
a = sheet1.row_values(j)
a.insert(0,location)
print(a)
data.append(a)
content= pd.DataFrame(data)
content.rename({'0': 'X', '1': 'Y'}, axis=1, inplace=True)
#content.to_csv(path+'\\test.xlsx', sep=',', header=True, index=False)
content.to_excel(path+'\\test.xlsx', header=True, index=False)
Code as above, no error shows,but it's just doesn't work (rename part)

Open multiple Excel files to separate Pandas dataframes

Brand new to Python and could use some help importing multiple Excel files to separate Pandas dataframes. I have successfully implemented the following code, but of course it imports everything into one frame. I would like to import them into df1, df2, df3, df4, df5, etc.
Anything helps, thank you!
import pandas as pd
import glob
def get_files():
directory_path = input('Enter directory path: ')
filenames = glob.glob(directory_path + '/*.xlsx')
number_of_files = len(filenames)
df = pd.DataFrame()
for f in filenames:
data = pd.read_excel(f, 'Sheet1')
df = df.append(data)
print(df)
print(number_of_files)
get_files()
The easiest way to do that is to use a list. Each element of the list is a dataframe
def get_files():
directory_path = input('Enter directory path: ')
filenames = glob.glob(directory_path + '/*.xlsx')
number_of_files = len(filenames)
df_list = []
for f in filenames:
data = pd.read_excel(f, 'Sheet1')
df_list.append(data)
print(df_list)
print(number_of_files)
return df_list
get_files()
You can then access your dataframes with df_list[0], df_list[1]...
Just as another option by Jezrael answer here https://stackoverflow.com/a/52074347/13160821 but modified for your code.
from os.path import basename
def get_files():
directory_path = input('Enter directory path: ')
filenames = glob.glob(directory_path + '/*.xlsx')
number_of_files = len(filenames)
df_list = {basename(f) : pd.read_excel(f, 'Sheet1') for f in filenames}
print(number_of_files)
return df_list
get_files()
Which can then be accessed by the filename eg. dfs['file_name1.xlsx'] or dfs['some_file.xlsx']. You can also do things like splitext to remove the xlsx from the key or use just part of the filename.

How to merge multiple xlsx files into one single xlsx file with Different sheets

Hi I have multiple xlsx files
sales-feb-2014.xlsx
sales-jan-2014.xlsx
sales-mar-2014.xlsx
I have merged all 3 sheets into one data set using file name as INDEX[0]
script :
import pandas as pd
import numpy as np
import glob
import os
all_data = pd.DataFrame()
for f in glob.glob(r'H:\Learning\files\sales*.xlsx'):
df = pd.read_excel(f)
df['filename'] = os.path.basename(f)
df = df.reset_index().set_index('filename')
print(df)
Now Data looks like this :
file name col1 col2 col3
sales-jan-2014.xlsx .... .... ...
sales-feb-2014.xlsx .... .... ...
sales-mar-2014.xlsx .... .... ...
here I want to load new xlsx file where I need to load
sales-jan-2014.xlsx into sheet1
sales-feb-2014.xlsx into sheet2
sales-mar-2014.xlsx into sheet3
I have tried with this script :
writer = pd.ExcelWriter('output.xlsx')
for filename in df.index.get_level_values(0).unique():
temp_df = df.xs(filename, level=0)
temp_df.to_excel(writer,filename)
writer.save()
after executing this script i'm getting error :
loc, new_ax = labels.get_loc_level(key, level=level,
AttributeError: 'Index' object has no attribute 'get_loc_level'
can you please suggest where I'm missing
Try using the below code :
import os
import pandas as pd
dirpath = "C:\\Users\\Path\\TO\\Your XLS folder\\data\\"
fileNames = os.listdir(dirpath)
writer = pd.ExcelWriter(dirpath+'combined.xlsx', engine='xlsxwriter')
for fname in fileNames:
df = pd.read_excel(dirpath+fname)
print(df)
df.to_excel(writer, sheet_name=fname)
writer.save()
You can also use your code by make below changes :
for f in glob.glob(r'H:\Learning\files\sales*.xlsx'):
df = pd.read_excel(f)
df['filename'] = os.path.basename(f)
df = df.reset_index()
print(df.columns)
df.set_index(['filename','index'], inplace=True)
and saving it as you have done.
I hope this helps

Python , get duplicates in 1st column of all csv files in a directory

import pandas as pd
import glob
dataset = pd.read_csv('masterfeedproduction-EURNA_2016-06-27.csv',sep =
',',delimiter = None) # select 1 file in the directory
datasets_cols = ['transactionID','gvkey','companyName']
df= dataset.transactionID
df.shape
df.loc[df.duplicated()]
returns the duplicates in the selected file. displays row number and transactionID. so this is correct.
target_directory = r'C:\Users\nikol\Downloads\fullDailyDeltas\fullDailyDeltas'
file_list = glob.glob(target_directory + "/*.csv")
df_result = df.loc[df.duplicated()]
for file in file_list:
return(df_result)
here I am stuck.
target_directory = r'C:\Users\nikol\Downloads\fullDailyDeltas\fullDailyDeltas'
file_list = glob.glob(target_directory + "/*.csv")
for file in file_list:
dataset = pd.read_csv(file)
df = dataset.transactionID
duplicated = df.loc[df.duplicated()]
if duplicated.empty == False:
print(file)
print(duplicated)
Have a look at the glob module.
import pandas as pd
import glob
def your_function(file):
# put your df processing logic here
return df_result
Step 1 - Create list of files in directory
target_directory = r'Path/to/your/dir'
file_list = glob.glob(target_directory + "/*.csv")
# Include slash or it will search in the wrong directory!!
Step 2 - Loop through files in list
for file in file_list: # Loop files
df_result = your_function(file) # Put your logic into a separate function
new_filename = file.replace('.csv', '_processed.csv')
df_result.to_csv(new_filename, index = False)
Comment
In case you would have included your code showing your attempts to do this yourself, your question was answered within seconds.

Categories