I am iterating through a directory of files (.pdf files) that are names 'FIRST LAST Zip.pdf'... I save the name and zip into variables just fine.. Then I basically want to get the row number they match at (via excel in pandas), then with the row number, get that row's column value as the new name i.e. ID.
I have the below; which worked for about 1/6 if my files. The rest are just hitting the prints 'no match found', when there is indefinite matches found. No errors in terminal, just remaining files (ALOT) which definite matches, just output 'match not found, match not found'. Is it something with index.min()?
import PyPDF2
from PyPDF2 import PdfFileWriter, PdfFileReader
import re
import config
import xlrd
import numpy as np
import pandas as pd
import math
import os
for filename in os.listdir(config.Total):
if filename.endswith(".pdf"):
First_Name, Last_Name, Zip = filename.replace(".pdf",'').split()
Name = First_Name + " " + Last_Name
print(Name)
print(Zip)
data1 = pd.read_excel(config.Excel1)
df = pd.DataFrame(data1)
header = df.iloc[0]
df = df[1:]
df.rename(columns = header)
row_numberd1 = df[df['Member Name'].str.contains(Name)].index.min()
row_numberd12 = df[df['Member Address Line 3'].str.contains(Zip)].index.min()
if row_numberd1 == row_numberd12: # When rows match of NameUp and Zip var in DF1
rowMatched = row_numberd1
print("Match Found in DF1")
print(rowMatched)
MemberID = df['ID'][rowMatched]
MemberI = str(MemberID)
os.rename(config.Total+filename, config.ID+MemberI+'.pdf')
else:
print("No Match Found in DF1, Search Df2")
data2 = pd.read_excel(config.Excel2)
df2 = pd.DataFrame(data2)
header2 = df2.iloc[0]
df2 = df2[1:]
df2.rename(columns = header2)
row_numberd2 = df2[df2['Member Name'].str.contains(Name)].index.min()
row_numberd22 = df2[df2['Member Address Line 3'].str.contains(Zip)].index.min()
if row_numberd2 == row_numberd22: # When rows match of NameUp and Zip var in DF2
rowMatched2 = row_numberd2
print("Match Found in DF2")
print(rowMatched2)
MemberID = df2['ID'][rowMatched2]
MemberI = str(MemberID)
os.rename(config.Total+filename, config.ID+MemberI+'.pdf')
Update; via the comment I am trying this!
import PyPDF2
from PyPDF2 import PdfFileWriter, PdfFileReader
import re
import config
import xlrd
import numpy as np
import pandas as pd
import math
import os
data1 = pd.read_excel(config.Excel1)
data2 = pd.read_excel(config.Excel2)
df = pd.DataFrame(data1)
header = df.iloc[0]
df = df[1:]
df.rename(columns = header)
df2 = pd.DataFrame(data2)
header2 = df2.iloc[0]
df2 = df2[1:]
df2.rename(columns = header2)
for filename in os.listdir(config.Total):
if filename.endswith(".pdf"):
First_Name, Last_Name, Zip = filename.replace(".pdf",'').split()
Name = First_Name + " " + Last_Name
print(Name)
print(Zip)
UniqueMatch = len(df[df['Member Name'].str.contains(Name) & df['Member Address Line 3'].str.contains(Zip)]) == 1
if UniqueMatch: # When rows match of NameUp and Zip var in DF1
rowMatched = UniqueMatch
print("Match Found in DF1")
print(rowMatched)
MemberID = df['ID'][rowMatched]
MemberI = str(MemberID)
os.rename(config.Total+filename, config.ID+MemberI+'.pdf')
else:
print("Match not Found in DF1")
print("No Match Found in DF1, Search Df2")
UniqueMatch2 = len(df2[df2['Member Name'].str.contains(Name) & df2['Member Address Line 3'].str.contains(Zip)]) == 1
if UniqueMatch2: # When rows match of NameUp and Zip var in DF2
rowMatched2 = UniqueMatch2
print("Match Found in DF2")
print(rowMatched2)
MemberID = df2['ID'][rowMatched2]
MemberI = str(MemberID)
os.rename(config.Total+filename, config.ID+MemberI+'.pdf')
else:
print("Match not Found in DF2")
But it is outputting an error and failing when I try to rename with the row number ID column value:
Traceback (most recent call last):
File "rename.py", line 60, in <module>
MemberID = df2['ID'][rowMatched2]
File "C:\Program Files (x86)\Python37-32\lib\site-packages\pandas\core\series.py", line 1064, in __getitem__
result = self.index.get_value(self, key)
File "C:\Program Files (x86)\Python37-32\lib\site-packages\pandas\core\indexes\base.py", line 4723, in get_value
return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
Final Update: It's still only catching some of the matches below, via #ilfy6 comments/suggestions.
for filename in os.listdir(config.Total2):
if filename.endswith(".pdf"):
First_Name, Last_Name, Zip = filename.replace(".pdf",'').split()
Name = First_Name + " " + Last_Name
print(Name)
print(Zip)
matches2 = df2[df['Member Name'].str.contains(Name) & df2['Member Address Line 3'].str.contains(Zip)]
if len(matches2) == 1:
row_index = matches2.iloc[0]['ID']
print("Match Found in DF2")
print(row_index)
# MemberID = df2.loc[row_index, 'ID']
MemberI = str(row_index)
os.rename(config.Total2+filename, config.ID+MemberI+'.pdf')
else:
print("Match not Found in DF2")
# os.rename(config.Total+filename, config.ManualCheck+filename+'.pdf')
matches1 = df[df['Member Name'].str.contains(Name) & df['Member Address Line 3'].str.contains(Zip)]
if len(matches1) == 1:
row_index = matches1.iloc[0]['ID']
print("Match Found in DF1")
print(row_index)
# MemberID = df.loc[row_index, 'ID']
MemberI = str(row_index)
os.rename(config.Total2+filename, config.ID+MemberI+'.pdf')
else:
print("Match not Found in DF1")
# print("No Match Found in DF1, Search Df2")
Regarding your error:
UniqueMatch = len(df[df['Member Name'].str.contains(Name) & df['Member Address Line 3'].str.contains(Zip)]) == 1
if UniqueMatch: # When rows match of NameUp and Zip var in DF1
rowMatched = UniqueMatch
UniqueMatch is a boolean.
Consider instead:
matches = df[df['Member Name'].str.contains(Name) & df['Member Address Line 3'].str.contains(Zip)]
if len(matches) == 1:
row_index = matches.iloc[0].name
Getting the first (and here, only) entry's name yields the index of the row which you seem to be wanting. Perhaps it would also be even easier to just query what you want from the matches:
matches.iloc[0]['ID']
# get the first element, then the contents of the ID column
On your last comment, I renamed your variable from rowMatched to row_index, which is more consistent with Python conventions.
Related
As Iam New to Python I need some help to compare two XML files.
These are the Following Conditions:
To print Common fullPath Name and Name (fullPath and Name are the attributes present in the XML file) between the two XML files.
To print the values which is present in only first file and not in second file.
To print the values which is present in only second file and not in first file.
Later, Have to print this output in excel file having different sheets.
for example (1st condition in sheet 1, 2nd condition in sheet2, 3rd condition in sheer3 of the same excel file.)
Can please anyone help me with the code that satisfies the above condition which I have mentioned.
This is the code which I have tried.
from lxml import etree
Base = etree.parse('Base.xml')
Target = etree.parse('Target.xml')
Base_fullPath = Base.xpath("//Member/#fullPath")
Target_fullPath = Target.xpath("//Member/#fullPath")
Base_name = Base.xpath("//Member/#name")
Target_name = Target.xpath("//Member/#name")
def match(Base_fullPath, Target_fullPath, Base_name,Target_name):
Base_fullPath_set = set(Base_fullPath)
Target_fullPath_set = set(Target_fullPath)
Base_name_set = set(Base_name)
Target_name_set = set(Target_name)
if (Base_fullPath_set & Target_fullPath_set, Base_name_set & Target_name_set):
x = open('C:\\Users\\pvl\\Desktop\\New folder\\Common_FullPath.csv', 'w')
y=(Base_fullPath_set & Target_fullPath_set)
z=(Base_name_set & Target_name_set)
print("common details Full Path: \n", *y, sep='\n', file = x)
print("\n")
x = open('C:\\Users\\pvl\\Desktop\\New folder\\Common_name.csv', 'w')
print("\n common details Name: \n", *z, sep='\n', file=x)
else:
print("No Matches Found")
match(Base_fullPath, Target_fullPath, Base_name,Target_name)
def non_match_elements(list_base, list_target):
non_match_base = []
non_match_target = []
for i in list_base:
if i not in list_target:
non_match_base.append(i)
for i in list_target:
if i not in list_base:
non_match_target.append(i)
return non_match_base
return non_match_target
list_base = Base.xpath("//Member/#*")
list_target = Target.xpath("//Member/#*")
non_match_base = non_match_elements(list_base, list_target)
x = open('C:\\Users\\pvl\\Desktop\\New folder\\Present_in_base.csv', 'w')
print("\n Base Details: \n", *non_match_base, sep='\n', file = x)
non_match_target = non_match_elements(list_target, list_base)
x = open('C:\\Users\\pvl\\Desktop\\New folder\\Present_in_target.csv', 'w')
print("\n Target Details: \n", *non_match_target, sep='\n', file = x)
import pandas as pd
df = pd.read_csv('C:\\Users\\pvl\\Desktop\\New folder\\Common_FullPath.csv')
df1 = pd.read_csv('C:\\Users\\pvl\\Desktop\\New folder\\Common_name.csv')
df2 = pd.read_csv('C:\\Users\\pvl\\Desktop\\New folder\\Present_in_base.csv', delimiter=';;', on_bad_lines = 'skip', engine = 'python' )
df3 = pd.read_csv('C:\\Users\\pvl\\Desktop\\New folder\\Present_in_target.csv', delimiter=';', on_bad_lines = 'skip', engine = 'python')
with pd.ExcelWriter("C:\\Users\\pvl\\Desktop\\New folder\\combined.xlsx") as writer:
df1.to_excel(writer, sheet_name="Common_name", index=False)
df2.to_excel(writer, sheet_name="base_Details", index=False)
df3.to_excel(writer, sheet_name = "target_Details", index=Fal
I have a list of dataframes and am attempting to export each using the pandas.df.to_csv method to a folder on disk. However, only the last item in the list of dataframes is being written to disk as a .csv
Please see code below:
import pandas as pd
import os
import datetime
from pathlib import Path
CSV_Folder = Path('C:\PA_Boundaries\Tests')
Output = r'C:/PA_Boundaries/test_output'
today = datetime.date.today()
date = today.strftime('%Y%m%d')
try:
dfs = []
for file in os.listdir(CSV_Folder):
df = pd.read_csv(CSV_Folder / file)
dfs.append(df)
new_dfs = []
for df in dfs:
new_df = pd.DataFrame()
new_df['Original Addr string'] = df['StreetConc']
new_df['Addr #'] = df['AddNum']
new_df['Prefix'] = df['StPreDir']
new_df['Street Name'] = df['StName']
new_df['StreetType'] = df['StType']
new_df['Suffix'] = df['StDir']
new_df['Multi-Unit'] = ''
new_df['City'] = df['City']
new_df['Zip Code'] = df['PostCode']
new_df['4'] = df['PostalExt']
new_df['County'] = df['CountyID']
new_df['Addr Type'] = ''
new_df['Precint Part Name'] = ''
new_df['Lat'] = df['X']
new_df['Long'] = df['Y']
replaced_address_names = []
for index, row in new_df.iterrows():
new_row = row['Original Addr string'].replace(',', ' ')
replaced_address_names.append(new_row)
new_df['Original Addr string'] = replaced_address_names
county_id = df.iloc[0, 37]
new_dfs.append(new_df)
for i in range(len(new_dfs)):
new_dfs[i].to_csv(f'{Output}\ADDR_{county_id}_{date}.csv', index=False)
except FileNotFoundError:
print(f'{file} not found in {CSV_Folder}')
except PermissionError:
print('Check syntax of paths')
else:
print('Process Complete')
new_dfs contains the correct number of dataframes. However, when looping through the new list of dataframes and calling .to_csv on each item in the list, only the last item in the list is written to the disk.
The problem lies in the way in which you name your exported file.
After running through the loop, county_id will be equal to the last county_id, or the county_id of the last iterated df.
Since the name of your exported dataframe is {Output}\ADDR_{county_id}_{date}.csv, all the exported files are being named by the same count_id and date, or in other words, they are being rewritten.
To avoid this, you can create a new list called county_ids and then use the last loop to change the name of the saved file. This would be your resulting code:
import pandas as pd
import os
import datetime
from pathlib import Path
CSV_Folder = Path('C:\PA_Boundaries\Tests')
Output = r'C:/PA_Boundaries/test_output'
today = datetime.date.today()
date = today.strftime('%Y%m%d')
try:
dfs = []
for file in os.listdir(CSV_Folder):
df = pd.read_csv(CSV_Folder / file)
dfs.append(df)
new_dfs, county_ids = [], []
for df in dfs:
new_df = pd.DataFrame()
new_df['Original Addr string'] = df['StreetConc']
new_df['Addr #'] = df['AddNum']
new_df['Prefix'] = df['StPreDir']
new_df['Street Name'] = df['StName']
new_df['StreetType'] = df['StType']
new_df['Suffix'] = df['StDir']
new_df['Multi-Unit'] = ''
new_df['City'] = df['City']
new_df['Zip Code'] = df['PostCode']
new_df['4'] = df['PostalExt']
new_df['County'] = df['CountyID']
new_df['Addr Type'] = ''
new_df['Precint Part Name'] = ''
new_df['Lat'] = df['X']
new_df['Long'] = df['Y']
replaced_address_names = []
for index, row in new_df.iterrows():
new_row = row['Original Addr string'].replace(',', ' ')
replaced_address_names.append(new_row)
new_df['Original Addr string'] = replaced_address_names
county_ids.append(df.iloc[0, 37])
new_dfs.append(new_df)
for i in range(len(new_dfs)):
new_dfs[i].to_csv(f'{Output}\ADDR_{county_id[i]}_{date}.csv', index=False)
except FileNotFoundError:
print(f'{file} not found in {CSV_Folder}')
except PermissionError:
print('Check syntax of paths')
else:
print('Process Complete')
Obviously I cannot test this - if you do run it there maybe lines that need tweaking. However, I'd do the code something like the below. Basically I'd call a function to replace as I'm opening and write out immediately.
If you can get it working it will probably be faster and reads slightly better as there are less lines.
Example:
import pandas as pd
import os
import datetime
from pathlib import Path
CSV_Folder = Path(r'C:/PA_Boundaries/Tests')
Output = r'C:/PA_Boundaries/test_output/'
today = datetime.date.today()
date = today.strftime('%Y%m%d')
def updateFrame(f):
new_df = pd.DataFrame()
new_df['Original Addr string'] = f['StreetConc']
new_df['Addr #'] = f['AddNum']
new_df['Prefix'] = f['StPreDir']
new_df['Street Name'] = f['StName']
new_df['StreetType'] = f['StType']
new_df['Suffix'] = f['StDir']
new_df['Multi-Unit'] = ''
new_df['City'] = f['City']
new_df['Zip Code'] = f['PostCode']
new_df['4'] = f['PostalExt']
new_df['County'] = f['CountyID']
new_df['Addr Type'] = ''
new_df['Precint Part Name'] = ''
new_df['Lat'] = f['X']
new_df['Long'] = f['Y']
# better way to replace without looping the rows...
new_df['Original Addr string'] = new_df['Original Addr string'].str.replace(',', ' ')
return new_df
for file in os.listdir(CSV_Folder):
working_file = str(CSV_Folder) + '/' + file
if working_file.endswith('.csv'):
try:
df = pd.read_csv(working_file)
county_id = str(df.iloc[0, 37])
# the function returns a frame so you can treat it as such...
updateFrame(df).to_csv(f'{Output}ADDR_{county_id}_{date}.csv', index=False)
except FileNotFoundError:
print(f'{file} not found in {CSV_Folder}')
except PermissionError:
print('Check syntax of paths')
else:
print('Process Complete')
I am attempting to convert some columns to a date format but am not having any luck. Here is my code:
from datetime import date, datetime
from utils import misc_errors
from os import listdir, remove
from os.path import isfile, join
from pathlib import Path
import pandas as pd
import shutil
import csv
import io
import codecs
# getting paths that will be used later
path = str(Path().absolute()) + r'\\'
files = []
fetch = r'C:\' +'\\'
net = r'C:\' +'\\'
# getting the names of the files needed to copy
allfiles = [f for f in listdir(fetch) if isfile(join(fetch, f))]
for name in allfiles:
if name.endswith('csv'):
files.append(name)
for file_name in files:
#copy the file
shutil.copy2(fetch + file_name, path + file_name)
#get the date for later
file_date = date.today().strftime("%Y%m%d")
# Reading the data from the csv file
#file_df = pd.read_csv(file_name, sep=',', quotechar='"', thousands=',', encoding='Latin-1')
file_df = pd.read_csv(file_name, sep=',',delimiter=',', quotechar='"', thousands=',', encoding='Latin-1', dtype='object', low_memory=False, skiprows=5)
file_df.columns = [col.strip() for col in file_df.columns]
#populate the count column
total = len(file_df.index)
count = []
for i in range(0, total):
count.append('1')
file_df["count()"] = count
# get a list of the headers for use later
headers = file_df.columns.values.tolist()
file_df.fillna('',inplace=True)
if 'project' in file_name:
# remove all duplicates from the projects file
file_df = file_df.drop_duplicates(keep='first')
file_final = "PROJECTS.FULL." + file_date
supplier = []
for i in range(0, total):
supplier.append('Unclassified')
file_df["Suppliers - ERP Supplier ID"] = supplier
file_df["Suppliers - ERP Supplier"] = supplier
file_df = file_df.apply(lambda x: pd.Series([str(x[i]).replace("\n",'') for i in range(0, len(x))], index=headers), axis=1)
num_headers = [r"sum(Annual Spend Amount)", r"sum(Total Contract Value Amount)"]
for header in num_headers:
file_df[header] = ['{0:.0f}'.format(float(file_df[header][i])) if file_df[header][i] == 0 else '{0:,.2f}'.format(float(file_df[header][i])) if file_df[header][i] != '' else '' for i in range(0,len(file_df[header]))]
header = r"sum(% of Total Contract Value in US)"
file_df[header] = [int(float(file_df[header][i])) if file_df[header][i] != '' else '' for i in range(0, len(file_df[header]))]
header = "Reporting Year"
file_df[header] = [int(float(file_df[header][i])) if file_df[header][i] != '' else '' for i in range(0, len(file_df[header]))]
word_headers = ["Description", "Key Considerations", "Key Highlights / Value Statement", "Status Update"]
for header in word_headers:
file_df = misc_errors(file_df, header)
file_df.columns = [c.replace("–", "-") for c in file_df]
file_headers = ["Begin Date","End Date - Date","Estimated Completion Date - Date",
"Anticipated T&O Legal Engagement Date - Date","Benefits Start Date - Date", "Benefits End Date - Date" ]
pd.to_datetime(file_headers['Begin Date'], errors='ignore')
file_df.to_csv(file_final, index=False, encoding="latin-1")
remove(file_name)
shutil.copy2(path + file_final, net + file_final)
I am trying to convert the columns in file_header (near the bottom) to date,
Updated and added the full error log here:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-3-f04094ba593a> in <module>
79 "Anticipated T&O Legal Engagement Date - Date","Benefits Start Date - Date", "Benefits End Date - Date" ]
80
---> 81 pd.to_datetime(file_headers['Begin Date'], errors='ignore')
82
83 file_df.to_csv(file_final, index=False, encoding="latin-1")
TypeError: list indices must be integers or slices, not str
Still not working though. Thanks for all the help so far, and let me know if there is anything else I can do here.
You may want to check the content of your Date column. Try to loop over it and spot the non-working values:
for row in df.itertuples():
try:
pd.to_datetime(row.Date)
except:
print(row.Date)
Then think of what you want to do with them
I am simply trying to get the value of column 'ID' where two rows meet (meet where Name and Zip are on the same row) then with that row number, I get column 'ID' value to rename file with. I don't think writing the '.rename' correctly for one.. any pointers appreciated. Currently no error, but no output either.
import PyPDF2
from PyPDF2 import PdfFileWriter, PdfFileReader
import re
import config
import xlrd
import numpy as np
import pandas as pd
import math
import os
for filename in os.listdir(config.Total):
if filename.endswith(".pdf"):
First_Name, Last_Name, Zip = filename.replace(".pdf",'').split()
Name = First_Name + " " + Last_Name
print(Name)
print(Zip)
data1 = pd.read_excel(config.Excel1)
data2 = pd.read_excel(config.Excel2)
df = pd.DataFrame(data1)
header = df.iloc[0]
df2 = pd.DataFrame(data2)
header2 = df2.iloc[0]
df = df[1:]
df.rename(columns = header)
df2 = df2[1:]
df2.rename(columns = header2)
row_numberd1 = df[df['Member Name'].str.contains(Name)].index.min()
row_numberd12 = df[df['Member Address Line 3'].str.contains(Zip)].index.min()
if row_numberd1 == row_numberd12: # When rows match of NameUp and Zip var in DF1
rowMatched = row_numberd1
print("Match Found")
print(rowMatched)
MemberID = df['ID'][rowMatched]
MemberI = str(MemberID)
os.rename(config.ID+ "/" + MemberI)
row_numberd2 = df2[df2['Member Name'].str.contains(Name)].index.min()
row_numberd22 = df2[df2['Member Address Line 3'].str.contains(Zip)].index.min()
if row_numberd2 == row_numberd22: # When rows match of NameUp and Zip var in DF2
rowMatched2 = row_numberd2
print("No Match Found")
print(rowMatched2)
MemberID = df2['ID'][rowMatched2]
MemberI = str(MemberID)
os.rename(config.ID+ "/" + MemberI)
Maybe?
os.rename(config.Total, + MemberI, config.ID)
Since the code runs without an error, it must never gets into the body of if statement. Hence never renames. Maybe there is no match! You can find out by explicit renaming:
row = df[df['Member Name'].str.contains(Name) and df['Member Address Line 3'].str.contains(Zip)].index.min()
os.rename("old_filename",str(df['ID'][row]))
In the below; I believe I am doing the os.rename incorrectly. In the console it's finding and printing the right contents; but it is not doing the renaming.
What I really want to do is rename the current file, with the MemberI value, then move to a new directory. But at this point I'll settle to rename as is. MemberI is reporting accurately in console, just can't get the rename to perform. Any ideas.
for filename in os.listdir(config.Total):
if filename.endswith(".pdf"):
First_Name, Last_Name, Zip = filename.replace(".pdf",'').split()
Name = First_Name + " " + Last_Name
print(Name)
print(Zip)
data1 = pd.read_excel(config.Excel1)
data2 = pd.read_excel(config.Excel2)
df = pd.DataFrame(data1)
header = df.iloc[0]
df2 = pd.DataFrame(data2)
header2 = df2.iloc[0]
df = df[1:]
df.rename(columns = header)
df2 = df2[1:]
df2.rename(columns = header2)
row_numberd1 = df[df['Member Name'].str.contains(Name)].index.min()
row_numberd12 = df[df['Member Address Line 3'].str.contains(Zip)].index.min()
if row_numberd1 == row_numberd12: # When rows match of NameUp and Zip var in DF1
rowMatched = row_numberd1
print("Match Found")
print(rowMatched)
MemberID = df['ID'][rowMatched]
MemberI = str(MemberID)
os.rename(filename, MemberI)
You need the full path of the file that you're trying to rename filename only has the name of the file not the entire path
os.rename(config.Total+'/'+filename, MemberI)
if config.Total already has / then just
os.rename(config.Total+filename, MemberI)