from sepa import parser
import re
import csv
import pandas as pd
import numpy as np
# Utility function to remove additional namespaces from the XML
def strip_namespace(xml):
    return re.sub(' xmlns="[^"]+"', '', xml, count=1)
# Read file
with open('test.xml', 'r') as f:
    input_data = f.read()
# Parse the bank statement XML to dictionary
camt_dict = parser.parse_string(parser.bank_to_customer_statement, bytes(strip_namespace(input_data), 'utf8'))
statements = pd.DataFrame.from_dict(camt_dict['statements'])
all_entries = []
for i, _ in statements.iterrows():
    if 'entries' in camt_dict['statements'][i]:
        df = pd.DataFrame()
        dd = pd.DataFrame.from_records(camt_dict['statements'][i]['entries'])
        dg = dd['entry_details']
        df['Date'] = dd['value_date'].str['date']
        df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%d-%m-%Y')
        iban = camt_dict['statements'][i]['account']['id']['iban']
        df['IBAN'] = iban
        df['Currency'] = dd['amount'].str['currency']
        # Sort Credit/Debit in separate Columns
        df['Credit'] = np.where(dd['credit_debit_indicator'] == 'CRDT', dd['amount'].str['_value'], '')
        df['Debit'] = np.where(dd['credit_debit_indicator'] == 'DBIT', dd['amount'].str['_value'], '')
        # Get destination IBAN
        getlength = len(dg.index)  # 2
        for i in range(0, getlength):
            result = str(dd['entry_details'][i])
            print(result + "Resultat " + str(i))
            search_for_iban = re.search("CH\d{2}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{1}|CH\d{19}", result)
            if search_for_iban is None:
                print('the search is none')
                df['Test'] = 'None'
            else:
                print('the search is a match')
                df['Test'] = 'Yes'
        all_entries.append(df)

df_entries = pd.concat(all_entries)
print(df_entries)
My problem is just with this code block:
for i in range(0, getlength):
    result = str(dd['entry_details'][i])
    search_for_iban = re.search("CH\d{2}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{1}|CH\d{19}", result)
    if search_for_iban is None:
        df['Test'] = 'None'
    else:
        df['Test'] = search_for_iban.group()
all_entries.append(df)
I have already tried various things with the index; the counter i counts up cleanly, and getlength is correct at 2 entries.
What I'm expecting
If there is an IBAN in search_for_iban (a regex lookup via re.search) that matches in the 2nd row, I want that IBAN only in the 2nd row of the dataframe's "Test" column, as follows:
(screenshot: what I expect)
What I'm getting
I get the entry duplicated in rows 1 and 2, although nothing was found in row 1. What am I overlooking? My head is hurting! :D
(screenshot: what I got)
I think I am making a thinking error here between a normal for loop and pandas column assignment.
You can try:
for i in range(0, getlength):
    .
    .
    .
    else:
        df.loc[i, 'Test'] = search_for_iban.group()
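Spelled out against the rest of your loop, that could look like this sketch (untested against your data; the regex is yours, just written as a raw string, and .group() extracts the matched text from the match object):

iban_pattern = r"CH\d{2}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{1}|CH\d{19}"

for i in range(getlength):
    result = str(dd['entry_details'][i])
    search_for_iban = re.search(iban_pattern, result)
    if search_for_iban is None:
        df.loc[i, 'Test'] = 'None'                   # writes row i only
    else:
        df.loc[i, 'Test'] = search_for_iban.group()  # matched IBAN, row i only
all_entries.append(df)

The key difference: df['Test'] = value broadcasts the value to every row of the column, while df.loc[i, 'Test'] = value writes it into row i alone, which is why your match "leaked" into row 1.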
Related
I need to modularize the name_2_sex function, which receives a dataframe. I call it from a file called test.py, but it gives me this error.
The function receives a dataframe with data on people and returns the dataframe with 2 extra columns: one with the patient's first name and the other with their gender.
NameError: free variable 'gender_list' referenced before assignment in enclosing scope
The algorithm worked before I modularized it.
name_2_sex code:
import pandas as pd
import operator
import re
def name_2_sex(df):
    def clean_text(txt):
        txt = re.sub("[^a-záéíóúñüäë]", " ", txt.lower())
        txt = re.sub(' +', ' ', txt)
        return txt.strip().split()

    def df_to_dict(df, key_column, val_column):
        """converts two pandas series into a dictionary"""
        xkey = df[key_column].tolist()
        xval = df[val_column].tolist()
        return dict(zip(xkey, xval))

    def get_gender2(names):
        names = clean_text(names)
        names = [x for x in names if gender_list.get(x, 'a') != 'a']
        gender = {'m': 0, 'f': 0, 'a': 0}
        for i, name in enumerate(names):
            g = gender_list.get(name, 'a')
            gender[g] += 1
            gender[g] += 2 if len(names) > 1 and i == 0 and g != 'a' else 0
        gender['a'] = 0 if (gender['f'] + gender['m']) > 0 else 1
        return max(gender.items(), key=operator.itemgetter(1))[0]

    if __name__ == '__main__':
        path = 'https://www.dropbox.com/s/edm5383iffurv4x/nombres.csv?dl=1'
        gender_list = pd.read_csv(path)
        gender_list = df_to_dict(gender_list, key_column='nombre', val_column='genero')

    df_nombre_completo = df["patient_full_name"]
    pacientes_primer_nombre = []
    for name in df_nombre_completo:
        if isinstance(name, str):
            pacientes_primer_nombre.append(name.split(" ")[0])

    for name in df["patient_full_name"]:
        if isinstance(name, str):
            df["first_name"] = name.split(" ")[0]
        else:
            df["first_name"] = 0

    df["first_name"] = [str(name).split(" ")[0] for name in df["patient_full_name"]]
    df["gender"] = df["first_name"]
    df["gender"] = [get_gender2(name) for name in df["first_name"]]
    return df
Code of the file where I want to execute it (test.py):
from nombre_a_sexo import name_2_sex
import pandas as pd
df = pd.read_csv("nuevo_dataset.csv", index_col=0)
print(name_2_sex(df))
Both files are in the same folder.
I did not write the gender-classification algorithm myself, so I would not know what to edit if the problem comes from there.
You only assign gender_list in this block:
if __name__ == '__main__':
    path = 'https://www.dropbox.com/s/edm5383iffurv4x/nombres.csv?dl=1'
    gender_list = pd.read_csv(path)
    gender_list = df_to_dict(gender_list, key_column='nombre', val_column='genero')
But this condition will only be true if you execute nombre_a_sexo.py as a top-level script, not when you import from it.
So you never assign gender_list before the rest of the code tries to use it.
When the function is called from another file, I think you want to use the df parameter instead of reading from this file. So change it to:
if __name__ == '__main__':
    path = 'https://www.dropbox.com/s/edm5383iffurv4x/nombres.csv?dl=1'
    gender_list = pd.read_csv(path)
    gender_list = df_to_dict(gender_list, key_column='nombre', val_column='genero')
else:
    gender_list = df_to_dict(df, key_column='nombre', val_column='genero')
I need to parse the following text file into a dataframe. Any suggestions on methods?
Input:
('name: ', u'Jacky')
('male: ', True)
('hobby: ', u'play football and bascket')
('age: ', 24.0)
----------------
('name: ', u'Belly')
('male: ', True)
('hobby: ', u'dancer')
('age: ', 74.0)
----------------
('name: ', u'Chow')
('male: ', True)
('hobby: ', u'artist')
('age: ', 46.0)
Output:
name male hobby age
jacky True football 24
...
I used a regex to parse your text file:
import re
import pandas as pd
text_path = 'text.txt'
my_dict = {}
pattern = r"\('(\w+):\s+',\s+u*'*([a-zA-Z0-9\s.]*)'*\)"
with open(text_path, 'r') as txt:
    for block in re.split(r"-+\n", txt.read()):
        for line in filter(None, block.split('\n')):
            col_name, value = re.search(pattern, line).group(1, 2)
            try:
                value = int(float(value))
            except ValueError:
                value = True if value == 'True' else False if value == 'False' else value
            if col_name in my_dict:
                my_dict[col_name].append(value)
            else:
                my_dict[col_name] = [value]

df = pd.DataFrame(my_dict)
print(df)
Output:
name male hobby age
0 Jacky True play football and bascket 24
1 Belly True dancer 74
2 Chow True artist 46
Boolean values are not strings but real bools (True or False), and numerical values (like age) are ints (you could keep them as floats), not strings.
Ask me if you don't understand something.
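Since every line in the file is literally a Python tuple, another option is to let the standard library parse it with ast.literal_eval instead of a regex. A sketch (it assumes the record separators are lines of dashes and that every data line is a valid tuple literal):

import ast
import re
import pandas as pd

with open('text.txt') as fh:
    blocks = re.split(r'-{3,}', fh.read())

records = []
for block in blocks:
    # each non-empty line parses to a ('key: ', value) tuple
    pairs = [ast.literal_eval(line) for line in block.splitlines() if line.strip()]
    if pairs:
        records.append({key.strip(' :'): value for key, value in pairs})

df = pd.DataFrame(records)
print(df)

This keeps the original types (bool for male, float for age) without manual conversion.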
I don't know of an existing parser for this data convention, so I suggest building your own. I would then use the readlines() method on the open file, which lets me iterate over the lines of data and apply the correct parser to each row. Finally, I would combine the data and create the DataFrame. Example code is below:
import pandas as pd

def parse_from_weird_file_to_pandas_df(file):
    with open(file, 'r') as f:
        content = f.readlines()
    name_vals = [_parse_text(content[line]) for line in range(0, len(content), 5)]
    male_vals = [_parse_bool(content[line]) for line in range(1, len(content), 5)]
    hobby_vals = [_parse_text(content[line]) for line in range(2, len(content), 5)]
    age_vals = [_parse_int(content[line]) for line in range(3, len(content), 5)]
    df_rows = zip(name_vals, male_vals, hobby_vals, age_vals)
    df = pd.DataFrame(data=df_rows, columns=["name", "male", "hobby", "age"])
    return df

def _parse_text(text_line):
    text = text_line[text_line.find("u'") + 2: text_line.find("')")]
    return text

def _parse_bool(bool_line):
    val_bool = bool_line[bool_line.find("', ") + 3: bool_line.find(")")]
    return val_bool == "True"

def _parse_int(int_line):
    val_int = int_line[int_line.find("', ") + 3: int_line.find(")")]
    return int(float(val_int))
If you wish to shorten 'play football and bascket' to just 'football', you can achieve this, for example, by creating a list of all available hobbies, looping through them against the parsed hobby, and returning the matching one, as in the sketch below.
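A sketch of that idea (the keyword list is hypothetical; you would fill in your own hobbies):

KNOWN_HOBBIES = ['football', 'dancer', 'artist']  # hypothetical keyword list

def shorten_hobby(hobby):
    # return the first known keyword contained in the parsed hobby text
    for known in KNOWN_HOBBIES:
        if known in hobby.lower():
            return known
    return hobby  # no keyword matched: keep the original text

print(shorten_hobby('play football and bascket'))  # -> 'football'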
Here is quick code I made just before lunch. It is not optimised but seems to work (I did not remove the 'u' in the string and did not convert the int, but you should be able to manage that? If not, let me know and I will work on it after!).
The .join removes unnecessary chars, and I assume you always have exactly 4 fields per record...
import pandas as pd

file = open("yourfile.txt", 'r')
lines = file.readlines()
list_to_append = []
df = pd.DataFrame(columns=['name', 'male', 'hobby', 'age'])

for line in lines:
    if '---' not in line:
        line = line.split(',')[1]
        processed_line = ''.join(c for c in line if c not in " ()'\n")
        list_to_append.append(processed_line)
        if len(list_to_append) == 4:
            df.loc[len(df)] = list_to_append
            list_to_append = []
    else:
        pass

file.close()
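A possible follow-up, if you want proper dtypes afterwards (a sketch; note the processed strings still carry the leading 'u' from the u'...' literals):

df['male'] = df['male'] == 'True'                # 'True'/'False' string -> real bool
df['age'] = df['age'].astype(float).astype(int)  # '24.0' -> 24
df['name'] = df['name'].str.lstrip('u')          # crude: would also strip names genuinely starting with 'u'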
I have a list of dataframes and am attempting to export each one to a folder on disk using the pandas DataFrame.to_csv method. However, only the last item in the list of dataframes is being written to disk as a .csv.
Please see code below:
import pandas as pd
import os
import datetime
from pathlib import Path
CSV_Folder = Path('C:\PA_Boundaries\Tests')
Output = r'C:/PA_Boundaries/test_output'
today = datetime.date.today()
date = today.strftime('%Y%m%d')
try:
    dfs = []
    for file in os.listdir(CSV_Folder):
        df = pd.read_csv(CSV_Folder / file)
        dfs.append(df)

    new_dfs = []
    for df in dfs:
        new_df = pd.DataFrame()
        new_df['Original Addr string'] = df['StreetConc']
        new_df['Addr #'] = df['AddNum']
        new_df['Prefix'] = df['StPreDir']
        new_df['Street Name'] = df['StName']
        new_df['StreetType'] = df['StType']
        new_df['Suffix'] = df['StDir']
        new_df['Multi-Unit'] = ''
        new_df['City'] = df['City']
        new_df['Zip Code'] = df['PostCode']
        new_df['4'] = df['PostalExt']
        new_df['County'] = df['CountyID']
        new_df['Addr Type'] = ''
        new_df['Precint Part Name'] = ''
        new_df['Lat'] = df['X']
        new_df['Long'] = df['Y']

        replaced_address_names = []
        for index, row in new_df.iterrows():
            new_row = row['Original Addr string'].replace(',', ' ')
            replaced_address_names.append(new_row)
        new_df['Original Addr string'] = replaced_address_names

        county_id = df.iloc[0, 37]
        new_dfs.append(new_df)

    for i in range(len(new_dfs)):
        new_dfs[i].to_csv(f'{Output}\ADDR_{county_id}_{date}.csv', index=False)

except FileNotFoundError:
    print(f'{file} not found in {CSV_Folder}')
except PermissionError:
    print('Check syntax of paths')
else:
    print('Process Complete')
new_dfs contains the correct number of dataframes. However, when looping through the new list of dataframes and calling .to_csv on each item in the list, only the last item in the list is written to the disk.
The problem lies in the way in which you name your exported file.
After running through the loop, county_id will be equal to the county_id of the last iterated df.
Since the name of your exported dataframe is {Output}\ADDR_{county_id}_{date}.csv, all the exported files are being named with the same county_id and date; in other words, each one overwrites the previous.
To avoid this, you can create a new list called county_ids and then use the last loop to change the name of the saved file. This would be your resulting code:
import pandas as pd
import os
import datetime
from pathlib import Path
CSV_Folder = Path('C:\PA_Boundaries\Tests')
Output = r'C:/PA_Boundaries/test_output'
today = datetime.date.today()
date = today.strftime('%Y%m%d')
try:
    dfs = []
    for file in os.listdir(CSV_Folder):
        df = pd.read_csv(CSV_Folder / file)
        dfs.append(df)

    new_dfs, county_ids = [], []
    for df in dfs:
        new_df = pd.DataFrame()
        new_df['Original Addr string'] = df['StreetConc']
        new_df['Addr #'] = df['AddNum']
        new_df['Prefix'] = df['StPreDir']
        new_df['Street Name'] = df['StName']
        new_df['StreetType'] = df['StType']
        new_df['Suffix'] = df['StDir']
        new_df['Multi-Unit'] = ''
        new_df['City'] = df['City']
        new_df['Zip Code'] = df['PostCode']
        new_df['4'] = df['PostalExt']
        new_df['County'] = df['CountyID']
        new_df['Addr Type'] = ''
        new_df['Precint Part Name'] = ''
        new_df['Lat'] = df['X']
        new_df['Long'] = df['Y']

        replaced_address_names = []
        for index, row in new_df.iterrows():
            new_row = row['Original Addr string'].replace(',', ' ')
            replaced_address_names.append(new_row)
        new_df['Original Addr string'] = replaced_address_names

        county_ids.append(df.iloc[0, 37])
        new_dfs.append(new_df)

    for i in range(len(new_dfs)):
        new_dfs[i].to_csv(f'{Output}/ADDR_{county_ids[i]}_{date}.csv', index=False)

except FileNotFoundError:
    print(f'{file} not found in {CSV_Folder}')
except PermissionError:
    print('Check syntax of paths')
else:
    print('Process Complete')
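As a side note, instead of indexing two parallel lists, you could zip them, which makes the pairing explicit and avoids the county_id/county_ids slip entirely:

for county_id, frame in zip(county_ids, new_dfs):
    frame.to_csv(f'{Output}/ADDR_{county_id}_{date}.csv', index=False)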
Obviously I cannot test this - if you do run it, there may be lines that need tweaking. However, I'd structure the code something like the below. Basically, I'd call a function to do the replacement as I open each file, and write out immediately.
If you can get it working, it will probably be faster, and it reads slightly better as there are fewer lines.
Example:
import pandas as pd
import os
import datetime
from pathlib import Path
CSV_Folder = Path(r'C:/PA_Boundaries/Tests')
Output = r'C:/PA_Boundaries/test_output/'
today = datetime.date.today()
date = today.strftime('%Y%m%d')
def updateFrame(f):
    new_df = pd.DataFrame()
    new_df['Original Addr string'] = f['StreetConc']
    new_df['Addr #'] = f['AddNum']
    new_df['Prefix'] = f['StPreDir']
    new_df['Street Name'] = f['StName']
    new_df['StreetType'] = f['StType']
    new_df['Suffix'] = f['StDir']
    new_df['Multi-Unit'] = ''
    new_df['City'] = f['City']
    new_df['Zip Code'] = f['PostCode']
    new_df['4'] = f['PostalExt']
    new_df['County'] = f['CountyID']
    new_df['Addr Type'] = ''
    new_df['Precint Part Name'] = ''
    new_df['Lat'] = f['X']
    new_df['Long'] = f['Y']
    # better way to replace without looping the rows...
    new_df['Original Addr string'] = new_df['Original Addr string'].str.replace(',', ' ')
    return new_df

for file in os.listdir(CSV_Folder):
    working_file = str(CSV_Folder) + '/' + file
    if working_file.endswith('.csv'):
        try:
            df = pd.read_csv(working_file)
            county_id = str(df.iloc[0, 37])
            # the function returns a frame so you can treat it as such...
            updateFrame(df).to_csv(f'{Output}ADDR_{county_id}_{date}.csv', index=False)
        except FileNotFoundError:
            print(f'{file} not found in {CSV_Folder}')
        except PermissionError:
            print('Check syntax of paths')
        else:
            print('Process Complete')
I have a huge data set which contains shipper/supplier names from different sources and has near-duplicate values in it.
I tried many different techniques available on the internet, but none of them was quite satisfying, or they were too slow for this huge data set.
I found this OpenRefine GitHub repo for fingerprinting algorithms; I added some more code and it solved my problem.
Have a look.
My dataset looks something like this...
import re, string
import pandas as pd
from unidecode import unidecode
from collections import defaultdict
# clean the text before processing
def cleansing_special_characters(txt):
    seps = [' ',';',':','.','`','~',',','*','#','#','|','\\','-','_','?','%','!','^','(',')','[',']','{','}','$','=','+','"','<','>',"'",' AND ', ' and ']
    default_sep = seps[0]
    txt = str(txt)
    for sep in seps[1:]:
        if sep == " AND " or sep == " and ":
            txt = txt.upper()
            txt = txt.replace(sep, ' & ')
        else:
            txt = txt.upper()
            txt = txt.replace(sep, default_sep)
    # if every token is numeric, collapse the whole string to the token NUMBERS
    try:
        list(map(int, txt.split()))
        txt = 'NUMBERS'
    except:
        pass
    txt = re.sub(' +', ' ', txt)
    temp_list = [i.strip() for i in txt.split(default_sep)]
    temp_list = [i for i in temp_list if i]
    return " ".join(temp_list)
punctuation = re.compile('[%s]' % re.escape(string.punctuation))
class fingerprinter(object):
    # __init__ function
    def __init__(self, string):
        self.string = self._preprocess(string)

    # strip leading/trailing spaces, lowercase, remove punctuation
    def _preprocess(self, string):
        return punctuation.sub('', string.strip().lower())

    def _latinize(self, string):
        return unidecode(string)
        # return unidecode(string.decode('utf-8'))

    def _unique_preserve_order(self, seq):
        seen = set()
        seen_add = seen.add
        return [x for x in seq if not (x in seen or seen_add(x))]

    # ####################################################
    def get_fingerprint(self):
        return self._latinize(' '.join(self._unique_preserve_order(sorted(self.string.split()))))

    def get_ngram_fingerprint(self, n=1):
        return self._latinize(''.join(self._unique_preserve_order(sorted([self.string[i:i + n] for i in range(len(self.string) - n + 1)]))))
# read excel file
df = pd.read_excel('Input_File.xlsx')
#preprocess the column
df['Clean'] = df['SUPPLIER_NAME'].apply(cleansing_special_characters)
# step 1 cleaning
# ##for n_gram fingerprint algorithm
###########################################################################################
df['n_gram_fingerprint_n2'] = df['Clean'].apply(lambda x : fingerprinter(x.replace(" ","")).get_ngram_fingerprint(n=2))
## generate tag_id for every unique generated n_gram_fingerprint
d = defaultdict(lambda: len(d))  # auto-incrementing id: each unseen fingerprint gets the next integer
df['tag_idn'] = [d[x] for x in df['n_gram_fingerprint_n2']]
###########################################################################################
#drop n_gram column
df.drop(columns=['n_gram_fingerprint_n2'], inplace=True)
# make copy to create group of tag_id
df1 = df[['SUPPLIER_NAME','tag_idn']]
# drop SUPPLIER_NAME column , we have tag_id's now
df.drop(columns=['SUPPLIER_NAME'], inplace=True)
# group df with tag_id with selecting minimum
#group = df.groupby('tag_id').min().reset_index()
group = df.loc[df["Clean"].str.len().groupby(df["tag_idn"]).idxmax()]
# join both the data frames group(unique) and main data
df_merge = pd.merge(df1,group, on=['tag_idn'])
# output excel file
df_merge.to_excel('Output_File.xlsx', index=False)
This is what the output data in the Excel file looks like.
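As a quick sanity check of the fingerprinter class, two near-duplicate spellings collapse to the same key (a sketch):

print(fingerprinter('Acme Corp.').get_fingerprint())  # -> 'acme corp'
print(fingerprinter('CORP  ACME').get_fingerprint())  # -> 'acme corp'

Rows that end up with the same fingerprint receive the same tag_idn in the pipeline above (which uses the 2-gram variant), and the groupby then keeps the longest cleaned name as the canonical spelling.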
Hi, I am new to Python and struggling my way out. Currently I am doing an Excel-file-appending kind of task, and here's my sample code. I am getting a "list index out of range" error because, as far as I can tell, the while loop is not breaking at the end of each Excel file. Any help would be appreciated. Thanks:
import xlrd
import glob
import os
import openpyxl
import csv
from xlrd import open_workbook
from os import listdir
row = {}
basedir = '../files/'
files = listdir('../files')
sheets = [filename for filename in files if filename.endswith("xlsx")]
header_is_written = False
for filename in sheets:
    print('Parsing {0}{1}\r'.format(basedir, filename))
    worksheet = open_workbook(basedir + filename).sheet_by_index(0)
    print(worksheet.cell_value(5, 6))
    counter = 0
    while True:
        row['plan name'] = worksheet.cell_value(1 + counter, 1).strip()
        row_values = worksheet.row_slice(counter + 1, start_colx=0, end_colx=30)
        row['Dealer'] = int(row_values[0].value)
        row['Name'] = str(row_values[1].value)
        row['City'] = str(row_values[2].value)
        row['State'] = str(row_values[3].value)
        row['Zip Code'] = int(row_values[4].value)
        row['Region'] = str(row_values[5].value)
        row['AOM'] = str(row_values[6].value)
        row['FTS Short Name'] = str(row_values[7].value)
        row['Overall Score'] = float(row_values[8].value)
        row['Overall Rank'] = int(row_values[9].value)
        row['Count of Ros'] = int(row_values[10].value)
        row['Count of PTSS Cases'] = int(row_values[11].value)
        row['% of PTSS cases'] = float(row_values[12].value)
        row['Rank of Cases'] = int(row_values[13].value)
        row['% of Not Prepared'] = float(row_values[14].value)
        row['Rank of Not Prepared'] = int(row_values[15].value)
        row['FFVt Pre Qrt'] = float(row_values[16].value)
        row['Rank of FFVt'] = int(row_values[17].value)
        row['CSI Pre Qrt'] = int(row_values[18].value)
        row['Rank of CSI'] = int(row_values[19].value)
        row['FFVC Pre Qrt'] = float(row_values[20].value)
        row['Rank of FFVc'] = int(row_values[21].value)
        row['OnSite'] = str(row_values[22].value)
        row['% of Onsite'] = str(row_values[23].value)
        row['Not Prepared'] = int(row_values[24].value)
        row['Open'] = str(row_values[25].value)
        row['Cost per Vin Pre Qrt'] = float(row_values[26].value)
        row['Damages per Visit Pre Qrt'] = float(row_values[27].value)
        row['Claim Sub time pre Qrt'] = str(row_values[28].value)
        row['Warranty Index Pre Qrt'] = str(row_values[29].value)
        counter += 1
        if row['plan name'] is None:
            break
        with open('table.csv', 'a', newline='') as f:
            w = csv.DictWriter(f, row.keys())
            if header_is_written is False:
                w.writeheader()
                header_is_written = True
            w.writerow(row)
In place of while True, use a for loop.
row['plan name'] = worksheet.cell_value(1 + counter, 1).strip()
row_values = worksheet.row_slice(counter + 1, start_colx=0, end_colx=30)
for values in row_values:
    row['Dealer'] = int(values.value)
    row['Name'] = str(values.value)
    ....
because while True means running the loop an infinite number of times (or until it meets a break keyword inside the loop).
Read more about while loops.
A while True loop basically means: execute the following code block to infinity, unless a break or sys.exit statement gets you out.
So in your case, you need to terminate after the lines to append from the Excel file are exhausted. You have two options: check whether there are more lines to append, and if not, break.
A more suitable approach when reading a file is a for loop. This kind of loop terminates when its iterator is exhausted.
Also, you should consider gathering the content of the Excel file in one operation and saving it to a variable. Then, once you have it, iterate over it and append it to the CSV.
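A minimal sketch of that approach with xlrd (sheet.nrows bounds the loop, so there is no index error; it assumes row 0 is a header, matching the counter + 1 logic in the question):

for filename in sheets:
    worksheet = open_workbook(basedir + filename).sheet_by_index(0)
    for r in range(1, worksheet.nrows):  # bounded by the sheet, no manual counter
        plan_name = str(worksheet.cell_value(r, 1)).strip()
        if not plan_name:                # blank plan name: treat as end of data
            break
        row_values = worksheet.row_slice(r, start_colx=0, end_colx=30)
        row['plan name'] = plan_name
        row['Dealer'] = int(row_values[0].value)
        # ... the remaining column assignments from the question, unchanged ...
        # then write `row` to table.csv exactly as before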