from sepa import parser
import re
import csv
import pandas as pd
import numpy as np
# Utility function to remove additional namespaces from the XML
def strip_namespace(xml):
    return re.sub(' xmlns="[^"]+"', '', xml, count=1)
# Read file
with open('test.xml', 'r') as f:
    input_data = f.read()
# Parse the bank statement XML to dictionary
camt_dict = parser.parse_string(parser.bank_to_customer_statement, bytes(strip_namespace(input_data), 'utf8'))
statements = pd.DataFrame.from_dict(camt_dict['statements'])
all_entries = []
for i, _ in statements.iterrows():
    if 'entries' in camt_dict['statements'][i]:
        df = pd.DataFrame()
        dd = pd.DataFrame.from_records(camt_dict['statements'][i]['entries'])
        dg = dd['entry_details']
        df['Date'] = dd['value_date'].str['date']
        df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%d-%m-%Y')
        iban = camt_dict['statements'][i]['account']['id']['iban']
        df['IBAN'] = iban
        df['Currency'] = dd['amount'].str['currency']
        # Sort Credit/Debit in separate Columns
        df['Credit'] = np.where(dd['credit_debit_indicator'] == 'CRDT', dd['amount'].str['_value'], '')
        df['Debit'] = np.where(dd['credit_debit_indicator'] == 'DBIT', dd['amount'].str['_value'], '')
        # Get destination IBAN
        getlength = len(dg.index)  # 2
        for i in range(0, getlength):
            result = str(dd['entry_details'][i])
            print(result + "Resultat " + str(i))
            search_for_iban = re.search(r"CH\d{2}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{1}|CH\d{19}", result)
            if search_for_iban is None:
                print('the search is none')
                df['Test'] = 'None'
            else:
                print('the search is a match')
                df['Test'] = 'Yes'
        all_entries.append(df)
df_entries = pd.concat(all_entries)
print(df_entries)
**My problem here is just with this code block:**
for i in range(0, getlength):
    result = str(dd['entry_details'][i])
    search_for_iban = re.search(r"CH\d{2}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{1}|CH\d{19}", result)
    if search_for_iban is None:
        df['Test'] = 'None'
    else:
        df['Test'] = search_for_iban.group()
all_entries.append(df)
I have already tried various things with the index; the loop variable i counts up cleanly and getlength is correct for the 2 entries.
What I'm expecting
If there is an IBAN number matched by search_for_iban (a regex lookup via re.search) in the 2nd row, I want that IBAN only in the 2nd row of the dataframe column "Test", as follows:
[screenshot: what I expect]
What I'm getting
I get the same entry duplicated in rows 1 and 2, even though nothing was found in row 1. What am I overlooking? My head is hurting! :D
[screenshot: what I got]
I think I'm mixing up something between a normal for loop and pandas row assignment here.
You can try:
for i in range(0, getlength):
    .
    .
    .
    else:
        df.loc[i, 'Test'] = search_for_iban.group()
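For example, a minimal sketch of how the corrected loop could look (assuming dd and df share the same default integer index, which they do above since df's columns are built from dd's):

for i in range(0, getlength):
    result = str(dd['entry_details'][i])
    search_for_iban = re.search(r"CH\d{2}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{1}|CH\d{19}", result)
    if search_for_iban is None:
        df.loc[i, 'Test'] = 'None'   # only touches row i
    else:
        df.loc[i, 'Test'] = search_for_iban.group()
all_entries.append(df)

The difference: df['Test'] = ... broadcasts one value to every row of the column (which is why the match from row 2 also showed up in row 1), while df.loc[i, 'Test'] writes only into row i.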
I'm taking CS50 and got stuck on this pset6.
I wrote this code and it works fine for the given 'small' database.
On the 'large' one I get wrong values for my DNA sequence.
For example, using debug50 I can see that the Albus sequence should be 15, 49, 38, ... but my counts are 21, 55, 64, ...
What's wrong? And why does it work fine on the small database but not on the large one?
Thanks for the help!
# Import ARGV and CSV library
from sys import argv, exit
import pandas as pd
import csv

# Check if argv has 3 arguments (program name, csv file and dna sequence)
while True:
    if len(argv) != 3:
        print("Usage: python dna.py data.csv sequence.txt")
        exit(1)
    else:
        break

with open(argv[2], 'r', encoding="UTF-8") as txt:
    dna_seq = txt.read()

# Find the number of STRs - AGATC, TTTTTTCT, AATG, TCTAG, GATA, TATC, GAAA, TCTG
AGATC = dna_seq.count("AGATC")
TTTTTTCT = dna_seq.count("TTTTTTCT")
AATG = dna_seq.count("AATG")
TCTAG = dna_seq.count("TCTAG")
GATA = dna_seq.count("GATA")
TATC = dna_seq.count("TATC")
GAAA = dna_seq.count("GAAA")
TCTG = dna_seq.count("TCTG")

name = 0
if argv[1] == "databases/small.csv":
    with open(argv[1], 'r') as csv_file:
        reader = csv.DictReader(csv_file)
        for row in reader:
            dna_db = row['name'], row['AGATC'], row['AATG'], row['TATC']
            dna_db = list(dna_db)
            seq = [AGATC, AATG, TATC]
            seq = [str(x) for x in seq]
            if dna_db[1:4] == seq:
                name = dna_db[:1]
                break
            else:
                name = "No match"
elif argv[1] == "databases/large.csv":
    with open(argv[1], 'r') as csv_file:
        reader = csv.DictReader(csv_file)
        for row in reader:
            dna_db = (row['name'], row['AGATC'], row['TTTTTTCT'], row['AATG'], row['TCTAG'],
                      row['GATA'], row['TATC'], row['GAAA'], row['TCTG'])
            dna_db = list(dna_db)
            seq = [AGATC, TTTTTTCT, AATG, TCTAG, GATA, TATC, GAAA, TCTG]
            seq = [str(x) for x in seq]
            if dna_db[1:9] == seq:
                name = dna_db[:1]
                break
            else:
                name = "No match"
print(name)
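One thing worth checking: str.count tallies every occurrence of an STR anywhere in the sequence, while the pset compares profiles against the longest run of consecutive repeats, which may be why your numbers match on the small database but not on the large one. A minimal sketch of the difference (the longest_run helper here is only illustrative):

# total occurrences vs. longest consecutive run
seq = "AATGAATGAATGTTAATG"

def longest_run(sequence, str_):
    """Length of the longest run of back-to-back repeats of str_."""
    best = run = i = 0
    while i < len(sequence):
        if sequence.startswith(str_, i):
            run += 1
            i += len(str_)
        else:
            best = max(best, run)
            run = 0
            i += 1
    return max(best, run)

print(seq.count("AATG"))         # 4 -> every occurrence
print(longest_run(seq, "AATG"))  # 3 -> longest consecutive run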
So, toward the end of my first file (we'll call it /file.py):
def get_excel_data(self):
    """Places excel data into pandas dataframe"""
    # excel_data = pandas.read_excel(self.find_file())
    for extracted_archive in self.find_file():
        excel_data = pandas.read_excel(extracted_archive)
        # print(excel_data)
        columns = pandas.DataFrame(columns=excel_data.columns.tolist())
        excel_data = pandas.concat([excel_data, columns])
        excel_data.columns = excel_data.columns.str.strip()
        excel_data.columns = excel_data.columns.str.replace("/", "_")
        excel_data.columns = excel_data.columns.str.replace(" ", "_")
        total_records = 0
        num_valid_records = 0
        num_invalid_records = 0
        for row in excel_data.itertuples():
            mrn = row.MRN
            total_records += 1
            if mrn in ("", " ", "N/A", "NaT", "NaN", None) or math.isnan(mrn):
                # print(f"Invalid record: {row}")
                num_invalid_records += 1
                # total_invalid = num_invalid_records + dup_count
                excel_data = excel_data.drop(excel_data.index[row.Index])
                # continue
            else:
                # print(mrn)  # outputs all MRN ids
                for row in excel_data.itertuples():
                    num_valid_records += 1
                continue
        with open("./logs/metrics.csv", "a", newline="\n") as f:
            csv_writer = DictWriter(f, ['date', 'total_records', 'processed', 'skipped', 'success_rate'])
            # csv_writer.writeheader()
            currentDT = datetime.datetime.now()
            success_rate = num_valid_records / total_records * 100
            csv_writer.writerow(dict(date=currentDT,
                                     total_records=total_records,
                                     processed=num_valid_records,
                                     skipped=num_invalid_records,
                                     success_rate=num_valid_records / total_records * 100))
    return self.clean_data_frame(excel_data)
def clean_data_frame(self, data_frame):
    """Cleans up dataframes"""
    for col in data_frame.columns:
        if "date" in col.lower():
            data_frame[col] = pandas.to_datetime(data_frame[col],
                                                 errors='coerce', infer_datetime_format=True)
            data_frame[col] = data_frame[col].dt.date
    data_frame['MRN'] = data_frame['MRN'].astype(int).astype(str)
    return data_frame
def get_mapping_data(self):
    map_data = pandas.read_excel(config.MAPPING_DOC, sheet_name='main')
    columns = pandas.DataFrame(columns=map_data.columns.tolist())
    return pandas.concat([map_data, columns])
In my second file I would like to pick up that end state and, for instance, do another iteration over the dataframe... second_file.py:
def process_records(self, records, map_data, completed=None, errors=None):
    """Code to execute after webdriver initialization."""
    series_not_null = False
    try:
        num_attempt = 0
        for record in data_frame.itertuples():  # not working
            print(record)
            series_not_null = True
            mrn = record.MRN
            self.navigate_to_search(num_attempt)
            self.navigate_to_member(mrn)
            self.navigate_to_assessment()
            self.add_assessment(record, map_data)
            self.driver.switch_to.parent_frame()  # not working
            sleep(.5)
            error_flag = self.close_member_tab(self.driver, mrn, error_flag)
    except Exception as exc:
        if series_not_null:
            errors = self.process_series_error(exc)
    return completed, error
Both files have import pandas.
You can save your dataframe in a pickle file like this. It is also worth noting that you can store almost anything in a pickle file. Here is a link to some info: pickle info
import pandas as pd
import pickle
x = pd.DataFrame({'a':[1,2,3],'b':[4,5,6],'c':[7,8,9]})
#this will create a file called pickledata.p that will store the data frame
with open('pickledata.p', 'wb') as fh:  # notice that you need the 'wb' for the dump
    pickle.dump(x, fh)
#to load the file do this
with open('pickledata.p', 'rb') as fh:  # you need to use 'rb' to read
    df = pickle.load(fh)
#you can now use df like a normal dataframe
print(df)
You don't actually need the '.p' extension for a pickle file, I just like it.
So you save your dataframe at the end of script one, and then load it in at the start of script two.
Use DataFrame.to_pickle and pandas.read_pickle:
To persist
df.to_pickle('./dataframe.pkl')
To load
df = pd.read_pickle('./dataframe.pkl')
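For instance, a minimal sketch of how this could tie the two scripts together (the variable names excel_data and data_frame are taken from the question; the path is illustrative):

# at the end of file.py, once the dataframe is built
excel_data.to_pickle('./dataframe.pkl')

# at the start of second_file.py, before iterating
import pandas as pd
data_frame = pd.read_pickle('./dataframe.pkl')

read_pickle gives you the dataframe back exactly as it was saved, so the itertuples() loop in process_records can run over it unchanged.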
I want to write the values fetched from a URL to a CSV file; some of them are float values. The code below fails with a "float found" error.
import urllib2
import json
import csv
url = 'https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime=2016-10-01&endtime=2016-10-02'
i=0
csvfile = csv.writer(open('earthquakedet.csv', 'w'))
csvfile.writerow(["Latitude", "Longitude ","Title","Place","Mag"])
json_string = urllib2.urlopen(url).read()
j = json.loads(json_string)
names = [d['properties'] for d in j['features']]
names1 = [d['geometry'] for d in j['features']]
while i <= len(names):
    print names[i]['title']
    print names[i]['place']
    print names[i]['mag']
    print names1[i]['coordinates'][0]
    print names1[i]['coordinates'][1]
    i = i + 1
    finalstr = float(names1[i]['coordinates'][0]) + float(names1[i]['coordinates'][1]) + names[i]['title'] + names[i]['place'] + names[i]['mag']
    csvfile.writerow(finalstr)
csvfile.close()
writerow takes a list of values to put on the row, not a string. So, instead of concatenating the values yourself, just put them in a list to pass to writerow:
# ...
i = i + 1
csvfile.writerow([names1[i]['coordinates'][0], names1[i]['coordinates'][1], names[i]['title'], names[i]['place'], names[i]['mag']])
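As an aside, iterating over the two lists directly avoids the manual index bookkeeping (and the IndexError that while i <= len(names) will eventually hit); a sketch that keeps the names and names1 lists from the question:

for props, geom in zip(names, names1):
    # GeoJSON coordinates are [longitude, latitude], so index 1 is the latitude
    csvfile.writerow([geom['coordinates'][1],
                      geom['coordinates'][0],
                      props['title'],
                      props['place'],
                      props['mag']])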
I have this input in a file.csv
"","min","max","rainfall","days_clear"
"Missouri",-2,10,300,23
"Amsterdam",-3,5,1212,34
"LA",10,20,1000,54
I wanted to write a simple program to find the city with the lowest rainfall, which is Missouri in this case. How can I do that using Python's csv reader?
I can try to extract the items, but unfortunately the first row of the file has to be there.
I wanted to have something like count['Missouri'] = 300, count['Amsterdam'] = 1212, etc., so that I can take the minimum and reference back to print the city.
Please advise. Thanks.
import csv
def main():
    with open('file.csv', 'rb') as inf:
        data = [(int(row['rainfall']), row['']) for row in csv.DictReader(inf)]
    data.sort()
    print data[0]

if __name__ == "__main__":
    main()
returns
(300, 'Missouri')
One way to do this would be to use the csv module's DictReader class to write a function to extract the column of data. DictReader will take care of handling the first row of field names automatically. The built-in min() function can then be used to determine the item with the smallest value in the column.
import csv
def csv_extract_col(csvinput, colname, key):
    """ extract a named column from a csv stream into a dictionary
        colname: name of column to extract
        key: name of another column to use as keys in returned dict
    """
    col = {}
    for row in csv.DictReader(csvinput):
        col[row[key]] = row[colname]
    return col

if __name__ == '__main__':
    import StringIO
    csvdata = """\
"","min","max","rainfall","days_clear" # field name row
"Missouri",-2,10,300,23
"Amsterdam",-3,5,1212,34
"LA",10,20,1000,54
"""
    csvfile = StringIO.StringIO(csvdata)
    rainfall = csv_extract_col(csvfile, 'rainfall', '')
    print rainfall
    # {'Amsterdam': '1212', 'LA': '1000', 'Missouri': '300'}
    print min(rainfall.iteritems(), key=lambda r: float(r[1]))
    # ('Missouri', '300')
import StringIO
import csv
example = """"","min","max","rainfall","days_clear"
"Missouri",-2,10,300,23
"Amsterdam",-3,5,1212,34
"LA",10,20,1000,54
"""
data_in = StringIO.StringIO(example)
#data_in = open('mycsvdata.csv')
def read_data(data_in):
    reader = csv.reader(data_in)
    cols = []
    results = {}
    for row in reader:
        if not cols:
            cols = row
            continue
        row = [int(x) if x.lstrip('-').isdigit() else x for x in row]
        results[row[0]] = dict(zip(cols[1:], row[1:]))
    return results
data = read_data(data_in)
min(data.items(),key=lambda x: x[1].get('rainfall'))
Returns
('Missouri', {'max': 10, 'days_clear': 23, 'rainfall': 300, 'min': -2})
To read from a file, you need to remove all code that deals with a string:
reader = csv.reader(open('file.csv', 'rb'))
rainfall = csv_extract_col(reader, 'rainfall', '')
Update: Sorry, it needs a bit more work than that. The first arg of csv_extract_col will be used as the first arg of csv.DictReader, so (in this case) it should be an open file object, and should never be a csv.reader instance. See below:
import csv

### def csv_extract_col(csvinput, colname, key):
###     exactly as provided by @martineau

if __name__ == '__main__':
    import sys
    filename, data_col_name, key_col_name = sys.argv[1:4]
    input_file_object = open(filename, 'rb')
    result_dict = csv_extract_col(input_file_object, data_col_name, key_col_name)
    print result_dict
    print min(result_dict.iteritems(), key=lambda r: float(r[1]))
Results:
command-prompt>\python27\python joj_csv.py joj.csv rainfall ""
{'Amsterdam': '1212', 'LA': '1000', 'Missouri': '300'}
('Missouri', '300')
command-prompt>\python27\python joj_csv.py joj.csv days_clear ""
{'Amsterdam': '34', 'LA': '54', 'Missouri': '23'}
('Missouri', '23')
Update 2, in response to the comment: "there must be something i missed out.. i tried.. [what looks like @martineau's function] with the above main function you define. Then in my shell, i define python rainfall "". But it gives me KeyError: 'rainfall'"
Two possibilities:
(1) You made a mistake patching the pieces of source code together. Check your work.
(2) Your file doesn't have the expected heading row contents. Try some debugging, e.g. change @martineau's code so that you can insert a print statement etc. to show what csv.DictReader thinks about your heading row:
reader = csv.DictReader(csvinput)
print "fieldnames", reader.fieldnames
assert colname in reader.fieldnames
assert key in reader.fieldnames
for row in reader:
If you are still stuck, show us ALL of your code plus the full traceback and error message -- either edit your question or put it up on pastebin or dropbox; DON'T put it into a comment!!
My code for cases in which there are several cities having the same minimum or several cities having the same maximum:
import csv
def minmax_col(filename, key, colname):
    with open(filename, 'rb') as csvfile:
        rid = csv.DictReader(csvfile,
                             fieldnames=None,
                             quoting=csv.QUOTE_NONNUMERIC)
        mini = float('inf')
        maxi = float('-inf')
        limin = []
        limax = []
        for row in rid:
            if row[colname] == maxi:
                limax.append(row[key])
            elif row[colname] > maxi:
                maxi = row[colname]
                limax = [row[key]]
            if row[colname] == mini:
                limin.append(row[key])
            elif row[colname] < mini:
                mini = row[colname]
                limin = [row[key]]
    return (key, (maxi, limax), (mini, limin))

key = 'rainfall'
city, (Ma, liMa), (mi, limi) = minmax_col('filename.csv', '', key)
print 'Cities analysed on ' + repr(key) + ' parameter :'
print 'maximum==', Ma, ' cities :', ', '.join(liMa)
print 'minimum==', mi, ' cities :', ', '.join(limi)
print

key = 'min'
city, (Ma, liMa), (mi, limi) = minmax_col('filename.csv', '', key)
print 'Cities analysed on ' + repr(key) + ' parameter :'
print 'maximum==', Ma, ' cities :', ', '.join(liMa)
print 'minimum==', mi, ' cities :', ', '.join(limi)
On a file like this:
"","min","max","rainfall","days_clear"
"Missouri",-2,10,300,23
"Amsterdam",-3,5,1212,34
"Oslo",-2,8,800,12
"LA",10,20,1000,54
"Kologoro",28,45,1212,1
the result is
Cities analysed on 'rainfall' parameter :
maximum== 1212.0 cities : Amsterdam, Kologoro
minimum== 300.0 cities : Missouri
Cities analysed on 'min' parameter :
maximum== 28.0 cities : Kologoro
minimum== -3.0 cities : Amsterdam