Python CSV multivalued columns to multiple rows - python

I am trying to convert nested JSON to CSV using python.
There can be multiple values for some attributes:
like phone1, phone2,phone3 for a single individual.
I wrote a python code for this and its doing the job perfectly.
But I am getting the values of multiple attributes in multiple columns but I want them in multiple rows,
like for example:
my output:
name phone1 phone2 phone3
xyx 98 34 56
required output:
name phone
xyx 98
xyx 34
xyx 56
The code for this is:
import sys
import json
import csv
import io
import pandas as pd
##
# Convert to string keeping encoding in mind...
##
processed_data = []
def to_string(s):
try:
return str(s)
except:
# Change the encoding type if needed
return s.encode('utf-8')
def reduce_item(key, value):
global reduced_item
global res
# Reduction Condition 1
if type(value) is list:
i = 0
for sub_item in value:
reduce_item(key + '_' + to_string(i), sub_item)
i = i + 1
# Reduction Condition 2
elif type(value) is dict:
sub_keys = value.keys()
for sub_key in sub_keys:
reduce_item(key+'_'+to_string(sub_key), value[sub_key])
# Base Condition
else:
#if reduced_item.get(to_string(key)):
#reduced_item[to_string(key)] = to_string(value)
#processed_data.append(reduced_item)
#else:
reduced_item[to_string(key)]=to_string(value)
if __name__ == "__main__":
if len(sys.argv) != 4:
print("\nUsage: python json_to_csv.py <node> <json_in_file_path> <csv_out_file_path>\n")
else:
# Reading arguments
node = sys.argv[1]
json_file_path = sys.argv[2]
csv_file_path = sys.argv[3]
with io.open(json_file_path, 'r', encoding='utf-8-sig') as fp:
json_value = fp.read()
raw_data = json.loads(json_value)
try:
data_to_be_processed = raw_data[node]
except:
data_to_be_processed = raw_data
header = []
for item in data_to_be_processed:
reduced_item={}
reduce_item(node, item)
header += reduced_item.keys()
processed_data.append(reduced_item)
header = list(set(header))
header.sort()
with open(csv_file_path, 'w+') as f:
writer = csv.DictWriter(f, header,quoting=csv.QUOTE_ALL)
writer.writeheader()
for row in processed_data:
writer.writerow(row)
print("Just completed writing csv file with %d columns" % len(header))
I tried changing the code but was not able to achieve the desired result.It would be of great help if anyone can suggest the changes for this code
Thanks in advance

Related

Same entry, although only found in one column

from sepa import parser
import re
import csv
import pandas as pd
import numpy as np
# Utility function to remove additional namespaces from the XML
def strip_namespace(xml):
return re.sub(' xmlns="[^"]+"', '', xml, count=1)
# Read file
with open('test.xml', 'r') as f:
input_data = f.read()
# Parse the bank statement XML to dictionary
camt_dict = parser.parse_string(parser.bank_to_customer_statement, bytes(strip_namespace(input_data), 'utf8'))
statements = pd.DataFrame.from_dict(camt_dict['statements'])
all_entries = []
for i, _ in statements.iterrows():
if 'entries' in camt_dict['statements'][i]:
df = pd.DataFrame()
dd = pd.DataFrame.from_records(camt_dict['statements'][i]['entries'])
dg = dd['entry_details']
df['Date'] = dd['value_date'].str['date']
df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%d-%m-%Y')
iban = camt_dict['statements'][i]['account']['id']['iban']
df['IBAN'] = iban
df['Currency'] = dd['amount'].str['currency']
# Sort Credit/Debit in separate Columns
df['Credit'] = np.where(dd['credit_debit_indicator'] == 'CRDT', dd['amount'].str['_value'], '')
df['Debit'] = np.where(dd['credit_debit_indicator'] == 'DBIT', dd['amount'].str['_value'], '')
# Get destination IBAN
getlength = len(dg.index) #2
for i in range(0, getlength):
result = str(dd['entry_details'][i])
print(result + "Resultat " + str(i))
search_for_iban = re.search("CH\d{2}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{1}|CH\d{19}", result)
if(search_for_iban is None):
print('the search is none')
df['Test'] = 'None'
else:
print('the search is a match')
df['Test'] = 'Yes'
all_entries.append(df)
df_entries = pd.concat(all_entries)
print(df_entries)
**My problem here is just with this code block **
for i in range(0, getlength):
result = str(dd['entry_details'][i])
search_for_iban = re.search("CH\d{2}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{1}|CH\d{19}", result)
if(search_for_iban is None):
df['Test'] = 'None'
else:
df['Test'] = search_for_iban.group()
all_entries.append(df)
I have already tried to solve various things via the index, this also counts cleanly high in the variable i and the getlength is also correct for 2 entries
What im expecting
If there is an IBAN number in the 'search_for_iban' (which is using regex lookup (re.search)) which is matching in 2nd row i want that iban just in 2nd row (dataframe) "Test" as follows:
what i expect
What im getting
I got double the entry in row 1 and 2 although none was found in row 1. What am i overlooking, my head is hurting! :D
what i got
i think i am making a thinking error here between normal for loop and panda entries
You can try:
for i in range(0, getlength):
.
.
.
else:
df.loc[i, 'Test'] = search_for_iban

CS50 PSET6 - DNA - Works fine on SMALL but not for LARGE database

I'm taking CS50 and got stuck on this pset6.
I made this code and it's working fine for 'small' given database.
On 'large' one i get wrong values in my DNA sequence.
Like, using debug50 i got that Albus sequence should be 15,49,38... and my seq is 21, 55, 64...
whats wrong? AND why it works fine on small database and not in large one?
Thanks for the help!
# Import ARGV and CSV library
from sys import argv, exit
import pandas as pd
import csv
# Check if argv has 3 arguments (program name, cvs file and dna sequence)
while True:
if len(argv) != 3:
print("Usage: python dna.py data.csv sequence.txt")
exit(1)
else:
break
with open(argv[2], 'r', encoding="UTF-8") as txt:
dna_seq = txt.read()
#Find the number of STR - AGATC,TTTTTTCT,AATG,TCTAG,GATA,TATC,GAAA,TCTG
AGATC = dna_seq.count("AGATC")
TTTTTTCT = dna_seq.count("TTTTTTCT")
AATG = dna_seq.count("AATG")
TCTAG = dna_seq.count("TCTAG")
GATA = dna_seq.count("GATA")
TATC = dna_seq.count("TATC")
GAAA = dna_seq.count("GAAA")
TCTG = dna_seq.count("TCTG")
name = 0
if argv[1] == "databases/small.csv":
with open(argv[1], 'r') as csv_file:
reader = csv.DictReader(csv_file)
for row in reader:
dna_db = row['name'], row['AGATC'], row['AATG'], row['TATC']
dna_db = list(dna_db)
seq = [AGATC, AATG, TATC]
seq = [str(x) for x in seq]
if dna_db[1:4] == seq:
name = dna_db[:1]
break
else:
name = "No match"
elif argv[1] == "databases/large.csv":
with open(argv[1], 'r') as csv_file:
reader = csv.DictReader(csv_file)
for row in reader:
dna_db = row['name'], row['AGATC'], row['TTTTTTCT'], row['AATG'], row['TCTAG'],
row['GATA'], row['TATC'], row['GAAA'], row['TCTG']
dna_db = list(dna_db)
seq = [AGATC,TTTTTTCT,AATG,TCTAG,GATA,TATC,GAAA,TCTG]
seq = [str(x) for x in seq]
if dna_db[1:9] == seq:
name = dna_db[:1]
break
else:
name = "No match"
print(name)

how to import state of pandas dataframe to second .py file

so, toward the end of my first file; we'll call /file.py.
def get_excel_data(self):
"""Places excel data into pandas dataframe"""
# excel_data = pandas.read_excel(self.find_file())
for extracted_archive in self.find_file():
excel_data = pandas.read_excel(extracted_archive)
# print(excel_data)
columns = pandas.DataFrame(columns=excel_data.columns.tolist())
excel_data = pandas.concat([excel_data, columns])
excel_data.columns = excel_data.columns.str.strip()
excel_data.columns = excel_data.columns.str.replace("/", "_")
excel_data.columns = excel_data.columns.str.replace(" ", "_")
total_records = 0
num_valid_records = 0
num_invalid_records = 0
for row in excel_data.itertuples():
mrn = row.MRN
total_records += 1
if mrn in ("", " ", "N/A", "NaT", "NaN", None) or math.isnan(mrn):
# print(f"Invalid record: {row}")
num_invalid_records += 1
# total_invalid = num_invalid_records + dup_count
excel_data = excel_data.drop(excel_data.index[row.Index])
# continue
else:
# print(mrn) # outputs all MRN ids
for row in excel_data.itertuples():
num_valid_records += 1
continue
with open("./logs/metrics.csv", "a", newline="\n") as f:
csv_writer = DictWriter(f, ['date', 'total_records', 'processed', 'skipped', 'success_rate'])
# csv_writer.writeheader()
currentDT = datetime.datetime.now()
success_rate = num_valid_records / total_records * 100
csv_writer.writerow(dict(date=currentDT,
total_records=total_records,
processed=num_valid_records,
skipped=num_invalid_records,
success_rate=num_valid_records / total_records * 100))
return self.clean_data_frame(excel_data)
def clean_data_frame(self, data_frame):
"""Cleans up dataframes"""
for col in data_frame.columns:
if "date" in col.lower():
data_frame[col] = pandas.to_datetime(data_frame[col],
errors='coerce', infer_datetime_format=True)
data_frame[col] = data_frame[col].dt.date
data_frame['MRN'] = data_frame['MRN'].astype(int).astype(str)
return data_frame
def get_mapping_data(self):
map_data = pandas.read_excel(config.MAPPING_DOC, sheet_name='main')
columns = pandas.DataFrame(columns=map_data.columns.tolist())
return pandas.concat([map_data, columns])
in my second file I would like to keep that end state; and do another iteration for instance.... second_file.py
def process_records(self, records, map_data, completed=None, errors=None):
"""Code to execute after webdriver initialization."""
series_not_null = False
try:
num_attempt = 0
for record in data_frame.itertuples(): # not working
print(record)
series_not_null = True
mrn = record.MRN
self.navigate_to_search(num_attempt)
self.navigate_to_member(mrn)
self.navigate_to_assessment()
self.add_assessment(record, map_data)
self.driver.switch_to.parent_frame() # not working
sleep(.5)
error_flag = self.close_member_tab(self.driver, mrn, error_flag)
except Exception as exc:
if series_not_null:
errors = self.process_series_error(exc)
return completed, error
both have import pandas
you can save your dataframe in a pickle file like this. it is also worth noting that you can store most anything in a pickle file. here is a link to some info here: pickle info
import pandas as pd
import pickle
x = pd.DataFrame({'a':[1,2,3],'b':[4,5,6],'c':[7,8,9]})
#this will create a file called pickledata.p that will store the data frame
with open('pickledata.p', 'wb') as fh: #notice that you need the 'wb' for the dump
pickle.dump(x, fh)
#to load the file do this
with open('pickledata.p', 'rb') as fh: #you need to use 'rb' to read
df = pickle.load(fh)
#you can now use df like a normal dataframe
print(df)
you dont actually need the '.p' extension for a pickle file, i just like it.
so you save your dataframe at the end of script one, and then load it in at the start of script 2.
Use Dataframe.to_pickle and pandas.read_pickle:
To persist
df.to_pickle('./dataframe.pkl')
To load
df = pd.read_pickle('./dataframe.pkl')

python csv write error while writing a float value as a column

I want to write the values fetched from url to csv file which has some float values too. The code below shows an error "float found."
import urllib2
import json
import csv
url = 'https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime=2016-10-01&endtime=2016-10-02'
i=0
csvfile = csv.writer(open('earthquakedet.csv', 'w'))
csvfile.writerow(["Latitude", "Longitude ","Title","Place","Mag"])
json_string = urllib2.urlopen(url).read()
j = json.loads(json_string)
names = [d['properties'] for d in j['features']]
names1 = [d['geometry'] for d in j['features']]
while i <= len(names):
print names[i]['title']
print names[i]['place']
print names[i]['mag']
print names1[i]['coordinates'][0]
print names1[i]['coordinates'][1]
i=i+1
finalstr=float(names1[i]['coordinates'][0]) + float(names1[i]['coordinates'][1]) + names[i]['title'] + names[i]['place'] + names[i]['mag']
csvfile.writerow(finalstr)
csvfile.close()
writerow takes a list of values to put on the row, not a string. So, instead of concatenating the values yourself, just put them in a list to pass to writerow:
# ...
i = i + 1
csvfile.writerow([names1[i]['coordinates'][0], names1[i]['coordinates'][1], names[i]['title'], names[i]['place'], names[i]['mag']])

How to extract column and row in csv using python

I have this input in a file.csv
"","min","max","rainfall","days_clear"
"Missouri",-2,10,300,23
"Amsterdam",-3,5,1212,34
"LA",10,20,1000,54
I wanted to write a simple program to find the city with the lowest rainfall which is Missouri in this case. How can I do that using Python csv reader?
I can try extract the items but unfortunately the first row of the file has to be there.
I wanted to have something like count[Missouri]=300
count[Amsterdam]=1212 etc.. so that I can do a minimum and reference back to print the city.
Please advise. Thanks.
import csv
def main():
with open('file.csv', 'rb') as inf:
data = [(int(row['rainfall']), row['']) for row in csv.DictReader(inf)]
data.sort()
print data[0]
if __name__=="__main__":
main()
returns
(300, 'Missouri')
One way to do this would be to use the csv module's DictReader class to write a function to extract the column of data. DictReader will take care of handling the first row of field names automatically. The built-in min() function can then be used to determine the item with the smallest value in the column.
import csv
def csv_extract_col(csvinput, colname, key):
""" extract a named column from a csv stream into a dictionary
colname: name of columm to extract
key: name of another columm to use as keys in returned dict
"""
col = {}
for row in csv.DictReader(csvinput):
col[row[key]] = row[colname]
return col
if __name__=='__main__':
import StringIO
csvdata = """\
"","min","max","rainfall","days_clear" # field name row
"Missouri",-2,10,300,23
"Amsterdam",-3,5,1212,34
"LA",10,20,1000,54
"""
csvfile = StringIO.StringIO(csvdata)
rainfall = csv_extract_col(csvfile, 'rainfall', '')
print rainfall
# {'Amsterdam': '1212', 'LA': '1000', 'Missouri': '300'}
print min(rainfall.iteritems(), key=lambda r: float(r[1]))
# ('Missouri', '300')
import StringIO
import csv
example = """"","min","max","rainfall","days_clear"
"Missouri",-2,10,300,23
"Amsterdam",-3,5,1212,34
"LA",10,20,1000,54
"""
data_in = StringIO.StringIO(example)
#data_in = open('mycsvdata.csv')
def read_data(data_in):
reader = csv.reader(data_in)
cols = []
results = {}
for row in reader:
if not cols:
cols = row
continue
row = [ int(x) if x.lstrip('-').isdigit() else x for x in row ]
results[row[0]] = dict(zip(cols[1:],row[1:]))
return results
data = read_data(data_in)
min(data.items(),key=lambda x: x[1].get('rainfall'))
Returns
('Missouri', {'max': 10, 'days_clear': 23, 'rainfall': 300, 'min': -2})
To read from a file, you need to remove all code that deals with a string:
reader = csv.reader(open('file.csv', 'rb'))
rainfall = csv_extract_col(reader, 'rainfall', '')
Update: Sorry, it neads a bit more work than that. The first arg of csv_extract_col will be used as the first arg of csv.DictReader so (in this case) it should be an open file object, and should never be a csv.reader instance. See below:
import csv
### def csv_extract_col(csvinput, colname, key):
### exactly as provided by #martineau
if __name__ == '__main__':
import sys
filename, data_col_name, key_col_name = sys.argv[1:4]
input_file_object = open(filename, 'rb')
result_dict = csv_extract_col(input_file_object, data_col_name, key_col_name)
print result_dict
print min(result_dict.iteritems(), key=lambda r: float(r[1]))
Results:
command-prompt>\python27\python joj_csv.py joj.csv rainfall ""
{'Amsterdam': '1212', 'LA': '1000', 'Missouri': '300'}
('Missouri', '300')
command-prompt>\python27\python joj_csv.py joj.csv days_clear ""
{'Amsterdam': '34', 'LA': '54', 'Missouri': '23'}
('Missouri', '23')
Update 2 in response to comment """there must be something i missed out.. i tried.. [what looks like #martineau's function] with the above main function you define. Then in my shell, i define python rainfall "". But it gives me KeyError: 'rainfall'"""
Two possibilities:
(1) You made a mistake patching the pieces of source code together. Check your work.
(2) Your file doesn't have the expected heading row contents. Try some debugging e.g. change #martineau's code so that you can insert a print statement etc. to show what the csv.DictReader thinks about your heading row:
reader = csv.DictReader(csvinput)
print "fieldnames", reader.fieldnames
assert colname in reader.fieldnames
assert key in reader.fieldnames
for row in reader:
If you are still stuck, show us ALL of your code plus the full traceback and error message -- either edit your question or put it up on pastbin or dropbox; DON'T put it into a comment!!
My code for cases in which there are several cities having the same minimum or several cities having the same maximum:
import csv
def minmax_col(filename,key,colname):
with open(filename,'rb') as csvfile:
rid = csv.DictReader(csvfile,
fieldnames=None,
quoting=csv.QUOTE_NONNUMERIC)
mini = float('inf')
maxi = float('-inf')
limin = limax =[]
for row in rid:
if row[colname] == maxi:
limax.append(row[key])
elif row[colname] > maxi:
maxi = row[colname]
limax = [row[key]]
if row[colname] == mini:
limin.append(row[key])
elif row[colname] < mini:
mini = row[colname]
limin = [row[key]]
return (key,(maxi,limax),(mini,limin))
key = 'rainfall'
city,(Ma,liMa),(mi,limi) = minmax_col('filename.csv','',key)
print 'Cities analysed on ' + repr(key) + ' parameter :'
print 'maximum==',Ma,' cities :',', '.join(liMa)
print 'minimum==',mi,' cities :',', '.join(limi)
print
key = 'min'
city,(Ma,liMa),(mi,limi) = minmax_col('filename.csv','',key)
print 'Cities analysed on ' + repr(key) + ' parameter :'
print 'maximum==',Ma,' cities :',', '.join(liMa)
print 'minimum==',mi,' cities :',', '.join(limi)
On a file like that:
"","min","max","rainfall","days_clear"
"Missouri",-2,10,300,23
"Amsterdam",-3,5,1212,34
"Oslo",-2,8,800,12
"LA",10,20,1000,54
"Kologoro",28,45,1212,1
the result is
Cities analysed according the 'rainfall' parameter :
maximum== 1212.0 cities : Amsterdam, Kologoro
minimum== 300.0 cities : Missouri
Cities analysed according the 'min' parameter :
maximum== 28.0 cities : Kologoro
minimum== -3.0 cities : Amsterdam

Categories