Adding a counter as index of the Dataframe - python

The code below takes a folder with xml files and parses them into a single csv file.
It does the job really well.
from xml.etree import ElementTree as ET
from collections import defaultdict
import csv
from pathlib import Path

directory = 'C:/Users/docs/FolderwithXMLs'

with open('output.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    headers = ['id', 'service_code', 'rational', 'qualify', 'description_num', 'description_txt', 'set_data_xin', 'set_data_xax', 'set_data_value', 'set_data_x']
    writer.writerow(headers)
    xml_files_list = list(map(str, Path(directory).glob('**/*.xml')))
    for xml_file in xml_files_list:
        tree = ET.parse(xml_file)
        root = tree.getroot()
        start_nodes = root.findall('.//START')
        for sn in start_nodes:
            row = defaultdict(str)
            for k, v in sn.attrib.items():
                row[k] = v
            for rn in sn.findall('.//Rational'):
                row['rational'] = rn.text
            for qu in sn.findall('.//Qualify'):
                row['qualify'] = qu.text
            for ds in sn.findall('.//Description'):
                row['description_txt'] = ds.text
                row['description_num'] = ds.attrib['num']
            for st in sn.findall('.//SetData'):
                for k, v in st.attrib.items():
                    row['set_data_' + str(k)] = v
                row_data = [row[i] for i in headers]
                writer.writerow(row_data)
                row = defaultdict(str)
The output looks like this:
I have been trying to add a counter of how many set_data_value rows there are for each specific ID.
The output should look like this:
If necessary I can provide the XML data as well. I am sorry that someone has to edit the question to show the pictures instead of just hyperlinks.
I have checked other posts here but I wasn't able to implement them into this code.

Without seeing the XML it is a bit of a guess, but if you add "Counter" to headers and then add enumerate to the last for loop, it may work:
for counter, st in enumerate(sn.findall('.//SetData')):
    for k, v in st.attrib.items():
        row['set_data_' + str(k)] = v
    row["Counter"] = counter
    row_data = [row[i] for i in headers]
    writer.writerow(row_data)
    row = defaultdict(str)
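If the counter should start at 1 instead of 0, note that enumerate accepts a start argument. A minimal sketch of the same loop with that change:

for counter, st in enumerate(sn.findall('.//SetData'), start=1):
    for k, v in st.attrib.items():
        row['set_data_' + str(k)] = v
    row["Counter"] = counter  # now counts 1, 2, 3, ... per START node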

Related

How to build specific format with open()?

Here's my code:
import glob
import itertools
import sys, os
import six
import csv
import numpy as np
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1

os.chdir("PATH/pdf")
extension = 'pdf'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

valeur = []
n = 1
for i in all_filenames:
    fp = open(i, "rb")
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    fields = resolve1(doc.catalog["AcroForm"])["Fields"]
    for i in fields:
        field = resolve1(i)
        name, value = field.get("T"), field.get("V")
        filehehe = "{0}:{1}".format(name, value)
        values = resolve1(value)
        names = resolve1(name)
        valeur.append(values)
    n = n + 1
with open('test.csv', 'wb') as f:
    for i in valeur:
        f.write(i)
The goal here is to pick up some information from the PDFs. Here's the output:
As you can see, the format is not pretty. I'm not very familiar with open(), so I'm kind of stuck.
I would like to have a distinct row for each PDF, with each piece of information in its own cell. Something like this:
Try storing the data from each PDF file in a separate list, and add that list to the valeur list which you already have.
Use the csv module, as @martineau rightly suggested.
You can try the code below:
import csv

valeur = []
# your code
n = 1
for i in all_filenames:
    temp_list = []
    fp = open(i, "rb")
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    fields = resolve1(doc.catalog["AcroForm"])["Fields"]
    for i in fields:
        field = resolve1(i)
        name, value = field.get("T"), field.get("V")
        filehehe = "{0}:{1}".format(name, value)
        values = resolve1(value)
        names = resolve1(name)
        temp_list.append(values)
    n = n + 1
    valeur.append(temp_list)

# Finally, when you have the required data, you can write to the csv file like this.
with open('mycsv.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    for val in valeur:
        wr.writerow(val)
With this, the output would look like this:
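If you also want a header row, one option is to collect the field names alongside the values and write them before the data rows. A minimal sketch, assuming every PDF form exposes the same fields in the same order (fields, resolve1, and valeur are the variables from the code above):

# Field names taken from the most recently parsed PDF;
# assumes all the PDFs share the same form fields.
fieldnames = [resolve1(resolve1(i).get("T")) for i in fields]

with open('mycsv.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(fieldnames)  # header row first
    for val in valeur:
        wr.writerow(val)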

Plotly takes too long to render an output

I need some help please :)
Hi! I have a CSV file of a dataframe in which I have a longitude column, a latitude column, and sales.
I would love to visualize my data geographically.
First I tried to do a scatter plot directly from the CSV and wrote the following:
px.set_mapbox_access_token("my token on mapbox")
fig = px.scatter_mapbox(df, lat="latitude", lon="longitude", color="sales",
                        color_continuous_scale=px.colors.cyclical.IceFire, size_max=20, zoom=12)
fig.show()
fig.write_html("example_map.html")
It never opened.
Then I tried it without Mapbox and converted the CSV into a JSON file:
import csv
import json
from collections import OrderedDict

li = []
with open("Path to my file") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        d = OrderedDict()
        d['type'] = 'zipCode'
        d['geometry'] = {
            'type': 'Point',
            'coordinates': [float(row['latitude']), float(row['longitude'])]
        }
        li.append(d)

d = OrderedDict()
d['type'] = 'FeatureCollection'
d['features'] = li
with open('output.json', 'w') as f:
    json.dump(d, f, indent=2)
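One thing worth checking here, independent of the speed problem: the GeoJSON spec stores Point coordinates as [longitude, latitude], not [latitude, longitude], so the geometry above is most likely reversed. A minimal sketch of the corrected assignment:

d['geometry'] = {
    'type': 'Point',
    'coordinates': [float(row['longitude']), float(row['latitude'])]  # GeoJSON order: [lon, lat]
}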
Next I tried to plot it, and again it took forever to show any output. Did I do something wrong in the code, or should I think about an alternative to Plotly?

How can I write just one float item to a csv file?

In this question, I first read the scores from a csv file and then, in the code below, saved an item in the lw list.
I want to write the lw list to a csv file. How can I do this?
I read the scores from a csv file called alaki.csv:
mandana,5,7,3,15
hamid,3,9,4,20,9,1,8,16,0,5,2,4,7,2,1
sina,19,10,19,6,8,14,3
sara,0,5,20,14
soheila,13,2,5,1,3,10,12,4,13,17,7,7
ali,1,9
sarvin,0,16,16,13,19,2,17,8
import csv
# For the average
from statistics import mean
import operator
from collections import Counter

def calculate_average_of_averages(input_file_name, output_file_name):
    # output_file_name = chert.csv
    with open(input_file_name) as d:
        se = csv.reader(d)
        l = {}
        for ldf in se:
            name = ldf[0]
            lsd = mean([float(sd) for sd in ldf[1:]])
            l[name] = lsd
    with open(output_file_name, 'w') as ff:
        fd = csv.writer(ff)
        a = list(l.values())
        lw = []
        m = mean(a)
        lw.append(m)

calculate_average_of_averages('alaki.csv', 'chert.csv')
The output I want in the csv file:
8.401530612244898
Please help me.
How about this:
import csv
# For the average
from statistics import mean
import operator
from collections import Counter

def calculate_average_of_averages(input_file_name, output_file_name):
    # output_file_name = chert.csv
    with open(input_file_name) as d:
        se = csv.reader(d)
        l = {}
        for ldf in se:
            name = ldf[0]
            lsd = mean([float(sd) for sd in ldf[1:]])
            l[name] = lsd
    m = mean(list(l.values()))
    l["average_of_average"] = m
    with open(output_file_name, 'w') as ff:
        for name, value in l.items():
            ff.write("{},{}\n".format(name, value))

calculate_average_of_averages('alaki.csv', 'chert.csv')
The output looks like:
mandana,7.5
hamid,6.066666666666666
sina,11.285714285714286
sara,9.75
soheila,7.833333333333333
ali,5.0
sarvin,11.375
average_of_average,8.401530612244898
To output just average_of_average, replace the write block (the value is a float, so it has to be converted to a string before writing):

with open(output_file_name, 'w') as ff:
    ff.write("{}\n".format(l['average_of_average']))
You can use the pandas library by adding these two lines (the import and the final to_csv call):

import csv
import pandas as pd
# For the average
from statistics import mean
import operator
from collections import Counter

def calculate_average_of_averages(input_file_name, output_file_name):
    with open(input_file_name) as d:
        se = csv.reader(d)
        l = {}
        for ldf in se:
            name = ldf[0]
            lsd = mean([float(sd) for sd in ldf[1:]])
            l[name] = lsd
    a = list(l.values())
    lw = []
    m = mean(a)
    lw.append(m)
    # output_file_name already includes the .csv extension, so use it directly
    pd.DataFrame(lw, columns=["yourColumn"]).to_csv(output_file_name)

calculate_average_of_averages('alaki.csv', 'chert.csv')
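Note that by default to_csv also writes the dataframe's index as an extra first column; if you want only the value itself, index=False suppresses it. A minimal sketch:

# Write only the value column, without the pandas index column.
pd.DataFrame(lw, columns=["yourColumn"]).to_csv(output_file_name, index=False)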
I am not sure if CSV writer is necessary to write just one line.
import csv
from statistics import mean

def calculate_mean_of_means(input_file, output_file):
    with open(input_file, newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        ls = {}
        for row in csvreader:
            str_to_int = [int(i) for i in row[1:]]
            ls[row[0]] = str_to_int
    total_means = 0
    for score in ls.values():
        total_means += mean(score)
    mean_of_means = [total_means / len(ls)]
    with open(output_file, 'w', newline='') as csvfile:
        meanwriter = csv.writer(csvfile)
        meanwriter.writerow(mean_of_means)

calculate_mean_of_means('alaki.csv', 'chert.csv')

Progress bar while parsing files

The code below goes to a directory that has XML files, takes them, and parses them into a single csv file.
from xml.etree import ElementTree as ET
from collections import defaultdict
import csv
from pathlib import Path

directory = 'C:/Users/xml_files'

with open('try.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter=';')
    headers = ['identify', 'id', 'service_code', 'rational', 'qualify', 'description_num', 'description_txt', 'Counter', 'set_data_xin', 'set_data_xax', 'set_data_value', 'set_data_x']
    writer.writerow(headers)
    xml_files_list = list(map(str, Path(directory).glob('**/*.xml')))
    for xml_file in xml_files_list:
        tree = ET.parse(xml_file)
        root = tree.getroot()
        p_get = tree.find('.//Phones/Get').text
        p_set = tree.find('.//Phones/Set').text
        start_nodes = root.findall('.//START')
        for sn in start_nodes:
            row = defaultdict(str)
            for k, v in sn.attrib.items():
                row[k] = v
            for rn in sn.findall('.//Rational'):
                row['Rational'] = rn.text
            for qu in sn.findall('.//Qualify'):
                row['Qualify'] = qu.text
            for ds in sn.findall('.//Description'):
                row['Description_txt'] = ds.text
                row['Description_text_id'] = ds.attrib['text_id']
            for counter, st in enumerate(sn.findall('.//SetData')):
                for k, v in st.attrib.items():
                    if v.startswith("-"):
                        v = v.replace("-", "", 1)
                    v = v.replace(',', '.')
                    row['SetData_' + str(k)] = v
                row["Counter"] = counter
                row_data = [row[i] for i in headers]
                row_data[0] = p_get + '_' + p_set
                writer.writerow(row_data)
                row = defaultdict(str)
With more data, it is really hard to just sit there and not know how far along the parsing is.
So I went looking for a way to show a progress bar and ended up finding the following:
import tqdm
import time

for i in tqdm.tqdm(range(1000)):
    time.sleep(0.01)
    # or other long operations
I am having problems implementing this into my code and finding the range, which preferably would be the number of XML files in that directory.
This library, tqdm, seemed like the easiest one to implement.
You could use
for xml_file in tqdm.tqdm(xml_files_list):
It should automatically use len(xml_files_list) as the total, and it will yield each xml_file in turn.
And you don't need sleep(); it was used in the documentation only to slow the loop down for the example.
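Put together with the parsing loop above, a minimal sketch (reusing ET and xml_files_list from the question's code; desc is optional and only labels the bar):

import tqdm

for xml_file in tqdm.tqdm(xml_files_list, desc="Parsing XML files"):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    # ... rest of the parsing loop unchanged ...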

Merging two CSV files by a common column python

I am trying to merge two csv files with a common id column and write the merge to a new file. I have tried the following but it is giving me an error -
import csv
from collections import OrderedDict

filenames = "stops.csv", "stops2.csv"
data = OrderedDict()
fieldnames = []
for filename in filenames:
    with open(filename, "rb") as fp:  # python 2
        reader = csv.DictReader(fp)
        fieldnames.extend(reader.fieldnames)
        for row in reader:
            data.setdefault(row["stop_id"], {}).update(row)

fieldnames = list(OrderedDict.fromkeys(fieldnames))
with open("merged.csv", "wb") as fp:
    writer = csv.writer(fp)
    writer.writerow(fieldnames)
    for row in data.itervalues():
        writer.writerow([row.get(field, '') for field in fieldnames])
Both files have the "stop_id" column but I'm getting this error back -
KeyError: 'stop_id'
Any help would be much appreciated.
Thanks
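A guess at the KeyError, since both files apparently do contain a stop_id column: a common cause is a UTF-8 byte-order mark at the start of the file, which makes the first header come back as '\ufeffstop_id' instead of 'stop_id'. In Python 3 the utf-8-sig codec strips it; a minimal sketch:

import csv

# encoding='utf-8-sig' removes a leading BOM, if present,
# so DictReader sees a clean 'stop_id' header.
with open("stops.csv", newline='', encoding='utf-8-sig') as fp:
    reader = csv.DictReader(fp)
    for row in reader:
        print(row["stop_id"])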
Here is an example using pandas
import sys
from StringIO import StringIO  # python 2
import pandas as pd

TESTDATA = StringIO("""DOB;First;Last
2016-07-26;John;smith
2016-07-27;Mathew;George
2016-07-28;Aryan;Singh
2016-07-29;Ella;Gayau
""")
list1 = pd.read_csv(TESTDATA, sep=";")

TESTDATA = StringIO("""Date of Birth;Patient First Name;Patient Last Name
2016-07-26;John;smith
2016-07-27;Mathew;XXX
2016-07-28;Aryan;Singh
2016-07-20;Ella;Gayau
""")
list2 = pd.read_csv(TESTDATA, sep=";")

print list2
print list1

common = pd.merge(list1, list2, how='left', left_on=['Last', 'First', 'DOB'], right_on=['Patient Last Name', 'Patient First Name', 'Date of Birth']).dropna()
print common
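Adapted to the question itself, where both files share a stop_id column, a minimal pandas sketch (Python 3; file names as in the question):

import pandas as pd

stops1 = pd.read_csv("stops.csv")
stops2 = pd.read_csv("stops2.csv")

# An outer merge keeps stop_ids that appear in only one of the two files.
merged = pd.merge(stops1, stops2, on="stop_id", how="outer")
merged.to_csv("merged.csv", index=False)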
Thanks Shijo.
This is what worked for me afterwards - merged by the first column in each csv:
import csv
from collections import OrderedDict

with open('stops.csv', 'rb') as f:  # python 2
    r = csv.reader(f)
    dict2 = {row[0]: row[1:] for row in r}

with open('stops2.csv', 'rb') as f:
    r = csv.reader(f)
    dict1 = OrderedDict((row[0], row[1:]) for row in r)

result = OrderedDict()
for d in (dict1, dict2):
    for key, value in d.iteritems():
        result.setdefault(key, []).extend(value)

with open('ab_combined.csv', 'wb') as f:
    w = csv.writer(f)
    for key, value in result.iteritems():
        w.writerow([key] + value)
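For reference, a minimal Python 3 equivalent of the same approach (text-mode files and items() instead of iteritems()):

import csv
from collections import OrderedDict

# Read each file into an ordered mapping: first column -> remaining columns.
with open('stops.csv', newline='') as f:
    dict2 = OrderedDict((row[0], row[1:]) for row in csv.reader(f))
with open('stops2.csv', newline='') as f:
    dict1 = OrderedDict((row[0], row[1:]) for row in csv.reader(f))

# Combine rows that share a key.
result = OrderedDict()
for d in (dict1, dict2):
    for key, value in d.items():
        result.setdefault(key, []).extend(value)

with open('ab_combined.csv', 'w', newline='') as f:
    w = csv.writer(f)
    for key, value in result.items():
        w.writerow([key] + value)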
