I'm a complete beginner in Python, so I've had to rely on several tutorials to put this code together. It does produce a .csv file, but it turns out empty (0kb). I've found others with this question had forgotten to close the file, but that doesn't seem to be the problem here. I'm grateful for any hints.
This is the xml if that helps: https://api.nextbike.net/maps/nextbike-live.xml?city=14
import xml.etree.ElementTree as ET
import csv

tree = ET.parse('nextbike-live.xml')
root = tree.getroot()

with open('Bike_pos.csv', 'w') as Bike_pos:
    csvwriter = csv.writer(Bike_pos)
    bike_head = []
    count = 0
    for member in root.findall('place'):
        bike = []
        if count == 0:
            station = member.find('name').tag
            bike_head.append(station)
            lat = member.find('lat').tag
            bike_head.append(lat)
            lon = member.find('lng').tag
            bike_head.append(lon)
            bikeno = member.find('bike_numbers').tag
            bike_head.append(bikeno)
            count = count + 1
        station = member.find('name').text
        bike.append(station)
        lat = member.find('lat').text
        bike.append(lat)
        lon = member.find('lng').text
        bike.append(lon)
        bikeno = member.find('bike_numbers').text
        csvwriter.writerow(bike)

Bike_pos.close()
I got help from a good friend. My XML source file had several nested children that my code wasn't searching.
He gave me this code that worked like a charm and is a lot simpler than what I had:
import xml.etree.ElementTree as ET
import csv

tree = ET.parse('nextbike-live-test.xml')
root = tree.getroot()

with open('Bike_pos.csv', 'w') as Bike_pos:
    csvwriter = csv.writer(Bike_pos)
    # CSV header
    csvwriter.writerow(['Station', 'lat', 'lng', 'Bikes'])
    # Add info about each station
    for country in root.findall('country'):
        for city in country.findall('city'):
            for place in city.findall('place'):
                bike = []
                bike.append(place.get('name'))
                bike.append(place.get('lat'))
                bike.append(place.get('lng'))
                bike.append(place.get('bike_numbers'))
                csvwriter.writerow(bike)
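If you would rather not spell out every nesting level, the same idea can be written with ElementTree's descendant search. This is just a sketch, assuming the same file and attribute names as above:

import xml.etree.ElementTree as ET
import csv

tree = ET.parse('nextbike-live-test.xml')
root = tree.getroot()

with open('Bike_pos.csv', 'w', newline='') as Bike_pos:
    csvwriter = csv.writer(Bike_pos)
    csvwriter.writerow(['Station', 'lat', 'lng', 'Bikes'])
    # './/place' matches every <place> element at any depth below the root
    for place in root.findall('.//place'):
        csvwriter.writerow([place.get('name'), place.get('lat'),
                            place.get('lng'), place.get('bike_numbers')])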
To make it simpler, you can also try it like this:
import requests
import csv
from lxml.html import fromstring

with open("Bike_Details.csv", "w", newline="") as infile:
    writer = csv.writer(infile)
    writer.writerow(["station", "lat", "lng", "bike_num"])

    res = requests.get("https://api.nextbike.net/maps/nextbike-live.xml?city=14")
    root = fromstring(res.content)
    for items in root.cssselect("country city place"):
        station = items.get("name")
        lat = items.get("lat")
        lng = items.get("lng")
        bike_num = items.get("bike_numbers")
        print(station, lat, lng, bike_num)
        writer.writerow([station, lat, lng, bike_num])
Here's my code:
import glob
import itertools
import sys, os
import six
import csv
import numpy as np
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1

os.chdir("PATH/pdf")
extension = 'pdf'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

valeur = []
n = 1
for i in all_filenames:
    fp = open(i, "rb")
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    fields = resolve1(doc.catalog["AcroForm"])["Fields"]
    for i in fields:
        field = resolve1(i)
        name, value = field.get("T"), field.get("V")
        filehehe = "{0}:{1}".format(name, value)
        values = resolve1(value)
        names = resolve1(name)
        valeur.append(values)
    n = n + 1

with open('test.csv', 'wb') as f:
    for i in valeur:
        f.write(i)
The goal here is to pick up some information from the PDFs. Here's the output:
As you can see, the format is not pretty. I'm not very familiar with open(), so I'm kind of stuck.
I would like to have a distinct row for each PDF, with each piece of information in its own cell. Something like that:
Try to store the data from each PDF file in a separate list, and add that list to the valeur list which you already have.
Use the csv module as @martineau rightly suggested.
You can try it with the code below.
import csv

valeur = []
# your code
n = 1
for i in all_filenames:
    temp_list = []
    fp = open(i, "rb")
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    fields = resolve1(doc.catalog["AcroForm"])["Fields"]
    for i in fields:
        field = resolve1(i)
        name, value = field.get("T"), field.get("V")
        filehehe = "{0}:{1}".format(name, value)
        values = resolve1(value)
        names = resolve1(name)
        temp_list.append(values)
    n = n + 1
    valeur.append(temp_list)

# Finally, when you have the required data, you can write it to a csv file like this.
with open('mycsv.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    for val in valeur:
        wr.writerow(val)
With this, the output would be like this
I am new to Python and XML.
Attaching my code:
from xml.etree import ElementTree as ET
import glob

def extract_points(filename):
    tree = ET.parse(filename)
    root = tree.getroot()
    pointss = list()
    for pnts in root.findall('.//shapes/points'):
        pt1 = float(pnts.find('row').text)
        pt2 = float(pnts.find('row1').text)
        coors = [pt1, pt2]
        pointss.append(coors)
    height = int(root.find('.//imageHeight').text)
    width = int(root.find('.//imageWidth').text)
    return pointss, height, width

txtfiles = []
for file in glob.glob("C:/Users/MSI/Desktop/xmlfiles/*.xml"):
    txtfiles.append(file)
for f in txtfiles:
    pointss, height, width = extract_points(f)
part of my XML file is:
When I run the code, I only get the first row value instead of both rows under the point tag.
I want both values to be part of [[1019.2222222222223, 304.974358974359]], but I only get [1019.2222222222223]. Need help, please.
Based on your XML, here is working code:
from xml.etree import ElementTree as ET

def extract_points(filename):
    tree = ET.parse(filename)
    root = tree.getroot()
    points = list()
    for points_ele in root.findall('.//points'):
        points.append([])
        for row in points_ele:
            points[-1].append(float(row.text))
    height = int(root.find('.//imageHeight').text)
    width = int(root.find('.//imageWidth').text)
    return points, height, width

print(extract_points('data.xml'))
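A sketch of how the function could slot back into the original glob loop, assuming the same directory path as in the question:

import glob

# Call extract_points() on every XML file in the folder and show the result
for f in glob.glob("C:/Users/MSI/Desktop/xmlfiles/*.xml"):
    points, height, width = extract_points(f)
    print(f, points, height, width)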
The code below goes to a directory that has XML files, picks them up, and parses them into a dataframe.
from xml.etree import ElementTree as ET
from collections import defaultdict
from pathlib import Path
import csv

directory = 'C:/Users/xml_files'

with open('try.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter=';')
    # writer = csv.writer(f)
    headers = ['identify', 'id', 'service_code', 'rational', 'qualify', 'description_num',
               'description_txt', 'Counter', 'set_data_xin', 'set_data_xax', 'set_data_value', 'set_data_x']
    writer.writerow(headers)

    xml_files_list = list(map(str, Path(directory).glob('**/*.xml')))
    for xml_file in xml_files_list:
        tree = ET.parse(xml_file)
        root = tree.getroot()

        p_get = tree.find('.//Phones/Get').text
        p_set = tree.find('.//Phones/Set').text

        start_nodes = root.findall('.//START')
        for sn in start_nodes:
            row = defaultdict(str)
            # <<<<< Indentation was wrong here
            for k, v in sn.attrib.items():
                row[k] = v
            for rn in sn.findall('.//Rational'):
                row['Rational'] = rn.text
            for qu in sn.findall('.//Qualify'):
                row['Qualify'] = qu.text
            for ds in sn.findall('.//Description'):
                row['Description_txt'] = ds.text
                row['Description_text_id'] = ds.attrib['text_id']
            for counter, st in enumerate(sn.findall('.//SetData')):
                for k, v in st.attrib.items():
                    if v.startswith("-"):
                        v = v.replace("-", "", 1)
                    v = v.replace(',', '.')
                    row['SetData_' + str(k)] = v
                row["Counter"] = counter
                row_data = [row[i] for i in headers]
                row_data[0] = p_get + '_' + p_set
                writer.writerow(row_data)
                row = defaultdict(str)
With more data, it is really hard to just sit there and not know how far along the parsing has gotten.
So I went and tried to find a way to show a progress bar, and I ended up finding the following:
import tqdm
import time

for i in tqdm.tqdm(range(1000)):
    time.sleep(0.01)
    # or other long operations
I am having trouble fitting this into my code and finding the range, which preferably would be the number of XML files in that directory.
This library, tqdm, seemed like the easiest one to implement.
You could use
for xml_file in tqdm.tqdm(xml_files_list):
It should automatically use len(xml_files_list) and it will return each xml_file.
And you don't need sleep(). It was used in the documentation only to slow down the loop for the example.
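A minimal sketch of how that drops into the existing script, assuming the same xml_files_list built from the directory glob:

import tqdm

# tqdm infers the total from len(xml_files_list) and yields each xml_file
# unchanged, so the loop body can stay exactly as it was.
for xml_file in tqdm.tqdm(xml_files_list, desc="Parsing XML files"):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    # ... rest of the original parsing and CSV-writing code ...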
If the XML file is empty, i.e. the root element has no child records, how can I handle that in Python? When there are XML records, the Python code below works fine, but whenever the XML file is empty the conversion process fails. I am looking for a clean exit instead of failing the process.
XML:
<?xml version = '1.0' encoding = 'UTF-8'?>
<ns2:exportEmpData xmlns:ns2="http://webservice.example.com/"/>
Python Code:
import xml
import csv
import xml.etree.ElementTree as ET

tree = ET.parse('C:/emp/emplist.xml')
root = tree.getroot()

# open a file for writing
Emp_data = open('C:/emp/emplist.csv', 'wb')

# create the csv writer object
csvwriter = csv.writer(Emp_data)
emp_head = []
count = 0

for member in root.findall('emplist'):
    emp_nodes = []
    if count == 0:
        empId = member.find('empId').tag
        emp_head.append(empId)
        fullName = member.find('fullName').tag
        emp_head.append(fullName)
        currentAddress = member.find('currentAddress').tag
        emp_head.append(currentAddress)
        csvwriter.writerow(emp_head)
        count = count + 1
    empId = member.find('empId').text
    emp_nodes.append(empId)
    fullName = member.find('fullName').text
    emp_nodes.append(fullName)
    currentAddress = member.find('currentAddress').attrib.get('city')
    emp_nodes.append(currentAddress)
    csvwriter.writerow(emp_nodes)

Emp_data.close()
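One way to exit cleanly instead of failing is to check for records right after parsing. A minimal sketch, assuming the records are the <emplist> children the code above looks for:

import sys
import xml.etree.ElementTree as ET

tree = ET.parse('C:/emp/emplist.xml')
root = tree.getroot()

# If the root element has no child records, there is nothing to convert.
if not root.findall('emplist'):
    print('No employee records found; skipping CSV conversion.')
    sys.exit(0)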
I have a real (and maybe pretty stupid) problem converting an XML file into a pandas dataframe. I'm new to Python and need some help. I tried code from another thread and modified it, but it doesn't work.
I want to iterate through this file:
<objects>
    <object id="123" name="some_string">
        <object>
            <id>123</id>
            <site id="456" name="somename" query="some_query_as_string"/>
            <create-date>some_date</create-date>
            <update-date>some_date</update-date>
            <update-user id="567" name="User:xyz" query="some_query_as_string"/>
            <delete-date/>
            <delete-user/>
            <deleted>false</deleted>
            <system-object>false</system-object>
            <to-string>some_string_notifications</to-string>
        </object>
        <workflow>
            <workflow-type id="12345" name="WorkflowType_some_workflow" query="some_query_as_string"/>
            <validated>true</validated>
            <name>somestring</name>
            <exported>false</exported>
        </workflow>
Here is my code:
import xml.etree.ElementTree as ET
import pandas as pd

path = "C:/Users/User/Desktop/test.xml"
with open(path, 'rb') as fp:
    content = fp.read()

parser = ET.XMLParser(encoding="utf-8")
tree = ET.fromstring(content, parser=parser)

def xml2df(tree):
    root = ET.XML(tree)
    all_records = []
    for i, child in enumerate(root):
        record = {}
        for subchild in child:
            record[subchild.tag] = subchild.text
        all_records.append(record)
    return pd.DataFrame(all_records)
Where is the problem? Please help :O
You are passing the file location string to ET.fromstring(), which is not the actual contents of the file. You need to read the contents of the file first, then pass that to ET.fromstring().
path = "C:/Users/User/Desktop/test.xml"
with open(path, 'rb') as fp:
    content = fp.read()

parser = ET.XMLParser(encoding="utf-8")
tree = ET.fromstring(content, parser=parser)
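From there, a sketch of how the parsed element could feed the DataFrame helper. Note this assumes the ET.XML() call inside xml2df is dropped, since tree is already an Element at this point:

import pandas as pd

def xml2df(root):
    # root is already a parsed Element, so no further parsing is needed
    all_records = []
    for child in root:
        record = {}
        for subchild in child:
            record[subchild.tag] = subchild.text
        all_records.append(record)
    return pd.DataFrame(all_records)

df = xml2df(tree)   # 'tree' here is the Element returned by ET.fromstring()
print(df)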