The code below goes through a directory of XML files and parses them into a single CSV file.
from xml.etree import ElementTree as ET
from collections import defaultdict
from pathlib import Path
import csv

directory = 'C:/Users/xml_files'

with open('try.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter=';')
    headers = ['identify', 'id', 'service_code', 'rational', 'qualify',
               'description_num', 'description_txt', 'Counter', 'set_data_xin',
               'set_data_xax', 'set_data_value', 'set_data_x']
    writer.writerow(headers)
    xml_files_list = list(map(str, Path(directory).glob('**/*.xml')))
    for xml_file in xml_files_list:
        tree = ET.parse(xml_file)
        root = tree.getroot()
        p_get = tree.find('.//Phones/Get').text
        p_set = tree.find('.//Phones/Set').text
        start_nodes = root.findall('.//START')
        for sn in start_nodes:
            row = defaultdict(str)
            for k, v in sn.attrib.items():
                row[k] = v
            for rn in sn.findall('.//Rational'):
                row['rational'] = rn.text
            for qu in sn.findall('.//Qualify'):
                row['qualify'] = qu.text
            for ds in sn.findall('.//Description'):
                row['description_txt'] = ds.text
                row['description_num'] = ds.attrib['text_id']
            for counter, st in enumerate(sn.findall('.//SetData')):
                for k, v in st.attrib.items():
                    if v.startswith("-"):
                        v = v.replace("-", "", 1)
                    v = v.replace(',', '.')
                    row['set_data_' + str(k)] = v
                row["Counter"] = counter
                row_data = [row[i] for i in headers]
                row_data[0] = p_get + '_' + p_set
                writer.writerow(row_data)
                row = defaultdict(str)
With more data, it is really hard to just sit there and wait without knowing how far the parsing has progressed. So I went looking for a way to show a progress bar and ended up finding the following:
import tqdm
import time

for i in tqdm.tqdm(range(1000)):
    time.sleep(0.01)
    # or other long operations
I am having trouble working this into my code and finding the right range, which would preferably be the number of XML files in that directory. This library, tqdm, seemed like the easiest one to implement.
You could use

for xml_file in tqdm.tqdm(xml_files_list):

It automatically uses len(xml_files_list) as the total and yields each xml_file. And you don't need sleep(); it was used in the documentation example only to slow the loop down.
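In context, a minimal sketch of the parsing loop with tqdm wrapped around it (xml_files_list is the list built earlier; the desc and unit arguments are optional cosmetics):

import tqdm

for xml_file in tqdm.tqdm(xml_files_list, desc='Parsing XML', unit='file'):
    tree = ET.parse(xml_file)
    # ... the rest of the per-file parsing stays unchanged ...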
Here's my code:
import glob
import itertools
import sys, os
import six
import csv
import numpy as np
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1

os.chdir("PATH/pdf")
extension = 'pdf'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

valeur = []
n = 1
for i in all_filenames:
    fp = open(i, "rb")
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    fields = resolve1(doc.catalog["AcroForm"])["Fields"]
    for i in fields:
        field = resolve1(i)
        name, value = field.get("T"), field.get("V")
        filehehe = "{0}:{1}".format(name, value)
        values = resolve1(value)
        names = resolve1(name)
        valeur.append(values)
    n = n + 1

with open('test.csv', 'wb') as f:
    for i in valeur:
        f.write(i)
The goal here is to pick up some information from the PDFs. Here's the output:
As you can see, the format is not pretty. I'm not very familiar with open(), so I'm kind of stuck.
I would like a distinct row for each PDF, with each piece of information in its own cell. Something like this:
Try storing the data from each PDF file in a separate list, and append that list to the valeur list you already have.
Use the csv module, as @martineau rightly suggested.
You can try the code below.
import csv

valeur = []
# your code
n = 1
for i in all_filenames:
    temp_list = []
    fp = open(i, "rb")
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    fields = resolve1(doc.catalog["AcroForm"])["Fields"]
    for i in fields:
        field = resolve1(i)
        name, value = field.get("T"), field.get("V")
        filehehe = "{0}:{1}".format(name, value)
        values = resolve1(value)
        names = resolve1(name)
        temp_list.append(values)
    n = n + 1
    valeur.append(temp_list)

# Finally, when you have the required data, you can write it to a csv file like this.
with open('mycsv.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    for val in valeur:
        wr.writerow(val)
With this, the output would look like this:
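One hedged aside: pdfminer often returns PDF string values as bytes, so if the cells come out looking like b'...', a decoding step along these lines (assuming UTF-8 content) may be needed before appending:

values = resolve1(value)
if isinstance(values, bytes):
    values = values.decode('utf-8', errors='ignore')
temp_list.append(values)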
I am trying to count the number of specific words in a given report. Does anyone know why defining the list within the code makes the second part of the following code run faster than reading the list from a file? Is there a solution? The real list contains the same kind of words but is a lot longer than the two words in the following example.
# Example code: Within code list
import csv
import glob
import re
import time

TARGET_FILES = r'C:/Users/s170760/Desktop/Reports_Cleaned/*.*'
OUTPUT_FILE = r'C:/Users/s170760/Desktop/Parser.csv'
OUTPUT_FIELDS = ['file name', 'create']
create = {'agile', 'skills'}

def main():
    f_out = open(OUTPUT_FILE, 'w')
    wr = csv.writer(f_out, lineterminator='\n')
    wr.writerow(OUTPUT_FIELDS)
    file_list = glob.glob(TARGET_FILES)
    for file in file_list:
        print(file)
        with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
            doc = f_in.read()
        doc = doc.lower()
        output_data = get_data(doc)
        output_data[0] = file
        wr.writerow(output_data)

def get_data(doc):
    _odata = [0] * 2
    tokens = re.findall(r'\w(?:[-\w]*\w)?', doc)
    for token in tokens:
        if token in create:
            _odata[1] += 1
    return _odata
Here is the other way:
# Example code: Reading list from a file
import csv
import glob
import re
import time

TARGET_FILES = r'C:/Users/s170760/Desktop/Reports_Cleaned/*.*'
OUTPUT_FILE = r'C:/Users/s170760/Desktop/Parser.csv'
OUTPUT_FIELDS = ['file name', 'create']
create = open('C:/Users/s170760/Desktop/Create.txt', 'r').read().splitlines()

def main():
    f_out = open(OUTPUT_FILE, 'w')
    wr = csv.writer(f_out, lineterminator='\n')
    wr.writerow(OUTPUT_FIELDS)
    file_list = glob.glob(TARGET_FILES)
    for file in file_list:
        print(file)
        with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
            doc = f_in.read()
        doc = doc.lower()
        output_data = get_data(doc)
        output_data[0] = file
        wr.writerow(output_data)

def get_data(doc):
    _odata = [0] * 2
    tokens = re.findall(r'\w(?:[-\w]*\w)?', doc)
    for token in tokens:
        if token in create:
            _odata[1] += 1
    return _odata
As pointed out by Mark in the comments, the first code snippet uses a set of strings, while the second loads the file into a list of strings.
Why sets are faster than lists for membership tests is well explained in this Stack Overflow answer. Parsing the output of open into a set can indeed solve your problem.
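As a quick illustration (a hypothetical micro-benchmark; timings will vary by machine), a membership test against a list scans it element by element, while a set does a constant-time hash lookup:

import timeit

words = ['word%d' % i for i in range(10000)]
words_set = set(words)
# A token that is absent forces a full scan of the list...
print(timeit.timeit("'missing' in words", globals=globals(), number=1000))
# ...while the set answers in roughly constant time.
print(timeit.timeit("'missing' in words_set", globals=globals(), number=1000))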
So replace:
create = open('C:/Users/s170760/Desktop/Create.txt', 'r').read().splitlines()
With:
create = set(open('C:/Users/s170760/Desktop/Create.txt', 'r').read().splitlines())
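A slightly tidier variant (same file path as in the question, assumed to exist) that also closes the file promptly:

with open('C:/Users/s170760/Desktop/Create.txt', 'r') as f:
    create = set(f.read().splitlines())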
First I read the scores from a csv file, then in the following code I saved an item into the lw list.
I want to write the lw list to a csv file.
How can I do this?
I read scores from a csv file called alaki.csv:
mandana,5,7,3,15
hamid,3,9,4,20,9,1,8,16,0,5,2,4,7,2,1
sina,19,10,19,6,8,14,3
sara,0,5,20,14
soheila,13,2,5,1,3,10,12,4,13,17,7,7
ali,1,9
sarvin,0,16,16,13,19,2,17,8
import csv
# For the average
from statistics import mean
import operator
from collections import Counter

def calculate_average_of_averages(input_file_name, output_file_name):
    # output_file_name = chert.csv
    with open(input_file_name) as d:
        se = csv.reader(d)
        l = {}
        for ldf in se:
            name = ldf[0]
            lsd = mean([float(sd) for sd in ldf[1:]])
            l[name] = lsd
    with open(output_file_name, 'w') as ff:
        fd = csv.writer(ff)
        a = list(l.values())
        lw = []
        m = mean(a)
        lw.append(m)

calculate_average_of_averages('alaki.csv', 'chert.csv')
The desired output in the csv file:
8.401530612244898
Please help me.
How about this:
import csv
# For the average
from statistics import mean
import operator
from collections import Counter

def calculate_average_of_averages(input_file_name, output_file_name):
    # output_file_name = chert.csv
    with open(input_file_name) as d:
        se = csv.reader(d)
        l = {}
        for ldf in se:
            name = ldf[0]
            lsd = mean([float(sd) for sd in ldf[1:]])
            l[name] = lsd
    m = mean(list(l.values()))
    l["average_of_average"] = m
    with open(output_file_name, 'w') as ff:
        for name, value in l.items():
            ff.write("{},{}\n".format(name, value))

calculate_average_of_averages('alaki.csv', 'chert.csv')
The output looks like:
mandana,7.5
hamid,6.066666666666666
sina,11.285714285714286
sara,9.75
soheila,7.833333333333333
ali,5.0
sarvin,11.375
average_of_average,8.401530612244898
To output just the average_of_average, replace the write block with:

with open(output_file_name, 'w') as ff:
    ff.write(str(l['average_of_average']))

(str() is needed because file.write() only accepts strings.)
You can use the pandas library by adding these two lines:
import csv
import pandas as pd
# For the average
from statistics import mean
import operator
from collections import Counter

def calculate_average_of_averages(input_file_name, output_file_name):
    with open(input_file_name) as d:
        se = csv.reader(d)
        l = {}
        for ldf in se:
            name = ldf[0]
            lsd = mean([float(sd) for sd in ldf[1:]])
            l[name] = lsd
    a = list(l.values())
    lw = []
    m = mean(a)
    lw.append(m)
    pd.DataFrame(lw, columns=["yourColumn"]).to_csv(output_file_name + ".csv")

calculate_average_of_averages('alaki.csv', 'chert.csv')
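Note that to_csv writes the DataFrame index as an extra first column by default; pass index=False to drop it:

pd.DataFrame(lw, columns=["yourColumn"]).to_csv(output_file_name + ".csv", index=False)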
I am not sure if CSV writer is necessary to write just one line.
import csv
from statistics import mean

def calculate_mean_of_means(input_file, output_file):
    with open(input_file, newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        ls = {}
        for row in csvreader:
            str_to_int = [int(i) for i in row[1:]]
            ls[row[0]] = str_to_int
    total_means = 0
    for score in ls.values():
        total_means += mean(score)
    mean_of_means = [total_means / len(ls)]
    with open(output_file, 'w', newline='') as csvfile:
        meanwriter = csv.writer(csvfile)
        meanwriter.writerow(mean_of_means)

calculate_mean_of_means('alaki.csv', 'chert.csv')
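If the csv module really is overkill for a single value, a plain write works too; a minimal sketch reusing the mean_of_means list computed inside the function above:

with open('chert.csv', 'w') as f:
    f.write(str(mean_of_means[0]) + '\n')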
The code below takes a folder of XML files and parses them into a single CSV file.
It does the job really well.
from xml.etree import ElementTree as ET
from collections import defaultdict
import csv
from pathlib import Path

directory = 'C:/Users/docs/FolderwithXMLs'

with open('output.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    headers = ['id', 'service_code', 'rational', 'qualify', 'description_num',
               'description_txt', 'set_data_xin', 'set_data_xax',
               'set_data_value', 'set_data_x']
    writer.writerow(headers)
    xml_files_list = list(map(str, Path(directory).glob('**/*.xml')))
    for xml_file in xml_files_list:
        tree = ET.parse(xml_file)
        root = tree.getroot()
        start_nodes = root.findall('.//START')
        for sn in start_nodes:
            row = defaultdict(str)
            for k, v in sn.attrib.items():
                row[k] = v
            for rn in sn.findall('.//Rational'):
                row['rational'] = rn.text
            for qu in sn.findall('.//Qualify'):
                row['qualify'] = qu.text
            for ds in sn.findall('.//Description'):
                row['description_txt'] = ds.text
                row['description_num'] = ds.attrib['num']
            for st in sn.findall('.//SetData'):
                for k, v in st.attrib.items():
                    row['set_data_' + str(k)] = v
                row_data = [row[i] for i in headers]
                writer.writerow(row_data)
                row = defaultdict(str)
The output looks like this
I have been trying to add a counter column for how many rows of set_data_value there are for a specific ID.
The output should look like this:
If necessary I can provide the XML data as well. I'm sorry that someone has to edit the question to show the pictures instead of just hyperlinks.
I have checked other posts here, but I wasn't able to adapt them to this code.
Without seeing the XML it will be a bit of a guess, but if you add "Counter" to headers and then use enumerate on the last for loop, it may work:
for counter, st in enumerate(sn.findall('.//SetData')):
    for k, v in st.attrib.items():
        row['set_data_' + str(k)] = v
    row["Counter"] = counter
    row_data = [row[i] for i in headers]
    writer.writerow(row_data)
    row = defaultdict(str)
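One small note: enumerate starts counting at 0. If the counter column should start at 1 instead, enumerate accepts a start argument:

for counter, st in enumerate(sn.findall('.//SetData'), start=1):
    ...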
I'm a complete beginner in Python, so I've had to rely on several tutorials to put this code together. It does produce a .csv file, but it turns out empty (0 kB). I've found that others with this question had forgotten to close the file, but that doesn't seem to be the problem here. I'm grateful for any hints.
This is the xml if that helps: https://api.nextbike.net/maps/nextbike-live.xml?city=14
import xml.etree.ElementTree as ET
import csv

tree = ET.parse('nextbike-live.xml')
root = tree.getroot()

with open('Bike_pos.csv', 'w') as Bike_pos:
    csvwriter = csv.writer(Bike_pos)
    bike_head = []
    count = 0
    for member in root.findall('place'):
        bike = []
        if count == 0:
            station = member.find('name').tag
            bike_head.append(station)
            lat = member.find('lat').tag
            bike_head.append(lat)
            lon = member.find('lng').tag
            bike_head.append(lon)
            bikeno = member.find('bike_numbers').tag
            bike_head.append(bikeno)
            count = count + 1
        station = member.find('name').text
        bike.append(station)
        lat = member.find('lat').text
        bike.append(lat)
        lon = member.find('lng').text
        bike.append(lon)
        bikeno = member.find('bike_numbers').text
        csvwriter.writerow(bike)

Bike_pos.close()
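(For what it's worth, the immediate cause of the empty file is likely that root.findall('place') only searches the root's direct children, while nextbike's place elements are nested under country/city; the station data also lives in attributes, which is why the working answer below reads it with place.get() rather than child elements. Note too that bikeno is collected but never appended to bike.)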
I got help from a good friend. My XML source file had several nested children that my code wasn't searching.
He gave me this code, which worked like a charm and is a lot simpler than what I had:
import xml.etree.ElementTree as ET
import csv

tree = ET.parse('nextbike-live-test.xml')
root = tree.getroot()

with open('Bike_pos.csv', 'w') as Bike_pos:
    csvwriter = csv.writer(Bike_pos)
    # CSV header
    csvwriter.writerow(['Station', 'lat', 'lng', 'Bikes'])
    # Add info about each station
    for country in root.findall('country'):
        for city in country.findall('city'):
            for place in city.findall('place'):
                bike = []
                bike.append(place.get('name'))
                bike.append(place.get('lat'))
                bike.append(place.get('lng'))
                bike.append(place.get('bike_numbers'))
                csvwriter.writerow(bike)
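On Windows it is also worth opening the CSV with newline='' so the csv module doesn't emit blank rows between records:

with open('Bike_pos.csv', 'w', newline='') as Bike_pos:
    ...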
To make it simpler, you can try it like this as well:
import requests
import csv
from lxml.html import fromstring

with open("Bike_Details.csv", "w", newline="") as infile:
    writer = csv.writer(infile)
    writer.writerow(["station", "lat", "lng", "bike_num"])
    res = requests.get("https://api.nextbike.net/maps/nextbike-live.xml?city=14")
    root = fromstring(res.content)
    for items in root.cssselect("country city place"):
        station = items.get("name")
        lat = items.get("lat")
        lng = items.get("lng")
        bike_num = items.get("bike_numbers")
        print(station, lat, lng, bike_num)
        writer.writerow([station, lat, lng, bike_num])
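Note that cssselect() on lxml elements requires the separate cssselect package (pip install cssselect); an equivalent without it would be root.xpath('//country//city//place').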