How to build specific format with open()?

How to build specific format with open()? - python

Here's my code:
import glob
import itertools
import sys, os
import six
import csv
import numpy as np
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
os.chdir("PATH/pdf")
extension = 'pdf'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
valeur = []
n = 1
for i in all_filenames:
fp = open(i, "rb")
parser = PDFParser(fp)
doc = PDFDocument(parser)
fields = resolve1(doc.catalog["AcroForm"])["Fields"]
for i in fields:
field = resolve1(i)
name, value = field.get("T"), field.get("V")
filehehe = "{0}:{1}".format(name,value)
values = resolve1(value)
names = resolve1(name)
valeur.append(values)
n = n+1
with open('test.csv','wb') as f:
for i in valeur:
f.write(i)
The goal here is to pick up some informations in PDF. Here's the output :
As you can see, the format is not pretty. I'm not very familiar with open() so I'm kind of stuck.
I would like to have distinct rows for each PDF with each informations having her own cell. Something like that :

Try to store the data from each pdf file in a separate list. And add this list to the valeur list which you have.
Use csv module as #martineau rightly suggested.
You can try the with below code.
import csv
valeur = []
#your code
n = 1
for i in all_filenames:
temp_list = []
fp = open(i, "rb")
parser = PDFParser(fp)
doc = PDFDocument(parser)
fields = resolve1(doc.catalog["AcroForm"])["Fields"]
for i in fields:
field = resolve1(i)
name, value = field.get("T"), field.get("V")
filehehe = "{0}:{1}".format(name,value)
values = resolve1(value)
names = resolve1(name)
temp_list.append(values)
n = n+1
valeur.append(temp_list)
#Finally when you have the required data, you can write to csv file like this.
with open('mycsv.csv', 'w', newline='') as myfile:
wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
for val in valeur:
wr.writerow(val)
With this, the output would be like this

Related

Defining a list within the code versus reading it from a file

I am trying to count the number of specific words in a given report. Does anyone know why defining a list within the code makes the second part of the following code run faster than reading the list from a file? Is there a solution? The list contains the same words is a lot longer than two words in the following example.
# Example code: Within code list
import csv
import glob
import re
import time
TARGET_FILES = r'C:/Users/s170760/Desktop/Reports_Cleaned/*.*'
OUTPUT_FILE = r'C:/Users/s170760/Desktop/Parser.csv'
OUTPUT_FIELDS = ['file name', 'create']
create = {'agile', 'skills'}
def main():
f_out = open(OUTPUT_FILE, 'w')
wr = csv.writer(f_out, lineterminator='\n')
wr.writerow(OUTPUT_FIELDS)
file_list = glob.glob(TARGET_FILES)
for file in file_list:
print(file)
with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
doc = f_in.read()
doc = doc.lower()
output_data = get_data(doc)
output_data[0] = file
wr.writerow(output_data)
def get_data(doc):
_odata = [0] * 2
tokens = re.findall('\w(?:[-\w]*\w)?', doc)
for token in tokens:
if token in create:
_odata[1] += 1
return _odata
Here is the other way:
# Example code: Reading list from a file
import csv
import glob
import re
import time
TARGET_FILES = r'C:/Users/s170760/Desktop/Reports_Cleaned/*.*'
OUTPUT_FILE = r'C:/Users/s170760/Desktop/Parser.csv'
OUTPUT_FIELDS = ['file name', 'create']
create = open('C:/Users/s170760/Desktop/Create.txt', 'r').read().splitlines()
def main():
f_out = open(OUTPUT_FILE, 'w')
wr = csv.writer(f_out, lineterminator='\n')
wr.writerow(OUTPUT_FIELDS)
file_list = glob.glob(TARGET_FILES)
for file in file_list:
print(file)
with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
doc = f_in.read()
doc = doc.lower()
output_data = get_data(doc)
output_data[0] = file
wr.writerow(output_data)
def get_data(doc):
_odata = [0] * 2
tokens = re.findall('\w(?:[-\w]*\w)?', doc)
for token in tokens:
if token in create:
_odata[1] += 1
return _odata

As pointed out by Mark in the comments, the first code snippet uses a set of strings, while the second code snippet loads a file into a list of strings.
Why sets are faster than lists in this use case, is well explained in this Stack Overflow answer. Parsing the output of open to a set can indeed solve your problem.
So replace:
create = open('C:/Users/s170760/Desktop/Create.txt', 'r').read().splitlines()
With:
create = set(open('C:/Users/s170760/Desktop/Create.txt', 'r').read().splitlines())

How can I write just one float item in csv?

In this question:
First I read the scores from the csv file and then
I saved an item in the following code in the lw list.
I want to write the lw list in a csv file.
How can I do this?
I read scores from a csv file called alaki.csv:
mandana,5,7,3,15
hamid,3,9,4,20,9,1,8,16,0,5,2,4,7,2,1
sina,19,10,19,6,8,14,3
sara,0,5,20,14
soheila,13,2,5,1,3,10,12,4,13,17,7,7
ali,1,9
sarvin,0,16,16,13,19,2,17,8
import csv
# For the average
from statistics import mean
import operator
from collections import Counter
def calculate_average_of_averages(input_file_name, output_file_name):
#output_file_name=chert.csv
with open(input_file_name) as d:
se = csv.reader(d)
l = {}
for ldf in se:
name = ldf[0]
lsd = mean([float(sd) for sd in ldf[1:]])
l[name] = lsd
with open(output_file_name,'w') as ff:
fd = csv.writer(ff)
a = list(l.values())
lw = []
m = mean(a)
lw.append(m)
calculate_average_of_averages('alaki.csv','chert.csv')
output in csv file:
8.401530612244898
please help me

How about this:
import csv
# For the average
from statistics import mean
import operator
from collections import Counter
def calculate_average_of_averages(input_file_name, output_file_name):
#output_file_name=chert.csv
with open(input_file_name) as d:
se = csv.reader(d)
l = {}
for ldf in se:
name = ldf[0]
lsd = mean([float(sd) for sd in ldf[1:]])
l[name] = lsd
m = mean(list(l.values()))
l["average_of_average"]=m
with open(output_file_name,'w') as ff:
for name,value in l.items():
ff.write("{},{}\n".format(name,value))
calculate_average_of_averages('alaki.csv','chert.csv')
output looks like:
mandana,7.5
hamid,6.066666666666666
sina,11.285714285714286
sara,9.75
soheila,7.833333333333333
ali,5.0
sarvin,11.375
average_of_average,8.401530612244898
to output just average_of_average
replace the write block:
with open(output_file_name,'w') as ff:
ff.write(l['average_of_average'])

You can use the pandas library by adding these 2 lines
import csv
import pandas as pd
# For the average
from statistics import mean
import operator
from collections import Counter
def calculate_average_of_averages(input_file_name, output_file_name):
with open(input_file_name) as d:
se = csv.reader(d)
l = {}
for ldf in se:
name = ldf[0]
lsd = mean([float(sd) for sd in ldf[1:]])
l[name] = lsd
a = list(l.values())
lw = []
m = mean(a)
lw.append(m)
pd.DataFrame(lw,columns=["yourColumn"]).to_csv(output_file_name+".csv")
calculate_average_of_averages('alaki.csv','chert.csv')

I am not sure if CSV writer is necessary to write just one line.
import csv
from statistics import mean
def calculate_mean_of_means(input_file, output_file):
with open(input_file, newline='') as csvfile:
csvreader = csv.reader(csvfile)
ls = {}
for row in csvreader:
str_to_int = [int(i) for i in row[1:]]
ls[row[0]] = str_to_int
total_means = 0
for score in ls.values():
total_means += mean(score)
mean_of_means = [total_means / len(ls)]
with open(output_file, 'w', newline='') as csvfile:
meanwriter = csv.writer(csvfile)
meanwriter.writerow(mean_of_means)
calculate_mean_of_means('alaki.csv', 'chert.csv')

Progress bar while parsing files

The code below goes to a directory that has xml files, it takes them and parses them into a dataframe.
from xml.etree import ElementTree as ET
from collections import defaultdict
from pathlib import Path
import csv
from pathlib import Path
directory = 'C:/Users/xml_files'
with open('try.csv', 'w', newline='') as f:
writer = csv.writer(f, delimiter=';')
#◙ writer = csv.writer(f)
headers = ['identify','id', 'service_code', 'rational', 'qualify', 'description_num', 'description_txt','Counter', 'set_data_xin', 'set_data_xax', 'set_data_value', 'set_data_x']
writer.writerow(headers)
xml_files_list = list(map(str,Path(directory).glob('**/*.xml')))
for xml_file in xml_files_list:
tree = ET.parse(xml_file)
root = tree.getroot()
p_get = tree.find('.//Phones/Get').text
p_set = tree.find('.//Phones/Set').text
start_nodes = root.findall('.//START')
for sn in start_nodes:
row = defaultdict(str)
# <<<<< Indentation was wrong here
for k,v in sn.attrib.items():
row[k] = v
for rn in sn.findall('.//Rational'):
row['Rational'] = rn.text
for qu in sn.findall('.//Qualify'):
row['Qualify'] = qu.text
for ds in sn.findall('.//Description'):
row['Description_txt'] = ds.text
row['Description_text_id'] = ds.attrib['text_id']
for counter, st in enumerate( sn.findall('.//SetData') ):
for k,v in st.attrib.items():
if v.startswith("-"):
v = v.replace("-","",1)
v=v.replace(',', '.')
row['SetData_'+ str(k)] = v
row["Counter"] = counter
row_data = [row[i] for i in headers]
row_data[0]=p_get + '_' + p_set
writer.writerow(row_data)
row = defaultdict(str)
Upon using more data, it is really hard to just wait there and not know how far the parsing into dataframe has been done.
So I went and tried to find a way I can show the progress bar. I ended up finding the following
import tqdm
import time
for i in tqdm.tqdm(range(1000)):
time.sleep(0.01)
# or other long operations
I am having problem implementing the code into my code and finding the range which preferably would be to get the numbers of Xml files in that directory
This library tqdm seemed like the easiest one to implement.

You could use
for xml_file in tqdm.tqdm(xml_files_list):
it should automatically use len(xml_files_list) and it will return xml_file.
And you don't need sleep(). It was used in documentation only to slow down loop for example.

How to write csv file in html?

I have read file of csv but I have a problem that how to read CSV file and save it in table.html?
import csv
html_about = ''
names = []
with open('filo.csv') as data_file:
csv_data = csv.reader(data_file)
for line in csv_data:
names.append(f'{line[0]}')
html_output = '\n<ul>'
for name in names:
html_output += f'\n\t<li>{name}</li>'
html_output += '\n</ul>'
from prettytable import PrettyTable
x = PrettyTable(line[0])
html_code = x.get_html_string()
html_file = open('table.html','w')
html_file = html_file.write(html_code)

I suggest you use pandas library,
it has pd.read_csv, and also pd.to_html
usage should look like this, let me know if this works for you:
import pandas as pd
df = pd.read_csv('filo.csv')
with open('table.html', 'w') as html_file:
df.to_html(html_file)

convert xls to json in python

I am trying to convert xls to json and but when I am executing the code it's not giving me the data inside xls sheet, it's only giving me the json structure.
Below is the code which I am running, I am not able to understand what modification I should further make in this so that I can get a perfect json file.
Please note - input is in the form of binary stream and output is also in the form of a stream and not file.
#!/usr/bin/python -u
import sys
import xlrd
import simplejson
from collections import OrderedDict
wb = xlrd.open_workbook(file_contents=sys.stdin.read())
for sheet_index in range(wb.nsheets):
# print sheet_index
sh = wb.sheet_by_index(sheet_index)
# print "Processing sheet no ", sheet_index
attributes = sh.row_values(0)
#print attributes
rows_list = []
attr_list = []
# print attr_list[0]
for rownum in range(1,sh.nrows):
row_val_list = sh.row_values(rownum)
row_dict = OrderedDict()
for index in range(len(attr_list)):
row_dict[attr_list[index]] = row_val_list[index]
#row_dict['ID'] = row_val_list[0]
#row_dict['Name'] = row_val_list[1]
#rows_list.append(row_dict)
#json_data = simplejson.dumps(rows_list)
#sys.stdout.write(json_data)
rows_list.append(row_dict)
json_data = simplejson.dumps(rows_list)
sys.stdout.write(json_data)
# json_data = simplejson.dumps(rows_list)
#sys.stdout.write(json_data)
~
Any help is much appreciated

here is the correct working python code
#!/usr/bin/python -u
import sys
import xlrd
import simplejson
from collections import OrderedDict
wb = xlrd.open_workbook(file_contents=sys.stdin.read())
#print "Sheets are .... ", wb.nsheets
for sheet_index in range(wb.nsheets):
sh = wb.sheet_by_index(sheet_index)
if sh.nrows == 0:
continue
attr_list = sh.row_values(0)
rows_list = []
for rownum in range(1,sh.nrows):
row_values = sh.row_values(rownum)
row_dict = OrderedDict()
for index in range(len(attr_list)):
row_dict[attr_list[index]] = row_values[index]
rows_list.append(row_dict)
json_data = simplejson.dumps(rows_list)
sys.stdout.write(json_data)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to build specific format with open()? - python

Related

Defining a list within the code versus reading it from a file

How can I write just one float item in csv?

Progress bar while parsing files

How to write csv file in html?

convert xls to json in python

Categories

Resources