I have an excel file where sheet1 looks like this:
Company names are in bold [APPLE, EPSON, ROLAND]
project names are below the Company names.
Here is the CSV of the pictured sheet:
Report Date,10/10/18,,,
Page 1 of 1,,,,
,,,,
Project Name,,Job Number,Start Date,Due Date
,,,,
APPLE,,,,
macbook,,12345,1/1/19,2/1/19
iphone,,23456,1/2/19,2/2/19
,,,,
EPSON,,,,
ET-2000 printer,,34567,1/3/19,2/4/19
,,,,
ROLAND,,,,
RD-700,,45678,1/4/19,2/4/19
The worksheet is in memory using openpyxl. My desired output is a Python dictionary with the company as the key. Below is what I have tried, but the output dict has all projects under each company rather than just the projects for that company.
# Question code, reproduced as posted (indentation was lost in this copy).
# Stated goal: a dict keyed by company name whose values are the Record
# tuples of that company's projects.
from openpyxl import load_workbook
from collections import namedtuple
Record = namedtuple('Record', 'project_name job_number start_date due_date ')
from pprint import pprint
wb = load_workbook('SOquestion.xlsx')
ws = wb.active
# Collects the column-A value of every row whose column C is empty, then
# removes the two report-header labels from that list and returns it.
def make_co_list(ws):
co_list = []
for _ in ws.rows:
if _[0].value and _[2].value == None:
co_list.append(_[0].value)
co_list.remove('Report Date')
co_list.remove('Page 1 of 1')
return co_list
co_list = make_co_list(ws)
# One empty list per collected company name.
co_dict = {c:[] for c in co_list}
# NOTE(review): `v` is unused. `co` is presumably still set when the second
# pass over ws.rows runs, so that pass appears to append every project row to
# the current key -- the symptom the asker describes. Confirm against the
# original indentation.
for k,v in co_dict.items():
for row in ws.rows:
if row[0].value == k:
co = k
for row in ws.rows:
# NOTE(review): `co_list` sits inside the list literal as one nested element,
# so a company-name string never compares equal to it in this `not in` test.
if co and row[2].value and row[0].value not in ["Report Date", "Page 1 of 1", "Project Name", co_list] :
record = Record(row[0].value,
row[2].value,
row[3].value,
row[4].value
)
print("record", record)
co_dict[co].append(record)
That double loop over ws.rows doesn't look good. I would go with a state-based approach: (I have not tested this, but the principle should work.)
import collections

# Column-A labels that have an empty column C but belong to the report
# header rather than naming a company.
REPORT_LABELS = {"Report Date", "Page 1 of 1"}

# Single pass over the sheet: remember the most recent company heading and
# file every following project row under it.
current_company = None
co_dict = collections.defaultdict(list)
for row in ws.rows:
    name = row[0].value
    if name and row[2].value is None:
        # Text in column A without a job number is a company heading,
        # unless it is one of the report-header labels.
        if name not in REPORT_LABELS:
            current_company = name
        continue
    if current_company is None or name is None:
        # Either nothing has been opened yet (e.g. the "Project Name"
        # column-header row) or this is a blank separator row.
        continue
    record = Record(row[0].value,
                    row[2].value,
                    row[3].value,
                    row[4].value)
    print("record", record)
    co_dict[current_company].append(record)
Related
I'm trying to read an excel Table object into python and can't find any syntax for doing so.
It would be useful to read a whole table into e.g. a dict of dicts (I'm trying not to pull in pandas as a dependency for this particular project).
I can't find any way of doing this.
The code below reads through the table row by row. You can also specify the range:
import openpyxl

wb = openpyxl.load_workbook('example.xlsx')
sheet = wb.active
# Slicing the worksheet yields the rows of that range; iterate the slice
# itself so the two-name unpacking always sees exactly columns A and B.
for i1, i2 in sheet['A1':'B7']:
    print("{0:8} {1:8}".format(i1.value, i2.value))
Output:
Student_name Marks
Tony Stark 47
Loki 59
Oddin 73
Nick Fury 62
Samaul 75
Peter Parkar 80
I got it working as follows:
import openpyxl


def all_tables_data(filename: str) -> dict:
    """
    Get values for all tables in a spreadsheet.

    Returns a dict of tables, keyed by table name.
    Table values are given as lists of lists: one inner list of cell
    values per row of the table's ``ref`` range.
    """
    workbook = openpyxl.load_workbook(filename)
    tables_data = {}
    for worksheet in workbook.worksheets:
        # Prefer the public, dict-like ``tables`` attribute; iterating the
        # private ``_tables`` yields table *names* on current openpyxl.
        # Fall back to ``_tables`` only when ``tables`` is not available.
        tables = getattr(worksheet, "tables", None)
        table_objects = (
            tables.values() if tables is not None else worksheet._tables
        )
        for table in table_objects:
            cell_range = worksheet[table.ref]
            tables_data[table.name] = [
                [cell.value for cell in row] for row in cell_range
            ]
    return tables_data


if __name__ == "__main__":
    FILENAME = "my_spreadsheet.xlsx"
    TABLES = all_tables_data(FILENAME)
    print(TABLES)
I'm interested in converting tables to dicts.
Often the rows/entries in a table may have >1 key.
Assuming the table has a header row, I also cobbled the code below together.
It can be used to convert a table to a dict of rows, with each row being a dict of values, keyed by the column headers.
class WorkbookWithTables:
    """Workbook wrapper that indexes every worksheet table by its name.

    ``tables_by_name`` maps a table name to its table object and
    ``table_worksheets`` maps the same name to the worksheet holding it.
    """

    def __init__(self, workbook):
        self.workbook = workbook
        self.tables_by_name = {}
        self.table_worksheets = {}
        for worksheet in self.workbook.worksheets:
            for table in self._tables_of(worksheet):
                self.tables_by_name[table.name] = table
                self.table_worksheets[table.name] = worksheet

    @staticmethod
    def _tables_of(worksheet):
        """Return the table objects of *worksheet* as a list."""
        # The public ``tables`` attribute is dict-like (name -> table);
        # fall back to the private ``_tables`` only when it is missing.
        tables = getattr(worksheet, "tables", None)
        if tables is None:
            return list(worksheet._tables)
        return list(tables.values())

    @classmethod
    def from_file(cls, filename):
        """Alternate constructor: load *filename* with openpyxl and wrap it."""
        # Imported here so this snippet works without a separate import line.
        import openpyxl

        _workbook = openpyxl.load_workbook(filename)
        return cls(_workbook)

    def table_to_dict(self, table_name, n_keys=1):
        """Convert a table with a header row into a dict of row dicts.

        The first ``n_keys`` columns form the key of each row: the bare
        cell value when ``n_keys == 1``, otherwise a tuple of the values.
        The remaining columns become a dict keyed by their header cells.
        """
        worksheet = self.table_worksheets[table_name]
        cell_range = worksheet[self.tables_by_name[table_name].ref]
        value_headers = tuple(cell.value for cell in cell_range[0][n_keys:])
        table_dict = {}
        for row in cell_range[1:]:
            if n_keys == 1:
                key = row[0].value
            else:
                key = tuple(part.value for part in row[:n_keys])
            table_dict[key] = {
                header: cell.value
                for header, cell in zip(value_headers, row[n_keys:])
            }
        return table_dict


if __name__ == "__main__":
    FILENAME = "my_spreadsheet.xlsx"
    WB = WorkbookWithTables.from_file(FILENAME)
    MY_SINGLE_KEYED_TABLE = WB.table_to_dict("my_single_keyed_table")
    MY_DOUBLE_KEYED_TABLE = WB.table_to_dict("my_double_keyed_table", 2)
I am trying to write this list of dicts into an xlsx file using openpyxl:
# Question code, reproduced as posted; it is not runnable as-is (the
# `{dict2 ...}` / `{dictn...}` entries are placeholders).
products= [{'id':46329',
'discription':'AD BLeu',
'marque':'AZERT',
'category':'liquid',
'family': 'ADBLEU',
'photos':'D:\\hamzawi\\hamza\\image2py\\46329_1.png'},
{dict2 ...},
{dictn...}
]
# create a workbook
filena = "produitimage.xlsx"
workbook = Workbook()
sheet = workbook.active
# add the header row
sheet.append(["Product ID", "Product Name", "Marque",
"Category", "Family", "Photos"])
# NOTE(review): `item` is a (key, value) pair, so `row` restarts at 3 for each
# pair and every write targets column 1 -- cells are overwritten rather than
# laid out as one row per product.
for product in products:
for item in product.items():
for row, entry in enumerate(item, start=3):
sheet.cell(row=row, column=1, value=entry)
# add some images
images = [item['photos'] for item in products]
# NOTE(review): add_image is called without an anchor cell here.
for image in images:
logo = Image(image)
#logo.height = 150
#logo.width = 150
sheet.add_image(logo)
workbook.save(filename=filena)
I got an xlsx file with only the headers and no data.
Question: append list of dict
import openpyxl

products = [
    {
        'id': 46329,
        'discription': 'AD BLeu',
        'marque': 'AZERT',
        'category': 'liquid',
        'family': 'ADBLEU',
        'photos': 'D:\\hamzawi\\hamza\\image2py\\46329_1.png',
    },
]

# The sheet columns follow this explicit key order rather than whatever
# order the dicts happen to have.
fieldnames = ['id', 'discription', 'marque', 'category', 'family', 'photos']

# Fresh workbook with its default worksheet.
wb = openpyxl.Workbook()
ws = wb.active

# Header row first ...
ws.append(["Product ID", "Product Name", "Marque", "Category", "Family", "Photos"])

# ... then one row per product, values picked in `fieldnames` order.
for product in products:
    ws.append([product[k] for k in fieldnames])

# Dump the worksheet: every value followed by a tab, one line per row.
for row_values in ws.iter_rows(values_only=True):
    print(''.join(str(value) + '\t' for value in row_values))
Output:
Product ID Product Name Marque Category Family Photos
46329 AD BLeu AZERT liquid ADBLEU D:\hamzawi\hamza\image2py\46329_1.png
If you want the image, instead of the image file path, change the following:
from openpyxl.drawing.image import Image

# remove 'photos' from fieldnames
fieldnames = ['id', 'discription', 'marque', 'category', 'family']

# Row 1 holds the headers, so the data rows are numbered from 2.
for row, product in enumerate(products, 2):
    values = (product[k] for k in fieldnames)
    # Append to `ws`, the worksheet this answer created above.
    ws.append(values)
    # After appending the values, anchor the image in column 'F' of that row.
    ws.add_image(Image(product['photos']), 'F{}'.format(row))
There are some problems in your code.
First, you are incrementing the next_row value inside the loop where you set it, so the increment has no effect and next_row equals 3 on every iteration.
Second, you are trying to write a list of dict values to a single Excel cell, but I think you want it written as a row. So you just need to append it, as you did with the header above the loops:
# One worksheet row per product, cells in the dict's own value order.
for product in products:
    row_cells = [*product.values()]
    sheet.append(row_cells)
If you need to insert an image in last cell in a row you may rewrite loop that way:
from openpyxl.drawing.image import Image
from openpyxl.utils import get_column_letter

# Row 1 is the header row appended earlier, so the first product lands on
# row 2; number the products accordingly so each image anchors on its own row.
for row_index, product in enumerate(products, start=2):
    values = list(product.values())
    # Every field except the trailing photo path goes into the cells.
    sheet.append(values[:-1])
    # The image takes the place of the last field's column in this row.
    col_row = get_column_letter(len(values)) + str(row_index)
    photo_path = values[-1]
    sheet.add_image(Image(photo_path), col_row)
# Question code, reproduced as posted (indentation was lost in this copy).
# Stated goal: pull four values from each PDF page and write them into
# columns of an xlsx sheet.
import PyPDF2
import re
import xlsxwriter
docsFile = open('image0001.pdf','rb')
pdfReader = PyPDF2.PdfFileReader(docsFile)
# Module-level accumulators, one per output column.
loanNumberlist = []
loan2Matchlist = []
poolNumlist = []
borrowerNamelist = []
wb = xlsxwriter.Workbook('docInfo.xlsx')
ws = wb.add_worksheet('sheet2')
row = 0
columnHeaders = ['Borrower Name', 'Loan Number', 'LD Loan Number', 'Pool #']
# Header cells go into row 0.
for col, colname in enumerate(columnHeaders, start=0):
ws.write(row, col, colname)
class pdfExtract:
def __init__(self, pg):
self.pg = pg
def extractShit(self):
# NOTE(review): PdfFileReader's page accessor is presumably spelled
# `getPage` -- confirm that `getpage` resolves.
pageObj = pdfReader.getpage(self.pg)
pgData = pageObj.extractText()
loanNumber = re.split('\\bLoan #:\\b', pgData)[-1]
loanNumberlist.append(loanNumber)
# NOTE(review): this pattern starts with `?`; `re` normally rejects that as
# "nothing to repeat" -- verify the intended expression.
loan2Match = re.match(r"?:/\d{0,10}", pgData)[-1]
loan2Matchlist.append(loan2Match)
poolNumber = re.split('\\bPool #:\\b',pgData)[-1]
poolNumlist.append(poolNumber)
borrowerName =re.split('\\bBorrower #:\\b',pgData)[-1]
borrowerNamelist.append(borrowerName)
# NOTE(review): this loop only constructs pdfExtract; extractShit() is not
# called anywhere in the visible code, so the four lists stay empty and the
# write loops below have nothing to iterate.
for page in range(0, 223):
pdfExtract(page)
for row, rowvar in enumerate(borrowerNamelist, start=1):#write Borrower name
col = 0
ws.write_string(row, col, rowvar)
# NOTE(review): the values collected above come from re.split / re.match,
# i.e. text; the write_number calls below would be handed strings.
for row, lnNM in enumerate(loanNumberlist, start=1):#write loan number 1
col = 1
ws.write_number(row, col, lnNM)
for row, lnNM2 in enumerate(loan2Matchlist, start=1):#write loan number 2
col = 2
ws.write_number(row, col, lnNM2)
for row, plNm in enumerate(poolNumlist, start=1):#write pool number
col = 3
ws.write_number(row, col, plNm)
wb.close()
So, I wrote this program to grab data from a pdf file and return 4 things in each page and put them into an excel file. That looks like:
Loan #: 0065192080/3000009289
Pool#: AK1576
Borrower: David h Theman
I have to grab each page get the first loan number, then the second loan number(after the “/“). Then the rest.
It runs through, but all I get to see on the excel file is the headers no data and no errors.
I thought it was how I'm returning the data or how I'm writing it, but it has no changes. Would it have to do with my For loops? The regex code I got from different answers on here. I've changed how I write it to the excel file, but no luck.
I extracted information from my company's ERP database, and I have to add this data to a sheet that already exists. Unfortunately, my problem is how to use these commands; the openpyxl documentation is not helping me :(
This is an example of my sheet.
import sys
import datetime
from openpyxl import load_workbook

data = datetime.datetime.now()
wb = load_workbook('/home/multipla/Documentos/test.xlsx')
ws = wb.active
# Walk the B3:F12 range row by row and show every cell object.
for i in ws['B3':'F12']:
    for j in i:
        # Function-call form of print runs on both Python 2 and Python 3.
        print(j)
#Edit:
Write these cell "Hands On":
# Fill row 3, columns B..F, with one literal per cell.
for cell_ref, text in (
    ("B3", '2324'),
    ("C3", 'Patty'),
    ("D3", ''),
    ("E3", "YES"),
    ("F3", "Reading"),
):
    ws[cell_ref].value = text
I tried to automate that process by making a for loop to go through each cell and write to them...
#Edit2
If you have a list of dictionaries with your properties, that is:
# Two sample objects; the bare `...` lines stand for further keys that the
# original answer leaves out.
object_1 = {'ID': 1337, 'NAME': 'Pencil'}
...
object_2 = {'ID': 1338, 'NAME': 'Eraser'}
...
# The objects in the order they are to be written.
object_list = [object_1, object_2]
Then you could do something like this:
def add_object(ob, row):
    """Write one object's fields into columns 2..6 of worksheet row *row*."""
    # Keys listed in column order: ID -> 2, NAME -> 3, Y -> 4, X -> 5, ISSUE -> 6.
    for column, key in enumerate(('ID', 'NAME', 'Y', 'X', 'ISSUE'), start=2):
        ws.cell(column=column, row=row).value = ob[key]


def add_object_list(ob_list):
    """Write the objects on consecutive rows, the first one on row 3."""
    for row, ob in enumerate(ob_list, start=3):
        add_object(ob, row)


add_object_list(object_list)
Result:
I have a huge file, which has some missing rows. The data needs to be rooted at Country.
The input data is like:
csv_str = """Type,Country,State,County,City,
1,USA,,,
2,USA,OH,,
3,USA,OH,Franklin,
4,USA,OH,Franklin,Columbus
4,USA,OH,Franklin,Springfield
4,USA,WI,Dane,Madison
"""
which needed to be:
csv_str = """Type,Country,State,County,City,
1,USA,,,
2,USA,OH,,
3,USA,OH,Franklin,
4,USA,OH,Franklin,Columbus
4,USA,OH,Franklin,Springfield
4,USA,WI,,
4,USA,WI,Dane,
4,USA,WI,Dane,Madison
"""
The key as per my logic is Type field, where if I cannot find a County (type 3) for a City (type 4), then insert a row with fields upto County.
Same with County. If I cannot find a State (type 2) for a County (type 3), then insert a row with fields upto State.
Given my limited understanding of the facilities in Python, I was trying more of a brute-force approach. It is a bit problematic, as I need a lot of iterations over the same file.
I also tried google-refine, but couldn't get it to work. Doing it manually is quite error-prone.
Any help appreciated.
# Question code, reproduced as posted (indentation was lost in this copy).
import csv
import io
csv_str = """Type,Country,State,County,City,
1,USA,,,
2,USA,OH,,
3,USA,OH,Franklin,
4,USA,OH,Franklin,Columbus
4,USA,OH,Franklin,Springfield
4,USA,WI,Dane,Madison
"""
# NOTE(review): found_county is never populated in the visible code, so the
# loop inside check_missing_county has nothing to iterate and every row ends
# up in missing_county.
found_county =[]
missing_county =[]
# Appends `row` to missing_county (and prints it) unless an element of
# found_county has the same `.Type`.
# NOTE(review): csv.reader yields plain lists, which presumably have no
# `.Type` attribute -- confirm how the field was meant to be accessed.
def check_missing_county(row):
found = False
for elm in found_county:
if elm.Type == row.Type:
found = True
if not found:
missing_county.append(row)
print(row)
reader = csv.reader(io.StringIO(csv_str))
for row in reader:
check_missing_county(row)
I've knocked up the following based on my understanding of the question:
import csv
import io

csv_str = u"""Type,Country,State,County,City,
1,USA,,,
2,USA,OH,,
3,USA,OH,Franklin,
4,USA,OH,Franklin,Columbus
4,USA,OH,Franklin,Springfield
4,USA,WI,Dane,Madison
"""

# Locations that already have a row of their own. Keys carry the parent
# levels so that equally named counties/states under different parents are
# tracked separately; sets keep each membership test O(1).
counties = set()  # (country, state, county)
states = set()    # (country, state)


def handle_missing_data(row):
    """Return *row* preceded by any state/county rows that are still missing.

    Rows whose first field is not an integer (the header line) and empty
    rows produce an empty list. Inserted rows reuse the type and country of
    the row that revealed the gap and have five fields.
    """
    if not row:
        return []
    try:
        int(row[0])  # validation only: skips the header line
    except ValueError:
        return []
    # Pad short rows so the four leading fields can always be unpacked.
    rtype, country, state, county = (list(row) + [''] * 4)[:4]
    rows = []
    # If a state is present and it hasn't a row of its own, add one.
    state_key = (country, state)
    if state and state_key not in states:
        rows.append([rtype, country, state, '', ''])
        states.add(state_key)
    # If a county is present and it hasn't a row of its own, add one.
    county_key = (country, state, county)
    if county and county_key not in counties:
        rows.append([rtype, country, state, county, ''])
        counties.add(county_key)
    # The row itself may be exactly the state/county row just generated.
    if row not in rows:
        rows.append(row)
    return rows


csvf = io.StringIO(csv_str)
reader = csv.reader(csvf)
for row in reader:
    new_rows = handle_missing_data(row)
    for new_row in new_rows:
        print(new_row)