write python list of dicts to xlsx using openpyxl - python

I'm trying to write this list of dicts into an xlsx file using openpyxl:
products = [{'id': 46329,
             'discription': 'AD BLeu',
             'marque': 'AZERT',
             'category': 'liquid',
             'family': 'ADBLEU',
             'photos': 'D:\\hamzawi\\hamza\\image2py\\46329_1.png'},
            {dict2 ...},
            {dictn...}
            ]
# create a workbook
filena = "produitimage.xlsx"
workbook = Workbook()
sheet = workbook.active
# add headers
sheet.append(["Product ID", "Product Name", "Marque",
              "Category", "Family", "Photos"])
for product in products:
    for item in product.items():
        for row, entry in enumerate(item, start=3):
            sheet.cell(row=row, column=1, value=entry)
# add some images
images = [item['photos'] for item in products]
for image in images:
    logo = Image(image)
    # logo.height = 150
    # logo.width = 150
    sheet.add_image(logo)
workbook.save(filename=filena)
I get an xlsx file with only the headers and no data.

Question: append list of dict
import openpyxl
products = [{'id': 46329,
             'discription': 'AD BLeu',
             'marque': 'AZERT',
             'category': 'liquid',
             'family': 'ADBLEU',
             'photos': 'D:\\hamzawi\\hamza\\image2py\\46329_1.png'}
            ]

# Dictionaries are not guaranteed to be in order,
# so define a `list` of `keys` in the desired order
fieldnames = ['id', 'discription', 'marque', 'category', 'family', 'photos']

# create a new workbook
wb = openpyxl.Workbook()
ws = wb.active

# append headers
ws.append(["Product ID", "Product Name", "Marque", "Category", "Family", "Photos"])

# append data
# iterate the `list` of `dict`
for product in products:
    # create a `generator` that yields each product `value`,
    # using the fieldnames in the desired order as `key`
    values = (product[k] for k in fieldnames)
    # append the generator values (openpyxl accepts any iterable)
    ws.append(values)

# show worksheet values
for row_values in ws.iter_rows(values_only=True):
    for value in row_values:
        print(value, end='\t')
    print()
Output:
Product ID Product Name Marque Category Family Photos
46329 AD BLeu AZERT liquid ADBLEU D:\hamzawi\hamza\image2py\46329_1.png
If you want the image itself instead of the image file path, change the following:
# remove 'photos' from fieldnames
fieldnames = ['id', 'discription', 'marque', 'category', 'family']

# you need the row index, so use `enumerate(..., 2)`
# (Image is openpyxl.drawing.image.Image and needs Pillow installed)
for row, product in enumerate(products, 2):
    values = (product[k] for k in fieldnames)
    ws.append(values)
    # after appending the values, add the image
    # here in column 'F'
    ws.add_image(Image(product['photos']), 'F{}'.format(row))

There are some problems in your code.
First, you are incrementing the next_row value inside the same loop where you set it, so the increment has no effect and next_row equals 3 on every iteration.
Second, you are trying to write a list of dict values into a single Excel cell, but I think you want it written as a row. So just append it, as you did with the header above the loops:
for product in products:
    sheet.append(list(product.values()))
If you need to insert an image into the last cell of a row, you can rewrite the loop this way:
# needs: from openpyxl.utils import get_column_letter
#        from openpyxl.drawing.image import Image
# data rows start at 2 because the header occupies row 1
for row_index, product in enumerate(products, start=2):
    values = list(product.values())
    sheet.append(values[:-1])
    col_row = get_column_letter(len(values)) + str(row_index)
    photo_path = values[-1]
    sheet.add_image(Image(photo_path), col_row)
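Putting both answers together, here is a minimal self-contained sketch with the imports spelled out; the field names and the photo path come from the question, everything else is an assumption (image loading requires Pillow):
from openpyxl import Workbook
from openpyxl.drawing.image import Image
from openpyxl.utils import get_column_letter

products = [{'id': 46329,
             'discription': 'AD BLeu',
             'marque': 'AZERT',
             'category': 'liquid',
             'family': 'ADBLEU',
             'photos': 'D:\\hamzawi\\hamza\\image2py\\46329_1.png'}]

fieldnames = ['id', 'discription', 'marque', 'category', 'family']  # 'photos' handled separately

workbook = Workbook()
sheet = workbook.active
sheet.append(["Product ID", "Product Name", "Marque", "Category", "Family", "Photos"])

# data starts on row 2, below the header row
for row, product in enumerate(products, start=2):
    sheet.append([product[k] for k in fieldnames])
    # anchor the image in the column after the last data column ('F' here)
    anchor = get_column_letter(len(fieldnames) + 1) + str(row)
    sheet.add_image(Image(product['photos']), anchor)

workbook.save("produitimage.xlsx")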

Related

How to search equal product numbers in two columns from two different excel tables and copy-paste certain cells from matched row to new table

I have two excel tables:
old_data.xlsx
Product number    Name              Current price    Other columns
1000              Product name 1    10
AB23104           Product name 2    5
430267            Product name 3    20
new_data.xlsx
Product number    Name                      New price    Other columns
AB23104           Renamed product name 2    20
1000              Renamed product name 1    5
345LKT10023       Product name 4            100
Expected result: table below + 2 feedback messages somewhere
Message 1) Product ID 430267 is missing in new data file
Message 2) Product ID 345LKT10023 is newly added
Product ID     Name of product    New price    Old price
AB23104        Product name 2     20           5
1000           Product name 1     5            10
345LKT10023    Product name 4     100          100
I have this code for now, but it is not working and not finished due to lack of knowledge on my part:
import openpyxl
import pandas as pd

new_datacols = [0, 1, 2]
old_datacols = [0, 1, 2]
new_data = pd.read_excel('new_data.xlsx', skiprows=1, usecols=new_datacols, index_col=0)
old_data = pd.read_excel('old_data.xlsx', skiprows=1, usecols=old_datacols, index_col=0)

def format_data():
    # combine_type = inner, left, right, outer
    df = pd.merge(new_data, old_data, on='Product number', how='outer')
    df = df.rename(columns={"Product number": "Product ID",
                            "Name": "Name of product",
                            "Current price": "Old price"})
    nan_value = float("NaN")
    df.replace("", nan_value, inplace=True)
    df.dropna(subset=["Name of product"], inplace=True)
    df = df[['Product ID', 'Name of product',
             'New price', 'Old price']]
    print(df.columns)
    # df.to_excel('updated_table.xlsx')

if __name__ == "__main__":
    format_data()
This is my attempt. It puts the messages in another sheet in the same file:
import os
import pandas as pd

old_data_filename = r"old_data.xlsx"
new_data_filename = r"new_data.xlsx"
new_spreadsheet_filename = r"updated_products.xlsx"

# Load spreadsheets into a dataframe and set their indexes to "Product number"
old_data_df = pd.read_excel(old_data_filename).set_index("Product number")
new_data_df = pd.read_excel(new_data_filename).set_index("Product number")

# Determine which products are new/missing, and store the corresponding
# messages in a list, which will be written to its own spreadsheet at the end
old_data_products = set(old_data_df.index)
new_data_products = set(new_data_df.index)
new_products = new_data_products - old_data_products
missing_products = old_data_products - new_data_products
messages = [f"Product ID {product} is missing in new data file" for product in missing_products]
messages.extend(f"Product ID {product} is newly added" for product in new_products)
messages = [f"Message {i}) {message}" for i, message in enumerate(messages, start=1)]

# Keep the original product names
new_data_df.update(old_data_df["Name"])

# Old price is the same as new price unless the product is in old_data_df, in which
# case it is old_data_df["Current price"]
new_data_df["Old price"] = new_data_df["New price"]
new_data_df["Old price"].update(old_data_df["Current price"])

# Rename the columns
new_data_df.reset_index(inplace=True)
new_data_df.rename(columns={"Product number": "Product ID",
                            "Name": "Name of product"}, inplace=True)

# Remove all other columns except the ones we want
new_data_df = new_data_df[["Product ID",
                           "Name of product",
                           "New price", "Old price"]]

# Write the new products and messages to separate sheets in the same file
with pd.ExcelWriter(new_spreadsheet_filename) as writer:
    new_data_df.to_excel(writer, "Products", index=False)
    pd.DataFrame({"Messages": messages}).to_excel(writer, "Messages", index=False)

# Launch the new spreadsheet
os.startfile(new_spreadsheet_filename)
EDIT: Code that works with the actual spreadsheets:
import os
import pandas as pd

old_data_filename = r"old_data.xlsx"
new_data_filename = r"new_data.xlsx"
new_spreadsheet_filename = r"updated_products.xlsx"

# Load spreadsheets into a dataframe and set their indexes to "Product ID"
old_data_df = pd.read_excel(old_data_filename).set_index("Product ID")
new_data_df = pd.read_excel(new_data_filename).set_index("Product ID")

# Remove duplicated indexes from both dataframes, keeping only the first occurrence
old_data_df = old_data_df[~old_data_df.index.duplicated()]
new_data_df = new_data_df[~new_data_df.index.duplicated()]

# Determine which products are new/missing, and store the corresponding
# messages in a list, which will be written to its own sheet at the end
old_data_products = set(old_data_df.index)
new_data_products = set(new_data_df.index)
new_products = new_data_products - old_data_products
missing_products = old_data_products - new_data_products
messages = [f"Product ID {product} is missing in new data file" for product in missing_products]
messages.extend(f"Product ID {product} is newly added" for product in new_products)
messages = [f"Message {i}) {message}" for i, message in enumerate(messages, start=1)]

# Keep the original product names
new_data_df.update(old_data_df["Name"])

# Old price is the same as new price unless the product is in old_data_df, in which
# case it is old_data_df["Current price"]
new_data_df["Old price"] = new_data_df["New price"]
new_data_df["Old price"].update(old_data_df["Current price"])

# Rename the "Name" column to "Name of product"
new_data_df.rename(columns={"Name": "Name of product"}, inplace=True)

# Remove all other columns except the ones we want
new_data_df.reset_index(inplace=True)
new_data_df = new_data_df[["Product ID",
                           "Name of product",
                           "New price", "Old price"]]

# Write the new products and messages to separate sheets in the same file
with pd.ExcelWriter(new_spreadsheet_filename) as writer:
    new_data_df.to_excel(writer, "Products", index=False)
    pd.DataFrame({"Messages": messages}).to_excel(writer, "Messages", index=False)

# Launch the new spreadsheet
os.startfile(new_spreadsheet_filename)

Add every scraped item to csv row pandas

I have a Selenium project that scrapes a website and loops to get inner class text.
I want to save every scraped text from this loop to a new CSV row, in a file located next to the .py file, and accept new columns if added in the future.
How do I do that?
This is what I tried:
prodTitle = driver.find_elements_by_xpath("//*[contains(@class,'itemTitle')]")
for pTitle in prodTitle:
    itemName = pTitle
    pd = pd.dataframe(pTitle.text)
    pd.to_csv('data.csv', pd)
    print(pTitle.text)
but it only adds the last item.
You can add the data in the same loop and then save the whole dataframe, like this:
prodTitle = driver.find_elements_by_xpath("//*[contains(@class,'itemTitle')]")
df = pd.DataFrame(columns=['Title'])
for idx, pTitle in enumerate(prodTitle):
    itemName = pTitle
    df.loc[idx, 'Title'] = pTitle.text
    print(pTitle.text)
df.to_csv('data.csv')
EDIT: to add more data, it is convenient to set the columns before the loop, like this:
cols = ['Title', 'Col_0', 'Col_1', 'Col_N']
df = pd.DataFrame(columns=cols)
and then inside the loop:
...
df.loc[idx, 'Title'] = title
df.loc[idx, 'Col_0'] = data_0
df.loc[idx, 'Col_1'] = data_1
df.loc[idx, 'Col_N'] = data_N
...
EDIT (because I found another way):
You can create a list with all the data and then pass it to a DataFrame:
prodTitle = driver.find_elements_by_xpath("//*[contains(@class,'itemTitle')]")
data = []
for pTitle in prodTitle:
    itemName = pTitle
    data.append([pTitle.text, pTitle.data_0, pTitle.data_1, ...])
columns = ['Title', 'Col_0', 'Col_1', ...]
df = pd.DataFrame(data=data, columns=columns)
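In either variant, a final to_csv call writes the whole frame to a CSV in the working directory (next to the script when it is run from its own folder), and any columns added to the DataFrame later simply become extra CSV columns:
df.to_csv('data.csv', index=False)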

Speed up pd.read_excel in python

I'm writing Python that reads Excel data and imports it into a database. 10,000–30,000 records are fine, but 150,000+ records take over 13 seconds. How can I speed it up?
f = request.files['file']
all_data = {}  # insert group data
df = pd.read_excel(f)
df = df.dropna(how='all')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.index.name = excel_conf['config']['default_config']['identification_column']['name']  # column header identification
df.index += 1  # raise index to a valid one
df = df.fillna("")

########## this loop takes time ##########
for index, row in df.iterrows():
    row_dict = []
    for key in excel_conf['config']['column_config']:  # column header name list
        row_dict.append({
            key: row[key]
            # key (from excel config), row[key] (row value from excel)
        })
    index_key = index_keygen(create_blake2s_signature(...))  # just creates the index key, shortened here
    # add child data to main
    all_data[index_key] = row_dict
    # "test_key" : { "key": "value", ... }
##########################################

insert_db(all_data)  # this is fast
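No answer is recorded here, but the usual bottleneck in this pattern is the iterrows() loop rather than read_excel itself: iterrows() builds a Series for every row. A hedged sketch of one common speed-up, converting the frame to plain dicts in a single call and reusing the (shortened) helpers from the question:
# assumes the same df, excel_conf, index_keygen and create_blake2s_signature as above
keys = list(excel_conf['config']['column_config'])

# one vectorised conversion instead of building a Series per row with iterrows()
records = df[keys].to_dict(orient='records')

all_data = {}
for index, record in zip(df.index, records):
    row_dict = [{key: record[key]} for key in keys]
    index_key = index_keygen(create_blake2s_signature(...))  # unchanged helper, arguments elided as in the question
    all_data[index_key] = row_dict

insert_db(all_data)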

openpyxl read Table to python

I'm trying to read an Excel Table object into Python and can't find any syntax for doing so.
It would be useful to read a whole table into e.g. a dict of dicts (I'm trying not to pull in pandas as a dependency for this particular project).
The code below will read through the table row by row; you can also specify the range:
import openpyxl

wb = openpyxl.load_workbook('example.xlsx')
sheet = wb.active
# iterate over an explicit range row by row
for i1, i2 in sheet['A1':'B7']:
    print("{0:8} {1:8}".format(i1.value, i2.value))
Output:
Student_name Marks
Tony Stark 47
Loki 59
Oddin 73
Nick Fury 62
Samaul 75
Peter Parkar 80
I got it working as follows:
import openpyxl

def all_tables_data(filename: str) -> dict:
    """
    Get values for all tables in a spreadsheet.
    Returns a dict of tables, keyed by table name.
    Table values are given as lists of lists.
    """
    workbook = openpyxl.load_workbook(filename)
    tables_by_name = {}
    table_worksheets = {}
    for worksheet in workbook.worksheets:
        for table in worksheet._tables:
            tables_by_name[table.name] = table
            table_worksheets[table.name] = worksheet

    def get_vals(table_name: str) -> list:
        worksheet = table_worksheets[table_name]
        cell_range = worksheet[tables_by_name[table_name].ref]
        return [[cell.value for cell in row] for row in cell_range]

    return {table_name: get_vals(table_name) for table_name in tables_by_name}

if __name__ == "__main__":
    FILENAME = "my_spreadsheet.xlsx"
    TABLES = all_tables_data(FILENAME)
    print(TABLES)
I'm interested in converting tables to dicts.
Often the rows/entries in a table may have >1 key.
Assuming the table has a header row, I also cobbled the code below together.
It can be used to convert a table to a dict of rows, with each row being a dict of values, keyed by the column headers.
import openpyxl

class WorkbookWithTables:
    def __init__(self, workbook):
        self.workbook = workbook
        self.tables_by_name = {}
        self.table_worksheets = {}
        for worksheet in self.workbook.worksheets:
            for table in worksheet._tables:
                self.tables_by_name[table.name] = table
                self.table_worksheets[table.name] = worksheet

    @classmethod
    def from_file(cls, filename):
        _workbook = openpyxl.load_workbook(filename)
        return cls(_workbook)

    def table_to_dict(self, table_name, n_keys=1):
        worksheet = self.table_worksheets[table_name]
        cell_range = worksheet[self.tables_by_name[table_name].ref]
        table_dict = {}
        value_headers = tuple(cell.value for cell in cell_range[0][n_keys:])
        for row in cell_range[1:]:
            if n_keys == 1:
                key = row[0].value
            else:
                key = tuple(partial_key.value for partial_key in row[:n_keys])
            values = {
                value_headers[i]: cell.value
                for i, cell in enumerate(row[n_keys:])
            }
            table_dict[key] = values
        return table_dict

if __name__ == "__main__":
    FILENAME = "my_spreadsheet.xlsx"
    WB = WorkbookWithTables.from_file(FILENAME)
    MY_SINGLE_KEYED_TABLE = WB.table_to_dict("my_single_keyed_table")
    MY_DOUBLE_KEYED_TABLE = WB.table_to_dict("my_double_keyed_table", 2)
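A version-dependent side note: recent openpyxl releases (3.x) expose a public worksheet.tables mapping keyed by table name, which can replace the private _tables attribute used in both snippets above (its type has changed between releases). A minimal sketch, assuming a reasonably current openpyxl:
import openpyxl

wb = openpyxl.load_workbook("my_spreadsheet.xlsx")
for ws in wb.worksheets:
    for name in ws.tables:          # iterate table names
        table = ws.tables[name]     # look up the Table object by name
        rows = [[cell.value for cell in row] for row in ws[table.ref]]
        print(name, rows)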

New column for each element of the array

I use the csv library to create a table of products.
In order to then import it to the site, I need each characteristic to be written in a separate column.
Adding a new row is done using a simple loop:
writer = csv.writer(csvfile)
for product in products:
    writer.writerow((product['price'],
                     product['vendor_code'],
                     product['characteristics']))
Adding a new product:
product = []
product.append({
    'price': price,
    'vendor_code': vendor_code,
    'characteristics': characteristics,
})
characteristics is a list that contains each characteristic as a separate element.
How do I get the output file in this form:
190$ #0172 characteristic1 characteristic2 characteristic3
characteristics - initialization:
try:
    characteristics = []
    soup_characteristics = soup.find_all('tr', {'class': 'product_card__product_characters_item clearfix'})
    for ch in soup_characteristics:
        characteristics.append(re.sub('\s\s+|\n', ' ', ch.text))
except AttributeError:
    characteristics = ""
Try unpacking the characteristics array:
for product in products:
    writer.writerow((product['price'],
                     product['vendor_code'],
                     *product['characteristics']))
Here is the code I tested:
import csv

products = [{
    'price': 100,
    'vendor': 123,
    'characters': [7, 8, 9],
}]

with open('test.csv', 'w') as fo:
    writer = csv.writer(fo)
    for p in products:
        writer.writerow((
            p['price'],
            p['vendor'],
            *p['characters'],
        ))
Here is the content of the test.csv file:
100,123,7,8,9
You should be able to build a list to write as an entire row:
for product in products:
    row = [product['price'], product['vendor_code']]  # [price, vendor_code]
    row.extend(product['characteristics'])  # [price, vendor_code, characteristic1, characteristic2, ...]
    writer.writerow(row)  # writes each value in the list as a new column
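Putting the unpacking idea together with the field names from the question, a minimal sketch (the sample row is made up to match the expected output shown above):
import csv

# made-up sample row using the question's field names
products = [{'price': '190$',
             'vendor_code': '#0172',
             'characteristics': ['characteristic1', 'characteristic2', 'characteristic3']}]

with open('products.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for product in products:
        # unpacking puts each characteristic in its own column
        writer.writerow((product['price'],
                         product['vendor_code'],
                         *product['characteristics']))
# products.csv -> 190$,#0172,characteristic1,characteristic2,characteristic3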
