While processing a large number of files, I get a MemoryError at some point. Is there a way to handle it? Am I handling the files the wrong way, or is there a better way of doing this?
Here is the code I have tried so far:
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import pandas as pd
import os
# Build one "<patch prefix>_debt_measure.xlsx" workbook per matching
# device/patch CSV pair, starting from the template workbook.
opco = ['msd', 'ped', 'med', 'suy_consol', 'sidy', 'giser', 'sdisa']

print("Start reading the template file\n")
wb = load_workbook(filename='template_patch_debt.xlsx', data_only=True)
print('###########Done Reading the Template File#######' + '\n')

print('#########Start Reading The CSV File#############')
csv_files = [f for f in os.listdir() if f.endswith(".csv")]
filelist_device = [f for f in csv_files if 'device' in f]
filelist_patch = [f for f in csv_files if 'patch' in f]

for file_device in filelist_device:
    for file_patch in filelist_patch:
        for opco_name in opco:
            # A pair is processed only when both file names contain the
            # same opco name (substring match).
            if opco_name not in file_device or opco_name not in file_patch:
                continue
            print(opco_name)

            # Fresh export sheets for every output file.  Blanking the cells
            # of reused sheets (cell.value = None) keeps every cell object
            # and the append position, so each iteration would add its rows
            # below the previous ones and the workbook would keep growing
            # until saving runs out of memory.
            ws = wb.create_sheet('vuln_export')
            ws1 = wb.create_sheet('device_export')

            print("Start Reading for the device file for:" + file_device + '\n')
            df_1 = pd.read_csv(file_device)
            for r in dataframe_to_rows(df_1, index=False, header=True):
                ws1.append(r)

            print("Start reading patch_debt file :-" + file_patch + '\n')
            df = pd.read_csv(file_patch)
            for r in dataframe_to_rows(df, index=False, header=True):
                ws.append(r)

            wb.save(file_patch.rsplit('_', 1)[0] + '_debt_measure.xlsx')

            # Drop the filled sheets and the frames so their memory can be
            # reclaimed before the next pair is loaded.
            wb.remove(ws)
            wb.remove(ws1)
            del df, df_1

# Close once, after the last save; the workbook is reused inside the loop.
wb.close()
Here is the error I am getting:
MemoryError Traceback (most recent call last)
<ipython-input-1-a60a13cb6158> in <module>
41 for r in dataframe_to_rows(df, index=False, header=True):
42 ws.append(r)
---> 43 wb.save((file_patch.rsplit('_',1)[0] +'_debt_measure' '.xlsx'))
44 wb.close()
Related
This code used to open an existing xlsx file and write over it, but after updating from pandas 1.1.5 to 1.5.1 I got `zipfile.BadZipFile: File is not a zip file`.
Then I read here that after pandas 1.2.0 the pd.ExcelWriter(report_path, engine='openpyxl') creates a new file but as this is a completely empty file, openpyxl cannot load it.
Knowing that, I changed the code to this one, but now I'm getting AttributeError: property 'sheets' of 'OpenpyxlWriter' object has no setter. How should I handle this?
# Append a DataFrame below the existing rows of Resultados.xlsx by handing an
# already-loaded openpyxl workbook to the pandas writer.
book = load_workbook('Resultados.xlsx')
# NOTE: from pandas 1.2.0 on, creating the writer produces a new, empty file
# at this path, which openpyxl cannot load afterwards.
writer = pd.ExcelWriter('Resultados.xlsx', engine='openpyxl')
writer.book = book
# NOTE: on pandas 1.5.1 this assignment raises AttributeError because
# `sheets` is a property without a setter.
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
reader = pd.read_excel(r'Resultados.xlsx')
# `dict_` is not defined in this snippet; it comes from the surrounding code.
df = pd.DataFrame.from_dict(dict_)
# Start writing after the rows counted in `reader` (plus one for the header row).
df.to_excel(writer, index=False, header=False, startrow=len(reader) + 1)
writer.close()
TLDR
Use .update to modify writer.sheets
Rearrange the order of your script to get it working
# Read the existing rows and load the workbook BEFORE creating the
# ExcelWriter: creating the writer replaces the file on disk with an empty
# one that can no longer be read.
reader = pd.read_excel("Resultados.xlsx", engine="openpyxl")
book = load_workbook("Resultados.xlsx")

# `with` saves and closes the writer even if an exception is raised inside.
# `df` is the DataFrame to append; it is defined by the surrounding code.
with pd.ExcelWriter("Resultados.xlsx", engine="openpyxl") as writer:
    # NOTE: assigning `book` is not public pandas API and emits a
    # FutureWarning on pandas 1.5.
    writer.book = book
    # `sheets` has no setter, so update the existing mapping in place.
    writer.sheets.update({ws.title: ws for ws in book.worksheets})
    # Write below the existing rows (plus one for the header row).
    df.to_excel(writer, index=False, header=False, startrow=len(reader) + 1)
Details
Recreating your problem with some fake data
import numpy as np
from openpyxl import load_workbook
import pandas as pd
if __name__ == "__main__":
    # Reproduction script: creates Resultados.xlsx, then repeats the failing
    # approach unchanged so the reported errors can be observed.

    # make some random data
    np.random.seed(0)
    df = pd.DataFrame(np.random.random(size=(5, 5)))

    # this makes an existing file
    with pd.ExcelWriter("Resultados.xlsx", engine="openpyxl") as writer:
        df.to_excel(excel_writer=writer)

    # make new random data
    np.random.seed(1)
    df = pd.DataFrame(np.random.random(size=(5, 5)))

    # what you tried... (intentionally left failing)
    book = load_workbook("Resultados.xlsx")
    writer = pd.ExcelWriter("Resultados.xlsx", engine="openpyxl")
    # emits a FutureWarning: `book` is not part of the public API
    writer.book = book
    # raises AttributeError: `sheets` is a read-only property
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
    reader = pd.read_excel("Resultados.xlsx")

    # skipping this step as we defined `df` differently
    # df = pd.DataFrame.from_dict(dict_)
    df.to_excel(writer, index=False, header=False, startrow=len(reader) + 1)
    writer.close()
We get the same error plus a FutureWarning
...\StackOverflow\answer.py:23: FutureWarning: Setting the `book` attribute is not part of the public API, usage can give unexpected or corrupted results and will be removed in a future version
writer.book = book
Traceback (most recent call last):
File "...\StackOverflow\answer.py", line 24, in <module>
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
AttributeError: can't set attribute 'sheets'
The AttributeError is because sheets is a property of the writer instance. If you're unfamiliar with it, here is a resource.
In shorter terms, the exception is raised because sheets cannot be modified in the way you're trying. However, you can do this:
# `sheets` cannot be reassigned, so fill the existing mapping in place,
# keyed by worksheet title
writer.sheets.update({sheet.title: sheet for sheet in book.worksheets})
That will move us past the AttributeError, but we'll hit a ValueError a couple of lines down:
reader = pd.read_excel("Resultados.xlsx")
Traceback (most recent call last):
File "...\StackOverflow\answer.py", line 26, in <module>
reader = pd.read_excel("Resultados.xlsx")
...
File "...\lib\site-packages\pandas\io\excel\_base.py", line 1656, in __init__
raise ValueError(
ValueError: Excel file format cannot be determined, you must specify an engine manually.
Do what the error message says and supply an argument to the engine parameter
# Name the engine explicitly instead of letting pandas detect the file format.
reader = pd.read_excel("Resultados.xlsx", engine="openpyxl")
And now we're back to your original zipfile.BadZipFile exception
Traceback (most recent call last):
File "...\StackOverflow\answer.py", line 26, in <module>
reader = pd.read_excel("Resultados.xlsx", engine="openpyxl")
...
File "...\Local\Programs\Python\Python310\lib\zipfile.py", line 1334, in _RealGetContents
raise BadZipFile("File is not a zip file")
zipfile.BadZipFile: File is not a zip file
After a bit of toying, I noticed that the Resultados.xlsx file could not be opened manually after running this line:
# NOTE: constructing the writer is the step after which Resultados.xlsx could
# no longer be opened manually.
writer = pd.ExcelWriter("Resultados.xlsx", engine="openpyxl")
So I reordered some of the steps in your code:
# run before initializing the ExcelWriter: creating the writer replaces the
# file on disk, so the existing content must be read first
reader = pd.read_excel("Resultados.xlsx", engine="openpyxl")
book = load_workbook("Resultados.xlsx")

# the old way
# writer = pd.ExcelWriter("Resultados.xlsx", engine="openpyxl")

# `with` saves and closes the writer even when an exception is raised.
# `df` is the DataFrame to append; it is defined by the surrounding code.
with pd.ExcelWriter("Resultados.xlsx", engine="openpyxl") as writer:
    # NOTE: assigning `book` is not public pandas API (FutureWarning).
    writer.book = book
    # `sheets` has no setter, so update the existing mapping in place.
    writer.sheets.update({ws.title: ws for ws in book.worksheets})
    df.to_excel(writer, index=False, header=False, startrow=len(reader) + 1)
try this:
filepath = r'Resultados.xlsx'

# Count the existing rows before the writer opens the same file for
# appending, so the workbook is not read while it is held open for writing.
reader = pd.read_excel(filepath)

# mode='a' keeps the existing workbook; if_sheet_exists='overlay' writes into
# the existing sheet instead of replacing or renaming it.
# `df` is the DataFrame to append; it is defined by the surrounding code.
with pd.ExcelWriter(
        filepath,
        engine='openpyxl',
        mode='a',
        if_sheet_exists='overlay') as writer:
    df.to_excel(
        writer,
        # first row after the existing data (plus one for the header row)
        startrow=reader.shape[0] + 1,
        index=False,
        header=False)
I'm trying to convert all CSV files within a directory into one XLSX file, with each CSV file becoming a separate worksheet.
The code below works except when I provide the input path in this Line
"for filename in glob.glob(InputPath + "*.csv"):"
I get this error - InvalidWorksheetName: Invalid Excel character '[]:*?/' in sheetname
Does anyone have a suggestion how I can get around this? Full code is below - Thanks!
import xlsxwriter
import glob
import csv
# Compile every CSV file in InputPath into one workbook, one worksheet per file.
import os

# Raw strings already keep backslashes literal, so they must not be doubled.
InputPath = r"C:\Users\.spyder-py3"
workbook = xlsxwriter.Workbook(r"C:\Users\.spyder-py3\Output\compiled.xlsx")

for filename in glob.glob(os.path.join(InputPath, "*.csv")):
    # Name the sheet after the file only: the full path contains ':' and '\',
    # which Excel rejects in sheet names.  Excel also limits sheet names to
    # 31 characters.
    # NOTE(review): two files whose names share the first 31 characters
    # would collide — confirm this cannot happen with the real inputs.
    sheet_name = os.path.splitext(os.path.basename(filename))[0][:31]
    ws = workbook.add_worksheet(sheet_name)
    print(filename)
    # newline='' is what the csv module expects; `with` closes the file.
    with open(filename, 'r', newline='') as csv_file:
        spamReader = csv.reader(csv_file, delimiter=',', quotechar='"')
        for row_count, row in enumerate(spamReader):
            for col, value in enumerate(row):
                ws.write(row_count, col, value)

workbook.close()
Try this:
import xlsxwriter
import glob
import csv
# Compile every CSV file in InputPath into one workbook, one worksheet per file.
import os

InputPath = r"C:\Users\.spyder-py3"
workbook = xlsxwriter.Workbook(r"C:\Users\.spyder-py3\Output\compiled.xlsx")

for filename in glob.glob(os.path.join(InputPath, "*.csv")):
    # filename.split('.')[0] would return the directory part of the path
    # (everything before the first '.'), which still contains ':' and '\'.
    # Use the bare file name without its extension, cut to Excel's
    # 31-character sheet-name limit.
    # NOTE(review): two files whose names share the first 31 characters
    # would collide — confirm this cannot happen with the real inputs.
    sheet_name = os.path.splitext(os.path.basename(filename))[0][:31]
    ws = workbook.add_worksheet(sheet_name)
    print(filename)
    # newline='' is what the csv module expects; `with` closes the file.
    with open(filename, 'r', newline='') as csv_file:
        spamReader = csv.reader(csv_file, delimiter=',', quotechar='"')
        for row_count, row in enumerate(spamReader):
            for col, value in enumerate(row):
                ws.write(row_count, col, value)

workbook.close()
I'm trying to read values from an xlsx file containing formulas using openpyxl; however, I noticed that for some cells, I'm getting a wrong value.
Here's the XLSX example:
Here's the result I get:
The code:
# Fragment of a view body (the enclosing function header is not shown):
# loads the workbook given by `excel_file`, collects every cell of sheet
# "Feuil1" as text and renders it with the 'myapp/index.html' template.
# NOTE(review): per the openpyxl documentation, data_only=True returns the
# value stored when the file was last saved by Excel rather than the formula;
# nothing is recalculated here — confirm whether that explains the values seen.
wb = openpyxl.load_workbook(excel_file, data_only=True)
# getting all sheets
sheets = wb.sheetnames
print(sheets)
# getting a particular sheet
worksheet = wb["Feuil1"]
print(worksheet)
# getting active sheet
active_sheet = wb.active
print(active_sheet)
# reading a cell
print(worksheet["A1"].value)
excel_data = list()
# iterating over the rows and
# getting value from each cell in row
for row in worksheet.iter_rows():
row_data = list()
for cell in row:
#cell.number_format='0.0########'
print(cell.number_format)
# str() turns every value into text; a None value becomes the text 'None'
row_data.append(str(cell.value))
print(cell.value)
excel_data.append(row_data)
return render(request, 'myapp/index.html', {"excel_data":excel_data})
What do you want to get from the open Excel file, i.e. in which format do you need the
data?
This is my answer for getting data from an Excel file with xlrd.
import xlrd
from xlrd import open_workbook
# Sketch: decode a base64 payload into a temporary file, open it with xlrd and
# turn each data row into a dict keyed by the values of the first row.
# NOTE: `selected file`, `file name` and `sheet name` are placeholders, not
# valid Python; `tempfile`, `binascii`, `filetype` and `header_row` are not
# imported or defined in this snippet.
fp = tempfile.NamedTemporaryFile(delete= False, suffix=filetype)
fp.write(binascii.a2b_base64(selected file))
workbook = xlrd.open_workbook(file name)
sheet = workbook.sheet_by_name(sheet name)
# NOTE(review): this list is never read; `row` is rebound by the loop below.
row = [c or '' for c in sheet.row_values(header_row)]
# Values of row 0, used as dictionary keys.
first_row = []
for col in range(sheet.ncols):
first_row.append(sheet.cell_value(0,col) )
# One dict per row, starting at row 1 (row 0 holds the keys).
archive_lines = []
for row in range(1, sheet.nrows):
elm = {}
for col in range(sheet.ncols):
elm[first_row[col]]=sheet.cell_value(row,col)
archive_lines.append(elm)
I tried the code below. When I open the resulting file in Excel, I am told that the file is damaged and it has to be repaired before it can be read. How do I avoid the repair prompt when the xlsx file is opened?
import csv
from openpyxl import Workbook
def unicode_csv_reader(utf8_data, dialect=csv.excel, **kwargs):
    """Yield each CSV row of *utf8_data* as a list of text strings.

    Args:
        utf8_data: Iterable of lines (for example an open file) passed to
            ``csv.reader``.
        dialect: CSV dialect handed to ``csv.reader``.
        **kwargs: Extra formatting parameters for ``csv.reader``.

    Yields:
        One list per row.  Cells that arrive as byte strings are decoded as
        UTF-8; cells that are already text are returned unchanged.
    """
    for row in csv.reader(utf8_data, dialect=dialect, **kwargs):
        # The Python 2 only `unicode` builtin raises NameError on Python 3,
        # where csv already yields text, so only byte strings are decoded.
        yield [
            cell.decode('utf-8') if isinstance(cell, bytes) else cell
            for cell in row
        ]
# Copy the first 10 rows of the CSV file into a new workbook.
from itertools import islice

filename = 'WS_Data_0_2019_09_25_215340128516.csv'
wb = Workbook()
ws = wb.active

# lets just save 10 rows in a xlsx
# `with` closes the CSV file; islice stops after 10 rows instead of counting
# through the whole file.
with open(filename) as csv_file:
    reader = unicode_csv_reader(csv_file)
    for row in islice(reader, 10):
        ws.append(row)

wb.save('example.xlsx')
I want to read the data given in 2nd and 3rd column from XLSX file.
import xlrd
# Print the 2nd and 3rd column of every row of the first sheet.
# The second positional parameter of xlrd.open_workbook is `logfile`, not a
# file mode, so "rb" must not be passed.
# NOTE: xlrd 2.x reads only .xls files; .xlsx needs xlrd < 2.0 or openpyxl.
workbook = xlrd.open_workbook("C:/Users/File.xlsx")
sheet = workbook.sheet_by_index(0)
for row in range(sheet.nrows):
    # Column indexes are 0-based with an exclusive end: 1..2 are the 2nd and
    # 3rd columns.  (`a and b` would only return one of the two lists.)
    cols = sheet.row_values(row, 1, 3)
    print(cols)
But it gives the error below when I execute the above script:
biff_version = bk.getbof(XL_WORKBOOK_GLOBALS) File
C:\Python27\.....\xlrd_init_.py", line 1323, in getbof raise
XLRDError('Expected BOF record; found 0x%04x' % opcode)
xlrd.biffh.XLRDError: Expected BOF record; found 0x4b50
Try this
import xlrd
# Collect the 2nd and 3rd column of every row of every sheet.
# The second positional parameter of xlrd.open_workbook is `logfile`, not a
# file mode, so "rb" must not be passed.
workbook = xlrd.open_workbook("C:/Users/File.xlsx")
sheets = workbook.sheet_names()
required_data = []
for sheet_name in sheets:
    sh = workbook.sheet_by_name(sheet_name)
    for rownum in range(sh.nrows):
        # 0-based columns 1 and 2 are the requested 2nd and 3rd columns;
        # slicing through row_values avoids an IndexError on narrow sheets.
        required_data.append(tuple(sh.row_values(rownum, 1, 3)))
# print() with a single argument works on both Python 2 and Python 3
print(required_data)
This example reads all the content of the Excel sheet and puts it in a matrix (list of lists); then you can use the columns you need:
import xlrd
# Load the whole first sheet into a matrix (list of row lists).
# The second positional parameter of xlrd.open_workbook is `logfile`, not a
# file mode, so "rb" must not be passed.
workbook = xlrd.open_workbook("C:/Users/File.xlsx")
sheet = workbook.sheet_by_index(0)
# row_values(i) returns the cell values of row i, one entry per column
rows = [sheet.row_values(i) for i in range(sheet.nrows)]
# print() with a single argument works on both Python 2 and Python 3
print(rows)