I am trying to write data to an Excel file during a for loop, but all I get is a single line containing the last data received by the loop.
I have tried a couple of different methods but came up short; two attempts are listed below.
Any ideas?
def write_excel(x):
    workbook = xlsxwriter.Workbook('ID_Num.xlsx')
    worksheet = workbook.add_worksheet()

    df = pd.DataFrame(
        {'ID': [x],
         'mail_one': [Email],
         'second_mail': [second_mail],
         'Num': [Num],
         'date': [Date]})

    row_num = 0
    for key, value in df.items():
        worksheet.write(0, row_num, key)
        worksheet.write_row(1, row_num, value)
        row_num += 1

    workbook.close()

    # df = pd.DataFrame(
    #     {'ID': [x],
    #      'mail_one': [Email],
    #      'second_mail': [second_mail],
    #      'Num': [Num],
    #      'date': [Date]})
    # writer = ExcelWriter('ID_Num.xlsx')
    # df.to_excel(writer, 'ID_Num', index=False)
    # writer.save()
if __name__ == "__main__":
    for x in List:
        my_dict = {}
        my_dict["ID"] = x
        my_dict["mail_one"] = Email
        my_dict["second_mail"] = second_mail
        my_dict["Num"] = str(Num)
        my_dict["date"] = Date
        print(my_dict)
        write_excel(x)
I don't have xlsxwriter so I cannot test, but the documentation says that it cannot modify an existing file, so I suspect that on every iteration of for x in List: you are overwriting your file (workbook = xlsxwriter.Workbook('ID_Num.xlsx')).
You can make multiple files with these changes:
def write_excel(x, i):
    workbook = xlsxwriter.Workbook(f'ID_Num{i}.xlsx')
    ...

# and

for i, x in enumerate(List):
    ...
    write_excel(x, i)
Or you could accumulate multiple dictionaries and pass all of them to your function:

data = []
for x in List:
    my_dict = {}
    ...
    data.append(my_dict)

write_excel(data)
Then change the function to iterate over those dicts, making a new sheet for each one:
def write_excel(data):
    workbook = xlsxwriter.Workbook('ID_Num.xlsx')
    for sht in data:
        worksheet = workbook.add_worksheet()
        df = pd.DataFrame(...
        row_num = 0
        for key, value in df.items():
            worksheet.write(...
            worksheet.write_row(...
            row_num += 1
    workbook.close()
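For reference, here is how those pieces could fit together end to end. This is only a sketch based on the snippets above; Email, second_mail, Num, Date and List are assumed to exist exactly as in the question, and each accumulated dict becomes its own sheet in a single workbook:

import xlsxwriter
import pandas as pd

def write_excel(data):
    # One workbook, one sheet per accumulated dictionary
    workbook = xlsxwriter.Workbook('ID_Num.xlsx')
    for sht in data:
        worksheet = workbook.add_worksheet()
        df = pd.DataFrame({key: [value] for key, value in sht.items()})
        col_num = 0
        for key, value in df.items():
            worksheet.write(0, col_num, key)        # header in row 0
            worksheet.write_row(1, col_num, value)  # values in row 1
            col_num += 1
    workbook.close()

if __name__ == "__main__":
    # Email, second_mail, Num, Date and List are assumed to be defined elsewhere, as in the question
    data = []
    for x in List:
        my_dict = {"ID": x, "mail_one": Email, "second_mail": second_mail,
                   "Num": str(Num), "date": Date}
        data.append(my_dict)
    write_excel(data)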
I'm trying to get this kind of result.
Here is the csv file:
OsmID,NewName,IdLocal
1020287758,NN1,Id0001
1021229973,NN2,Id0002
1025409497,NN3,Id0003
I'm using the code below:
import csv

input = r'C:\Users\_M92\csvFiles\csv0001.csv'
fileRead = open(input, 'r')

with open(input, 'r') as csv:
    headerLine = fileRead.readline()
    header = headerLine.split(",")
    # print(header)
    nameIndex = header.index("OsmID")

    output = {}
    for line in fileRead.readlines():
        values = line.split(",")
        output[values[nameIndex]] = values

print(output)
And it results in the following error:
File "c:\Users\_M92\Scripts\CsvToDict.py",
line 19, in <module>
nameIndex = header.index("OsmID")
ValueError: 'OsmID' is not in list
Instead of manually splitting each line by commas, use the CSV module that you've imported. This module contains a DictReader class that will yield dictionaries for each row. Then, you just need to add this to your output dictionary.
# Create an empty dictionary
# We will add keys to this as needed
output = {}

# Keep track of number of rows, so we can add an empty column if needed
row_count = 0

# This function adds a row to the output dictionary
def add_row(row_dict):
    global row_count  # Need to declare this as global because we're assigning to the variable in this function
    if not row_dict: return  # If row is empty, do nothing
    for k, v in row_dict.items():
        # Loop over all key-value pairs in the row to add
        if k not in output:  # If the output doesn't contain this column, create a blank column
            output[k] = [None] * row_count
        output[k].append(v)  # Append the value to the correct column in output
    row_count += 1

input_file = r'C:\Users\_M92\csvFiles\csv0001.csv'
with open(input_file, 'r') as fh:
    reader = csv.DictReader(fh)  # Create a DictReader
    for row in reader:
        add_row(row)  # Add every row to the output
This gives the following output:
{'OsmID': ['1020287758', '1021229973', '1025409497'],
'NewName': ['NN1', 'NN2', 'NN3'],
'IdLocal': ['Id0001', 'Id0002', 'Id0003']}
Note: I removed the blank lines in the input csv you provided, but it doesn't make a difference to the program, since a blank line will yield an empty dictionary from DictReader, and add_row doesn't do anything with empty dicts.
Note 2: You could discard the row_count variable if you dynamically count the number of rows like so:
def add_row(row_dict):
    row_count = 0
    for first_key, first_val in output.items():
        row_count = len(first_val)
        break  # We can just break out here because all keys should have the same number of values

    # Create keys that do not yet exist in output but do exist in the new row
    existing_keys = set(output.keys())
    new_row_keys = set(row_dict.keys())
    keys_to_create = new_row_keys - existing_keys
    for key in keys_to_create:
        output[key] = [None] * row_count

    # Append to each column in output
    for key in output:
        output[key].append(row_dict.get(key, None))  # If the key doesn't exist in the current row, append None
You could use pandas:
import pandas as pd
f = r'C:\Users\_M92\csvFiles\csv0001.csv'
df = pd.read_csv(f).to_dict('list')
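For comparison with the output shown above, printing df should give roughly the same mapping; note that read_csv infers dtypes, so OsmID comes back as integers rather than strings:

print(df)
# {'OsmID': [1020287758, 1021229973, 1025409497],
#  'NewName': ['NN1', 'NN2', 'NN3'],
#  'IdLocal': ['Id0001', 'Id0002', 'Id0003']}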
Here is a snippet you can start from. This is the "from scratch" method; please use a library to do it properly:
import os

input_path = r'test.csv'
header_line = 0
sep_csv_line = "\n\n"
sep_csv_column = ","

with open(os.path.join(os.path.dirname(__file__), input_path), 'r') as csv:
    content = csv.read()

split = content.split(sep_csv_line)
columns = split[header_line].split(sep_csv_column)
print(f"{columns = }")

output = {}
for column in columns:
    output[column] = []

for line in split[header_line+1:]:
    print(f"{line = }")
    elements = line.split(sep_csv_column)
    print(f"{elements = }")
    for i, column in enumerate(columns):
        element = elements[i]
        print(f"{element = }")
        output[column].append(element)

print(f"{output = }")
print(f"{output['OsmID'] = }")
Here is the console output:
columns = ['OsmID', 'NewName', 'IdLocal']
line = '1020287758,NN1,Id0001'
elements = ['1020287758', 'NN1', 'Id0001']
element = '1020287758'
element = 'NN1'
element = 'Id0001'
line = '1021229973,NN2,Id0002'
elements = ['1021229973', 'NN2', 'Id0002']
element = '1021229973'
element = 'NN2'
element = 'Id0002'
line = '1025409497,NN3,Id0003'
elements = ['1025409497', 'NN3', 'Id0003']
element = '1025409497'
element = 'NN3'
element = 'Id0003'
output = {'OsmID': ['1020287758', '1021229973', '1025409497'], 'NewName': ['NN1', 'NN2', 'NN3'], 'IdLocal': ['Id0001', 'Id0002', 'Id0003']}
output['OsmID'] = ['1020287758', '1021229973', '1025409497']
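Note that this snippet assumes a blank line between records (sep_csv_line = "\n\n"). If the csv looks exactly like the sample at the top of the question, with one record per line, the separator would presumably just be a single newline:

sep_csv_line = "\n"  # one record per line when there are no blank separator lines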
I would like to write different data to different sheets in Excel. So far I am writing everything to one sheet.
I want a different sheet (sheet1, sheet2, ...) for each i in the for loop:
for i in range(1, 11):
    for k, v in drd_dictionary.items():
        if str(i) + "_" in k:
            an.append(k)
            reqs.append(v)
    sheet_number = i
    write_to_excel(an, reqs, str(sheet_number))

def write_to_excel(an, req_text, sheet_number):
    workbook = xlsxwriter.Workbook('path.xlsx')
    worksheet = workbook.add_worksheet()
    worksheet.write_column('A1', an)
    worksheet.write_column('B1', req_text)
    workbook.close()
Looks like you're overwriting the workbook with each iteration. Pass the workbook as a parameter to the function. Initialize the workbook before the loop.
def write_to_excel(workbook, an, req_text, sheet_number):
    worksheet = workbook.add_worksheet(sheet_number)
    worksheet.write_column('A1', an)
    worksheet.write_column('B1', req_text)
Here is the full working code:
import xlsxwriter

def write_to_excel(workbook, an, req_text, sheet_number):
    worksheet = workbook.add_worksheet(sheet_number)
    for i in range(len(an)):
        worksheet.write(i, 0, an[i])        # column A
    for j in range(len(req_text)):
        worksheet.write(j, 1, req_text[j])  # column B

drd_dictionary = {"1_": "a", "2_": "2"}
an, reqs = [], []
workbook = xlsxwriter.Workbook("data.xlsx")

for i in range(1, 11):
    for k, v in drd_dictionary.items():
        if str(i) + "_" in k:
            an.append(k)
            reqs.append(v)
    sheet_number = i
    write_to_excel(workbook, an, reqs, str(sheet_number))

workbook.close()
I have some code to open an Excel file and save it as a pandas dataframe. It was originally used in Python 2.7 and I am currently trying to make it work under Python 3.
Originally, I used the code from #myidealab in this other post: From password-protected Excel file to pandas DataFrame.
It currently looks like this:
data_file = <path_for_file>

# Load excel file
xlApp = win32com.client.Dispatch("Excel.Application")
xlApp.Visible = False
pswd = getpass.getpass('password: ')
xldatabase = xlApp.Workbooks.Open(data_file, False, True, None, pswd)

dfdatabase = []
for sh in xldatabase.Sheets:
    xlsheet = xldatabase.Worksheets(sh.Name)

    # Get last_row
    row_num = 0
    cell_val = ''
    while cell_val != None:
        row_num += 1
        cell_val = xlsheet.Cells(row_num, 1).Value
    last_row = row_num - 1

    # Get last_column
    col_num = 0
    cell_val = ''
    while cell_val != None:
        col_num += 1
        cell_val = xlsheet.Cells(1, col_num).Value
    last_col = col_num - 1

    # Get content
    content = xlsheet.Range(xlsheet.Cells(1, 1), xlsheet.Cells(last_row, last_col)).Value

    # Load each sheet as a dataframe
    dfdatabase.append(pd.DataFrame(list(content[1:]), columns=content[0]))
Now, I am getting the following error:
AttributeError: 'pywintypes.datetime' object has no attribute 'nanosecond'
The problem seems to boil down to the lines below:
# Get content
content = xlsheet.Range(xlsheet.Cells(1, 1), xlsheet.Cells(last_row, last_col)).Value

# Load each sheet as a dataframe
dfdatabase.append(pd.DataFrame(list(content[1:]), columns=content[0]))
xlsheet.Range().Value is reading the data and assigning pywintypes descriptors to it, which pd.DataFrame() fails to interpret.
Has anyone run into this issue before? Is there a way to specifically tell xlsheet.Range().Value how to read the values in a way that pandas can interpret?
Any help will be welcome. Thank you.
This solves the issue, assuming you know beforehand the size/formatting of your dates/times in the Excel sheet.
There may be other, more elegant ways to solve it, nonetheless.
Note: content is initially a tuple. Position [0] holds the headers and the remaining positions contain the data.
import datetime
import pywintypes

...

content = xlsheet.Range(xlsheet.Cells(1, 1), xlsheet.Cells(last_row, last_col)).Value

head = content[0]
data = list(content[1:])

for x in range(0, len(data)):
    data[x] = list(data[x])
    for y in range(0, len(data[x])):
        if isinstance(data[x][y], pywintypes.TimeType):
            temp = str(data[x][y]).rstrip("+00:00").strip()
            if len(temp) > 10:
                data[x][y] = datetime.datetime.strptime(temp, "%Y-%m-%d%H:%M")
            elif len(temp) > 5 and len(temp) <= 10:
                data[x][y] = datetime.datetime.strptime(temp, "%Y-%m-%d")
            elif len(temp) <= 5:
                data[x][y] = datetime.datetime.strptime(temp, "%H:%M")
            print(data[x][y])

# Load each sheet as a dataframe
dfdatabase.append(pd.DataFrame(data, columns=head))
Used this as a reference:
python-convert-pywintyptes-datetime-to-datetime-datetime
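As a possible alternative to the string round-trip above, here is a sketch that builds plain datetime objects directly from the pywintypes values. It assumes the PyTime objects expose the usual year/month/day/hour/minute/second attributes and simply drops any timezone information:

import datetime
import pywintypes

def to_plain_datetime(value):
    # Illustrative helper: convert a pywintypes time value to a naive datetime.datetime
    if isinstance(value, pywintypes.TimeType):
        return datetime.datetime(value.year, value.month, value.day,
                                 value.hour, value.minute, value.second)
    return value

data = [[to_plain_datetime(v) for v in row] for row in data]  # 'data' as defined above
dfdatabase.append(pd.DataFrame(data, columns=head))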
So, toward the end of my first file (we'll call it file.py):
def get_excel_data(self):
    """Places excel data into pandas dataframe"""
    # excel_data = pandas.read_excel(self.find_file())
    for extracted_archive in self.find_file():
        excel_data = pandas.read_excel(extracted_archive)
        # print(excel_data)
        columns = pandas.DataFrame(columns=excel_data.columns.tolist())
        excel_data = pandas.concat([excel_data, columns])
        excel_data.columns = excel_data.columns.str.strip()
        excel_data.columns = excel_data.columns.str.replace("/", "_")
        excel_data.columns = excel_data.columns.str.replace(" ", "_")
        total_records = 0
        num_valid_records = 0
        num_invalid_records = 0
        for row in excel_data.itertuples():
            mrn = row.MRN
            total_records += 1
            if mrn in ("", " ", "N/A", "NaT", "NaN", None) or math.isnan(mrn):
                # print(f"Invalid record: {row}")
                num_invalid_records += 1
                # total_invalid = num_invalid_records + dup_count
                excel_data = excel_data.drop(excel_data.index[row.Index])
                # continue
            else:
                # print(mrn)  # outputs all MRN ids
                for row in excel_data.itertuples():
                    num_valid_records += 1
                continue
        with open("./logs/metrics.csv", "a", newline="\n") as f:
            csv_writer = DictWriter(f, ['date', 'total_records', 'processed', 'skipped', 'success_rate'])
            # csv_writer.writeheader()
            currentDT = datetime.datetime.now()
            success_rate = num_valid_records / total_records * 100
            csv_writer.writerow(dict(date=currentDT,
                                     total_records=total_records,
                                     processed=num_valid_records,
                                     skipped=num_invalid_records,
                                     success_rate=num_valid_records / total_records * 100))
    return self.clean_data_frame(excel_data)

def clean_data_frame(self, data_frame):
    """Cleans up dataframes"""
    for col in data_frame.columns:
        if "date" in col.lower():
            data_frame[col] = pandas.to_datetime(data_frame[col],
                                                 errors='coerce', infer_datetime_format=True)
            data_frame[col] = data_frame[col].dt.date
    data_frame['MRN'] = data_frame['MRN'].astype(int).astype(str)
    return data_frame

def get_mapping_data(self):
    map_data = pandas.read_excel(config.MAPPING_DOC, sheet_name='main')
    columns = pandas.DataFrame(columns=map_data.columns.tolist())
    return pandas.concat([map_data, columns])
In my second file, second_file.py, I would like to keep that end state and do another iteration, for instance:
def process_records(self, records, map_data, completed=None, errors=None):
    """Code to execute after webdriver initialization."""
    series_not_null = False
    try:
        num_attempt = 0
        for record in data_frame.itertuples():  # not working
            print(record)
            series_not_null = True
            mrn = record.MRN
            self.navigate_to_search(num_attempt)
            self.navigate_to_member(mrn)
            self.navigate_to_assessment()
            self.add_assessment(record, map_data)
            self.driver.switch_to.parent_frame()  # not working
            sleep(.5)
            error_flag = self.close_member_tab(self.driver, mrn, error_flag)
    except Exception as exc:
        if series_not_null:
            errors = self.process_series_error(exc)
    return completed, error
Both files import pandas.
You can save your dataframe in a pickle file like this. It is also worth noting that you can store almost anything in a pickle file. Here is a link to some more info: pickle info.
import pandas as pd
import pickle

x = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})

# this will create a file called pickledata.p that will store the data frame
with open('pickledata.p', 'wb') as fh:  # notice that you need the 'wb' for the dump
    pickle.dump(x, fh)

# to load the file do this
with open('pickledata.p', 'rb') as fh:  # you need to use 'rb' to read
    df = pickle.load(fh)

# you can now use df like a normal dataframe
print(df)
You don't actually need the '.p' extension for a pickle file, I just like it.
So you save your dataframe at the end of script one, and then load it back in at the start of script two.
Use DataFrame.to_pickle and pandas.read_pickle:
To persist:
df.to_pickle('./dataframe.pkl')

To load:
df = pd.read_pickle('./dataframe.pkl')
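Putting either answer into the context of the question, a minimal sketch could look like the following. The file names, the pickle path and the sample dataframe are purely illustrative:

# file.py (end of script one) -- illustrative
import pandas as pd

excel_data = pd.DataFrame({'MRN': ['123', '456'], 'date': ['2020-01-01', '2020-01-02']})
excel_data.to_pickle('./excel_data.pkl')  # persist the cleaned dataframe

# second_file.py (start of script two) -- illustrative
import pandas as pd

data_frame = pd.read_pickle('./excel_data.pkl')  # restore the same dataframe
for record in data_frame.itertuples():
    print(record.MRN)  # continue processing where script one left off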
I'm parsing an XML string into a CSV string but it's going very slowly:
INDEX_COLUMN = "{urn:schemas-microsoft-com:office:spreadsheet}Index"
CELL_ELEMENT = "Cell"
DATA_ELEMENT = "Data"

def parse_to_csv_string(xml):
    print('parse_to_csv_string')
    csv = []
    parsed_data = serialize_xml(xml)
    rows = list(parsed_data[1][0])
    header = get_cells_text(rows[0])
    rows.pop(0)
    csv.append(join(",", header))
    for row in rows:
        values = get_cells_text(row)
        csv.append(join(",", values))
    return join("\n", csv)

def serialize_xml(xml):
    return ET.fromstring(xml)

def get_cells_text(row):
    keys = []
    cells = normalize_row_cells(row)
    for elm in cells:
        keys.append(elm[0].text or "")
    while len(keys) < 92:
        keys.append("")
    return keys

def normalize_row_cells(row):
    cells = list(row)
    updated_cells = copy.deepcopy(cells)
    pos = 1
    for elm in cells:
        strIndexAttr = elm.get(INDEX_COLUMN)
        index = int(strIndexAttr) if strIndexAttr else pos
        while index > pos:
            empty_elm = ET.Element(CELL_ELEMENT)
            child = ET.SubElement(empty_elm, DATA_ELEMENT)
            child.text = ""
            updated_cells.insert(pos - 1, empty_elm)
            pos += 1
        pos += 1
    return updated_cells
The XML string sometimes misses a few columns and I need to iterate over it to fill in the missing columns - every row must have 92 columns. That's why I have some helper functions to manipulate the XML.
Right now I'm running my function as a Lambda with 4GB and still getting a timeout :(
Any idea how to improve performance?
The normalize_row_cells function constructs ElementTree Element instances, but get_cells_text is only interested in each instance's child's text attribute, so I would consider changing normalize_row_cells to just return the text. Also, it's performing copies and calling list.insert: inserting elements into the middle of lists can be expensive, because each element after the insertion point must be moved.
Something like this (untested code) avoids making copies and insertions and returns only the required text, making get_cells_text redundant.
def normalize_row_cells(row):
    cells = list(row)
    updated_cells = []
    pos = 1  # 1-based position of the next unconsumed input cell
    for col in range(1, 93):  # every output row must end up with 92 columns
        if pos <= len(cells):
            elm = cells[pos - 1]
            strIndexAttr = elm.get(INDEX_COLUMN)
            index = int(strIndexAttr) if strIndexAttr else col
            if index == col:
                updated_cells.append(elm[0].text or "")
                pos += 1
                continue
        updated_cells.append("")  # column missing from the XML: pad with an empty string
    return updated_cells
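With that change, the row loop in parse_to_csv_string could consume the returned strings directly (just a sketch; the rest of the question's code stays as-is):

for row in rows:
    values = normalize_row_cells(row)  # already padded to 92 text values
    csv.append(",".join(values))       # using the standard str.join here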
If you can match your cells to their header names then using csv.DictWriter from the standard library might be even better (you need to profile to be sure).
import csv
import io

def parse_to_csv_string(xml):
    print('parse_to_csv_string')
    parsed_data = serialize_xml(xml)
    rows = list(parsed_data[1][0])
    header = get_cells_text(rows[0])
    with io.StringIO() as f:
        writer = csv.DictWriter(f, fieldnames=header)
        writer.writeheader()
        for row in rows[1:]:  # skip the header row
            row = get_cells_text(row)
            writer.writerow(row)
        f.seek(0)
        data = f.read()
    return data
def get_cells_text(row):
    row_dict = {}
    for cell in row:
        column_name = get_column_name(cell)  # <- can this be done?
        row_dict[column_name] = cell[0].text or ""
    return row_dict
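One way the "can this be done?" part could work, sketched under the assumption that the header names from the first row are available and that cell positions follow the same ss:Index convention handled in normalize_row_cells (the extra header argument here is illustrative):

def get_cells_text(row, header):
    """Map each cell to its header name, using the cell's declared Index or its running position."""
    row_dict = {}
    pos = 1
    for cell in row:
        strIndexAttr = cell.get(INDEX_COLUMN)
        index = int(strIndexAttr) if strIndexAttr else pos
        if 1 <= index <= len(header):
            row_dict[header[index - 1]] = cell[0].text or ""
        pos = index + 1  # the next un-indexed cell sits in the following column
    return row_dict

Missing columns then need no explicit padding, since DictWriter fills absent keys with its restval (an empty string by default).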