Proper convert from excel to csv - python

I have an Excel file (.xls) with one sheet (Sheet1) and I want to convert it to CSV using Python 3. I found this code:
import xlrd
import csv
def csv_from_excel():
    """Convert Sheet1 of 'safir/fisier-safir.xls' into 'safir.csv'.

    Every row of the sheet is written as one CSV record with all fields
    quoted (csv.QUOTE_ALL). The values written are whatever
    ``sheet.row_values()`` returns for each row.
    """
    wb = xlrd.open_workbook('safir/fisier-safir.xls')
    sh = wb.sheet_by_name('Sheet1')
    # newline='' is required by the csv module on Python 3; without it an
    # extra blank line is emitted after every record on Windows. The
    # with-statement closes the file even if a row fails to write.
    with open('safir.csv', 'w', newline='') as your_csv_file:
        wr = csv.writer(your_csv_file, quoting=csv.QUOTE_ALL)
        for rownum in range(sh.nrows):
            wr.writerow(sh.row_values(rownum))
csv_from_excel()
But the problem with this approach is that it does not preserve the Excel formatting of certain numbers: 1 becomes 1.00, 001 becomes 1.0, 0,9 becomes 1, and so on. If I do the Excel -> CSV conversion manually I don't run into these issues, and I got similar issues with other scripts as well. Does someone have a proper one? Thank you!

Can you please try the code below? It keeps the formatting as is.
import win32com.client

# Drive Excel itself through COM and let its own "Save As" produce the CSV.
xl = win32com.client.Dispatch("Excel.Application")
xl.DisplayAlerts = False  # suppress Excel's confirmation dialogs
wb = None
try:
    # Raw strings: '\C' and '\o' are not valid escape sequences, so the
    # backslashes must not be interpreted by Python.
    # Use the workbook returned by Open rather than Workbooks(1), which
    # could be a different workbook if others are already open.
    wb = xl.Workbooks.Open(Filename=r'C:\pscript\Copy.xlsx', ReadOnly=1)
    print(wb)
    # FileFormat is a numeric XlFileFormat constant: 6 is xlCSV.
    wb.SaveAs(Filename=r'C:\pscript\out.csv', FileFormat=6)
finally:
    # Always release Excel, otherwise a hidden EXCEL.EXE stays running.
    if wb is not None:
        wb.Close(False)
    xl.Application.Quit()
    wb = None
    xl = None
print("done")

Related

Python search corresponding data in multiple excel and paste to a new excel worksheet

I have some Excel files in a folder; each file already contains a worksheet called "service".
Notes_111.xlsx
Notes_222.xlsx
Notes_888.xlsx
Workflow: I want to open each .xlsx file (for example, Notes_111.xlsx), add a new worksheet named "code_city", then, based on the number in the file's name (111), extract only the rows with code = 111 from the master dataframe, paste them into the new worksheet, and save.
Sample master dataframe in another excel file
code city
0 111 NY
1 111 CA
2 222 NJ
3 888 WE
4 888 TL
I don't know how to write the logic inside a loop to look up the corresponding data.
import pandas as pd
import numpy as np
import glob
from openpyxl import load_workbook
for f in glob.glob(path + "Notes_*.xlsx"):
wb = load_workbook(f)
ws = wb.create_sheet('code_city')
ws['A1'] = 'how to search corresponding data and paste here???'
wb.save(f)
please help.
Use pandas; it's much easier to manipulate the data with it. I believe it uses openpyxl under the hood anyway.
import glob
import os
import re

import pandas as pd

# NOTE: df_master (the master DataFrame with a 'code' column) must already
# be loaded before this loop runs, e.g. via pd.read_excel on the master file.
for f in glob.glob('Notes_*.xlsx'):
    # Match digits in the file name only, not in any directory component.
    dda = re.findall(r'\d+', os.path.basename(f))
    if not dda:
        continue  # no numeric code in this file name, nothing to look up
    # Read the existing data first: ExcelWriter recreates the file, so the
    # saved data has to be written back alongside the new sheet.
    df_each = pd.read_excel(f)
    # Select only the master records whose code matches the file name.
    df_1_dda = df_master[df_master['code'] == int(dda[0])]
    # The with-statement saves and closes the workbook even on error.
    with pd.ExcelWriter(f) as writer:
        df_each.to_excel(writer, sheet_name='service', index=False)
        df_1_dda.to_excel(writer, sheet_name='code_city', index=False)
Hope that helps!
using python 3.6.4 Anaconda - 32-bit
from openpyxl import load_workbook
for f in glob.glob("Notes_*.xlsx"):
    code = re.findall(r'\d+', f)  # matches digits in the filename
    # Select only those records with this code from the master dataframe.
    df_1_dda = df_master[df_master['code'] == int(code[0])]
    # Open the existing workbook in append mode so the sheets already in it
    # (e.g. 'service') are kept. The default write mode recreates the file
    # and would discard them. Append mode adds the 'code_city' sheet itself,
    # so no separate openpyxl create_sheet/save step is needed. Recent pandas
    # versions raise ValueError here if 'code_city' already exists.
    with pd.ExcelWriter(f, engine='openpyxl', mode='a') as writer:
        df_1_dda.to_excel(writer, sheet_name='code_city')

csv Writer using Datafields returned by Pandas

Hello I'm working on a project that reads an excel worksheet, collects columns of data based on header title, and then writes that data to a much leaner csv file which I'll be using for more fun later.
I'm getting a syntax error while trying to write my new csv file, I think it has something to do with the datafields I'm using to get my columns in pandas.
I'm new to Python so any help you can provide would be great, thanks!
import pandas
import xlrd
import csv
def csv_from_excel():
wb = xlrd.open_workbook("C:\\Python27\\Work\\spreadsheet.xlsx")
sh = wb.sheet_by_name('Sheet1')
spoofingFile = open('spoofing.csv', 'wb')
wr = csv.writer(spoofingFile, quoting=csv.QUOTE_ALL)
for rownum in xrange(sh.nrows):
wr.writerow(sh.row_values(rownum))
spoofingFile.close()
csv_from_excel()
df = pandas.read_csv('C:\\Python27\\Work\\spoofing.csv')
time = df["InviteTime (Oracle)"]
orignum = df["Orig Number"]
origip = df["Orig IP Address"]
destnum = df["Dest Number"]
sheet0bj = csv.writer(open("complete.csv", "wb")
sheet0bj.writerow([time,orignum,origip,destnum])
The syntax error is thus:
file c:\python27\work\formatsheettest.py, line36
sheet0bj.writerow([time, orignum, origip, destnum])
^
Syntax error: Invalid syntax
You're missing a closing paren on the second to last line.
sheet0bj = csv.writer(open("complete.csv", "wb")
should be
sheet0bj = csv.writer(open("complete.csv", "wb"))
I assume you've figured that out by now, though.

How to write/update data into cells of existing XLSX workbook using xlsxwriter in python

I am able to write into new xlsx workbook using
import xlsxwriter
def write_column(csvlist):
    """Write the items of csvlist down the first column of a new workbook.

    A fresh "filename.xlsx" is created with the 'strings_to_numbers'
    option enabled; item k of csvlist is written to row k, column 0.
    """
    book = xlsxwriter.Workbook("filename.xlsx", {'strings_to_numbers': True})
    sheet = book.add_worksheet()
    # worksheet.write takes (row, col, data): the row advances per item
    # while the column stays at 0.
    for row_index, item in enumerate(csvlist):
        sheet.write(row_index, 0, item)
    book.close()
but couldn't find the way to write in an existing workbook.
Please help me to write/update cells in an existing workbook using xlsxwriter or any alternative.
Quote from xlsxwriter module documentation:
This module cannot be used to modify or write to an existing Excel
XLSX file.
If you want to modify existing xlsx workbook, consider using openpyxl module.
See also:
Modify an existing Excel file using Openpyxl in Python
Use openpyxl to edit a Excel2007 file (.xlsx) without changing its own styles?
you can use this code to open (test.xlsx) file and modify A1 cell and then save it with a new name
import openpyxl

# Open the existing workbook, change one cell, and save under a new name so
# the original file is left untouched.
xfile = openpyxl.load_workbook('test.xlsx')
# Index the workbook by sheet name; get_sheet_by_name() is deprecated.
sheet = xfile['Sheet1']
sheet['A1'] = 'hello world'
xfile.save('text2.xlsx')
Note that openpyxl does not have a large toolbox for manipulating and editing images. Xlsxwriter has methods for images, but on the other hand cannot import existing worksheets...
I have found that this works for rows...
I'm sure there's a way to do it for columns...
import openpyxl

oxl = openpyxl.load_workbook('File Loction Here')
xl = oxl['SheetName']  # index the workbook by sheet name
x = 0
col = "A"
row = 1  # worksheet rows are 1-based; "A0" is not a valid cell reference
while row <= 100:
    # Build the cell reference ("A1", "A2", ...); the row number has to be
    # converted to str before it can be joined to the column letter.
    cell = col + str(row)
    xl[cell] = x
    row = row + 1
    x = x + 1
You can do by xlwings as well
import xlwings as xw

# The module is imported under the alias xw, so the collection of open
# workbooks must be reached through that alias.
for book in xw.books:
    print(book)
If you have an issue writing to an existing xls file because it has already been created, add a check like the one below:
import os

PATH = 'filename.xlsx'
# Report whether the target file is already present before writing to it.
if os.path.isfile(PATH):
    print("File exists and will be overwrite NOW")
else:
    print("The file is missing, new one is created")
...
and here part with the data you want to add

Modify an existing Excel file using Openpyxl in Python

I am basically trying to copy some specific columns from a CSV file and paste those
in an existing excel file[*.xlsx] using python. Say for example, you have a CSV file like this :
col_1 col_2 col_3 col_4
1 2 3 4
5 6 7 8
9 10 11 12
So, i wanted to copy the both col_3 and col_4 and paste those in col_8 and col_9 in an existing excel file [which is a .XLSX format].
I have tried this in various way to solve, but could not find out the exact way.
i tried something like this :
with open( read_x_csv, 'rb') as f:
reader = csv.reader(f)
for row in reader:
list1 = row[13]
queue1.append(list1)
list2 = row[14]
queue2.append(list2)
list3 = row[15]
queue3.append(list3)
list4 = row[16]
queue4.append(list4)
and then
rb = open_workbook("Exact file path.....")
wb = copy(rb)
ws = wb.get_sheet(0)
row_no = 0
for item in queue1:
if(item != ""):
ii = int(item)
ws.write(row_no,12,ii)
row_no = row_no + 1
#ws.write(item)
print item
else:
ws.write(row_no,12,item)
row_no = row_no + 1
wb.save("Output.xls")
But the problem with this solution is that it does not allow me to save in *.XLSX format, which is
strictly required for me.
I have tried to use openpyxl, as it can handle the *.XLSX format, but could not find a way to modify the existing Excel file. Can anyone please help with this?
Doubt :
1) Can we really read a whole column from a CSV file and store into an array/list
using python?
2) Can we modify the existing excel file which is in .XLSX format using
openpyxl or any other package?
You can try the following implementation
from openpyxl import load_workbook
import csv
def update_xlsx(src, dest):
    """Copy the 3rd and 4th CSV columns into columns 8 and 9 of an xlsx file.

    Args:
        src: Path of the CSV file to read. Each line is expected to hold
            whitespace-separated values.
        dest: Path of the existing .xlsx workbook. Its active sheet is
            modified and the workbook is saved back to the same path.
    """
    # Open the existing xlsx workbook
    wb = load_workbook(filename=dest)
    # Get the current active sheet (get_active_sheet() is deprecated)
    ws = wb.active
    # You can also select a particular sheet based on sheet name:
    # ws = wb["Sheet1"]
    # Open the csv file
    with open(src, newline='') as fin:
        # Read the csv
        reader = csv.reader(fin)
        # Enumerate the rows to get the row index for the xlsx.
        # openpyxl rows and columns are 1-based, so counting starts at 1.
        for index, row in enumerate(reader, start=1):
            if not row:
                continue  # blank line: nothing to copy
            # Assuming space separated, split the row into cells (columns)
            cells = row[0].split()
            if len(cells) < 4:
                continue  # line does not have a 3rd and 4th value
            # Assign the csv values to columns 8 and 9 of the same row
            ws.cell(row=index, column=8).value = cells[2]
            ws.cell(row=index, column=9).value = cells[3]
    # Save the xlsx file
    wb.save(dest)
Can we really read a whole column from a CSV file and store into an array/list using python? No, because files are read sequentially, csv reader cannot read a column of data to a row. Instead you may read the whole content and use izip and islice to get a particular column. You can also use numpy.array
Can we modify the existing excel file which is in .XLSX format using openpyxl or any other package? Yes, see the example above
As of 2021, get_sheet_by_name is deprecated and raises a DeprecationWarning with the following message:
Call to deprecated function get_sheet_by_name (Use wb[sheetname]).
The following snippet can be used in order to not raise the warning.
from openpyxl import load_workbook
file_path = 'test.xlsx'
wb = load_workbook(file_path)
ws = wb['SHEET_NAME'] # or wb.active
ws['G6'] = 123
wb.save(file_path)
from openpyxl import load_workbook
# Class to manage excel data with openpyxl.
class Copy_excel:
    """Load a workbook, edit cells of one sheet, and save it to a new file.

    Attributes:
        wb: The workbook loaded from ``src``.
        ws: The worksheet of ``wb`` that write_workbook() edits.
        dest: Path that save_excel() writes the workbook to.
    """

    def __init__(self, src, dest="destination.xlsx", sheet_name="Sheet1"):
        """Load ``src`` and select the worksheet to edit.

        Args:
            src: Path of the workbook to load.
            dest: Path used by save_excel(); defaults to "destination.xlsx".
            sheet_name: Name of the worksheet to edit; defaults to "Sheet1".
        """
        self.wb = load_workbook(src)
        # Index the workbook by name; get_sheet_by_name() is deprecated.
        self.ws = self.wb[sheet_name]
        self.dest = dest

    def write_workbook(self, row_dest, column_dest, value):
        """Write ``value`` into the cell at (row_dest, column_dest)."""
        c = self.ws.cell(row=row_dest, column=column_dest)
        c.value = value

    def save_excel(self):
        """Save the workbook to ``self.dest``."""
        self.wb.save(self.dest)
Open an existing excel file (Using load_workbook(...))
As simple as that!
from openpyxl import load_workbook
wb = load_workbook('test.xlsx')
See docs: https://openpyxl.readthedocs.io/en/stable/tutorial.html#loading-from-a-file
Append data at the end (keeping the old data)
work_sheet = wb.active # Get active sheet
work_sheet.append(['John', 'Customer', 'He likes football'])
Save modified workbook in test.xlsx
wb.save('test.xlsx')

How can I open an Excel file in Python?

How do I open a file that is an Excel file for reading in Python?
I've opened text files, for example, sometextfile.txt with the reading command. How do I do that for an Excel file?
Edit:
In the newer version of pandas, you can pass the sheet name as a parameter.
file_name = # path to file + file name
sheet = # sheet name or sheet number or list of sheet numbers and names
import pandas as pd
df = pd.read_excel(io=file_name, sheet_name=sheet)
print(df.head(5)) # print first 5 rows of the dataframe
Check the docs for examples on how to pass sheet_name: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html
Old version:
you can use pandas package as well....
When you are working with an excel file with multiple sheets, you can use:
import pandas as pd
xl = pd.ExcelFile(path + filename)
xl.sheet_names
>>> [u'Sheet1', u'Sheet2', u'Sheet3']
df = xl.parse("Sheet1")
df.head()
df.head() will print first 5 rows of your Excel file
If you're working with an Excel file with a single sheet, you can simply use:
import pandas as pd

# With a single sheet, read_excel can be called without naming the sheet.
df = pd.read_excel(path + filename)
print(df.head())
Try the xlrd library.
[Edit] - from what I can see from your comment, something like the snippet below might do the trick. I'm assuming here that you're just searching one column for the word 'john', but you could add more or make this into a more generic function.
from xlrd import open_workbook

book = open_workbook('simple.xls', on_demand=True)
for name in book.sheet_names():
    if name.endswith('2'):
        sheet = book.sheet_by_name(name)
        # Attempt to find a matching row (search the first column for 'john')
        rowIndex = -1
        for index, cell in enumerate(sheet.col(0)):
            # Cell values are not always text, so convert before the
            # substring test.
            if 'john' in str(cell.value):
                rowIndex = index  # remember which row matched
                break
        # If we found the row, print it
        if rowIndex != -1:
            cells = sheet.row(rowIndex)
            for cell in cells:
                print(cell.value)
        book.unload_sheet(name)
This isn't as straightforward as opening a plain text file and will require some sort of external module since nothing is built-in to do this. Here are some options:
http://www.python-excel.org/
If possible, you may want to consider exporting the excel spreadsheet as a CSV file and then using the built-in python csv module to read it:
http://docs.python.org/library/csv.html
There's the openpyxl package:
>>> from openpyxl import load_workbook
>>> wb2 = load_workbook('test.xlsx')
>>> print wb2.get_sheet_names()
['Sheet2', 'New Title', 'Sheet1']
>>> worksheet1 = wb2['Sheet1'] # one way to load a worksheet
>>> worksheet2 = wb2.get_sheet_by_name('Sheet2') # another way to load a worksheet
>>> print(worksheet1['D18'].value)
3
>>> for row in worksheet1.iter_rows():
>>> print row[0].value()
You can use xlpython package that requires xlrd only.
Find it here https://pypi.python.org/pypi/xlpython
and its documentation here https://github.com/morfat/xlpython
This may help:
This creates a node that takes a 2D list (a list of list items) and pushes the items into the Excel spreadsheet. Make sure the IN[]s are present or it will throw an exception.
this is a re-write of the Revit excel dynamo node for excel 2013 as the default prepackaged node kept breaking. I also have a similar read node. The excel syntax in Python is touchy.
thnx #CodingNinja - updated : )
###Export Excel - intended to replace malfunctioning excel node
import clr
clr.AddReferenceByName('Microsoft.Office.Interop.Excel, Version=15.0.0.0, Culture=neutral, PublicKeyToken=71e9bce111e9429c')
##AddReferenceGUID("{00020813-0000-0000-C000-000000000046}") ''Excel C:\Program Files\Microsoft Office\Office15\EXCEL.EXE
##Need to Verify interop for version 2015 is 15 and node attachemnt for it.
from Microsoft.Office.Interop import * ##Excel
################################Initialize FP and Sheet ID
##Same functionality as the excel node
strFileName = IN[0] ##Filename
sheetName = IN[1] ##Sheet
RowOffset= IN[2] ##RowOffset
ColOffset= IN[3] ##COL OFfset
Data=IN[4] ##Data
Overwrite=IN[5] ##Check for auto-overwtite
XLVisible = False #IN[6] ##XL Visible for operation or not?
RowOffset=0
if IN[2]>0:
RowOffset=IN[2] ##RowOffset
ColOffset=0
if IN[3]>0:
ColOffset=IN[3] ##COL OFfset
if IN[6]<>False:
XLVisible = True #IN[6] ##XL Visible for operation or not?
################################Initialize FP and Sheet ID
xlCellTypeLastCell = 11 #####define special sells value constant
################################
xls = Excel.ApplicationClass() ####Connect with application
xls.Visible = XLVisible ##VISIBLE YES/NO
xls.DisplayAlerts = False ### ALerts
import os.path
if os.path.isfile(strFileName):
    wb = xls.Workbooks.Open(strFileName, False) ####Open the file
else:
    # Add must actually be called (note the parentheses); without the call
    # wb would be the method itself and SaveAs below would fail.
    wb = xls.Workbooks.Add() ####Create a new workbook
    wb.SaveAs(strFileName)
wb.application.visible = XLVisible ####Show Excel
try:
ws = wb.Worksheets(sheetName) ####Get the sheet in the WB base
except:
ws = wb.sheets.add() ####If it doesn't exist- add it. use () for object method
ws.Name = sheetName
#################################
#lastRow for iterating rows
lastRow=ws.UsedRange.SpecialCells(xlCellTypeLastCell).Row
#lastCol for iterating columns
lastCol=ws.UsedRange.SpecialCells(xlCellTypeLastCell).Column
#######################################################################
out=[] ###MESSAGE GATHERING
c=0
r=0
val=""
if Overwrite == False : ####Look ahead for non-empty cells to throw error
for r, row in enumerate(Data): ####BASE 0## EACH ROW OF DATA ENUMERATED in the 2D array #range( RowOffset, lastRow + RowOffset):
for c, col in enumerate (row): ####BASE 0## Each colmn in each row is a cell with data ### in range(ColOffset, lastCol + ColOffset):
if col.Value2 >"" :
OUT= "ERROR- Cannot overwrite"
raise ValueError("ERROR- Cannot overwrite")
##out.append(Data[0]) ##append mesage for error
############################################################################
for r, row in enumerate(Data): ####BASE 0## EACH ROW OF DATA ENUMERATED in the 2D array #range( RowOffset, lastRow + RowOffset):
for c, col in enumerate (row): ####BASE 0## Each colmn in each row is a cell with data ### in range(ColOffset, lastCol + ColOffset):
ws.Cells[r+1+RowOffset,c+1+ColOffset].Value2 = col.__str__()
##run macro disbled for debugging excel macro
##xls.Application.Run("Align_data_and_Highlight_Issues")
import pandas as pd
import os

files = os.listdir('path/to/files/directory/')
i = 0  # index of the file to open within the directory listing
desiredFile = files[i]
filePath = 'path/to/files/directory/%s'
Ofile = filePath % desiredFile
# An Excel workbook has to be read with read_excel; read_csv only parses
# delimited text files.
xls_import = pd.read_excel(Ofile)
Now you can use the power of pandas DataFrames!
This code worked for me with Python 3.5.2. It opens and saves an Excel file. I am currently working on how to save data into the file, but this is the code:
import csv

# On Python 3 the csv module needs a text-mode file opened with newline='';
# a file opened with "wb" makes every writerow() call fail. The
# with-statement closes the file once writing is done.
with open("file1.csv", "w", newline="") as csv_file:
    excel = csv.writer(csv_file)
    # Write rows here, inside the with-block, e.g.:
    # excel.writerow(["column 1", "column 2"])

Categories