Stripping data from two XLSX cells to CSV - Python

I'm having an issue where I'm trying to take data from two columns in an Excel spreadsheet and put them into a CSV file. The data is lat and lon coordinates, so they have to be side by side to be read by the program. Here is what I have:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xlwt
import xlrd
import csv
import os, openpyxl, glob
from openpyxl import Workbook
# NOTE(review): indentation was lost when this snippet was pasted, so the
# nesting described below is inferred from the statements themselves and
# from the symptom reported with the question -- confirm against the
# original script.
# Opens test.csv in binary write mode and wraps it in a comma-delimited
# CSV writer.
with open ('test.csv', 'wb') as csvfile:
spamwriter = csv.writer(csvfile, delimiter=',')
# The pattern "/test" contains no wildcard, so glob can only match that one
# literal path; the loop variable `file` is not used by any statement below.
for file in glob.glob ("/test"):
# Loads the same hard-coded workbook each time and takes its active sheet.
wb = openpyxl.load_workbook('test-data.xlsx')
ws = wb.active
# Iterates the cells of Q2:Q65536 and returns a cell value.
def lat():
for row in ws.iter_rows('Q2:Q65536'):
for cell in row:
lat = cell.value
# NOTE(review): presumably this return sits inside the loops, which ends the
# function after the first cell -- that would match "only the first row".
return lat
# Same as lat(), but for R2:R65536.
def lon():
for row in ws.iter_rows('R2:R65536'):
for cell in row:
lon = cell.value
# NOTE(review): same early-return concern as in lat().
return lon
# Builds one "lat,lon" string; the `+` concatenation with "," requires both
# returned values to be strings.
cord=lat()+","+lon()
print (lat()+","+lon()) #just to see if its working
#spamwriter.writerow([cord]) uncomment to write to file
However, it only gives me the first row of data, not the rest of the rows (test-data has around 1500 rows). How would I make it go through the whole file?

This may not be the most dynamic way, but I would use pandas for this task. It has built in pd.read_excel() and pd.to_csv() functions.
import pandas as pd
import string

# Translate an Excel column letter into the zero-based column index that
# read_excel expects (use lower case). Does not work for AA, BB, ...
# string.ascii_lowercase exists on both Python 2 and 3 (string.lowercase is
# Python 2 only).
latColumn = string.ascii_lowercase.index('q')
longColumn = string.ascii_lowercase.index('r')
# `usecols` is the current name of the column filter; the old `parse_cols`
# keyword was deprecated and later removed from read_excel.
data = pd.read_excel('test-data.xlsx', sheet_name='Sheet1',
                     usecols=[latColumn, longColumn])
# Total number of rows being read in 65536 - 2 = 65534
csvOut = "foo.csv"
# The first spreadsheet row is consumed as the header by read_excel and is
# not written back out because of header=False.
data[:65534].to_csv(csvOut, index=False, header=False)
If you need to append to the file rather than replace it, change the data[:65534].to_csv(....) call to:
# Append to the existing .csv file instead of overwriting it. `csvOut` and
# `data` are the names defined in the snippet above; the `with` statement
# closes the file once the rows have been written.
with open(csvOut, 'a') as f:
    data[:65534].to_csv(f, index=False, header=False)

Related

Writing multiple data frames to multiple excel sheets but they turn up empty

I have the following code where I want to read data from first sheet of an excel file, and then, according to some category, split each category in a separate sheet.
All is good and the program doesn't show an error, but all the sheets it produces are empty.
import pandas
import os
from openpyxl import load_workbook
import pandas as pd
import xlsxwriter
# NOTE(review): indentation was lost in this paste; which statements belong
# to the loop bodies is inferred -- confirm against the original script.
# `path` is assigned here but not referenced again in this snippet.
path = r"C:\Users\acer pc\Desktop\rrrr.xlsx"
os.chdir(r"C:\Users\acer pc\Desktop")
# Reads the workbook into a DataFrame and collects the distinct values of
# the "number" column.
data = pandas.read_excel("rrrr.xlsx")
FileNumber = data["number"].unique()
print(FileNumber)
# Opens the same workbook with openpyxl and creates one new sheet per
# distinct number; nothing is written into these sheets through wb2.
wb2 = load_workbook('rrrr.xlsx')
for i in FileNumber:
wb2.create_sheet(f'{i}')
wb2.save(r"C:\Users\acer pc\Desktop\rrrr.xlsx")
for i in FileNumber:
# Rows whose "number" equals the current value.
rslt_df = data[data['number'] == i]
print(rslt_df)
# A new xlsxwriter-backed ExcelWriter is created for the same path here, and
# this snippet never saves or closes it.
writer = pd.ExcelWriter(r"C:\Users\acer pc\Desktop\rrrr.xlsx", engine='xlsxwriter')
rslt_df.to_excel(writer, sheet_name=f'{i}', index=False)
# Saves the openpyxl workbook (wb2) to the same path; wb2 only contains the
# sheets created above, not the data handed to `writer`.
wb2.save(r"C:\Users\acer pc\Desktop\rrrr.xlsx")
wb2.close()

Python search corresponding data in multiple excel and paste to a new excel worksheet

I have some Excel files in a folder; there's already a worksheet called "service" in each file:
Notes_111.xlsx
Notes_222.xlsx
Notes_888.xlsx
Workflow: I want to open each .xlsx file (for example, Notes_111.xlsx), add a new worksheet named "code_city", then, based on the number in the file's name (111), extract only the rows with code = 111 from the master dataframe, paste them into the new worksheet, and save.
Sample master dataframe in another excel file
code city
0 111 NY
1 111 CA
2 222 NJ
3 888 WE
4 888 TL
I don't know how to write the logic within a loop to search for the corresponding data:
import pandas as pd
import numpy as np
import glob
from openpyxl import load_workbook
# `path` is not defined in this snippet -- presumably the folder holding the
# Notes_*.xlsx files, ending in a separator since it is concatenated
# directly with the pattern (TODO confirm).
for f in glob.glob(path + "Notes_*.xlsx"):
wb = load_workbook(f)
# Adds a new worksheet named 'code_city' to the opened workbook.
ws = wb.create_sheet('code_city')
# Placeholder value: this is the spot where the matching master-data rows
# are supposed to be written.
ws['A1'] = 'how to search corresponding data and paste here???'
# Writes the workbook back to the file it was loaded from.
wb.save(f)
please help.
Use pandas; it's much easier to manipulate, and I believe it uses openpyxl under the hood anyway.
import glob
import os
import re

import pandas as pd

# NOTE: df_master (the master DataFrame with the 'code' and 'city' columns)
# must already be loaded before this loop runs.
for f in glob.glob('Notes_*.xlsx'):
    # Digits in the file name, e.g. 'Notes_111.xlsx' -> ['111'].
    dda = re.findall(r'\d+', os.path.basename(f))
    if not dda:
        # The glob pattern also matches names without digits; skip those
        # instead of failing on dda[0].
        continue
    # Have to save the existing data first, because ExcelWriter clears the
    # file and creates a new workbook; the saved data is pasted back below.
    # NOTE(review): read_excel reads only the first sheet here -- assumed to
    # be 'service'; confirm for your files.
    df_each = pd.read_excel(f)
    # Select only the master records whose code matches the file name.
    df_1_dda = df_master[df_master['code'] == int(dda[0])]
    # The context manager saves and closes the workbook even if a write fails.
    with pd.ExcelWriter(f) as writer:
        df_each.to_excel(writer, sheet_name='service', index=False)
        df_1_dda.to_excel(writer, sheet_name='code_city', index=False)
Hope that helps!
using python 3.6.4 Anaconda - 32-bit
from openpyxl import load_workbook
import glob
import re

import pandas as pd

# NOTE: df_master (the master DataFrame with a 'code' column) must already
# be loaded before this loop runs.
for f in glob.glob("Notes_*.xlsx"):
    code = re.findall(r'\d+', f)  # matches digits in the filename
    if not code:
        continue  # no number in this file name, nothing to look up
    # Select only those records with this code from the master dataframe.
    df_1_dda = df_master[df_master['code'] == int(code[0])]
    # Open the existing workbook in append mode so the sheets it already
    # contains (e.g. 'service') are kept; the default mode 'w' recreates the
    # file with only the sheets written here. Append mode needs the openpyxl
    # engine, and 'code_city' must not exist yet in the workbook.
    with pd.ExcelWriter(f, engine='openpyxl', mode='a') as writer:
        df_1_dda.to_excel(writer, sheet_name='code_city')

Take specific rows from a csv file and put in an excel file using python

Good day,
I want to take some specific columns (i.e. column 3 and column 7) from a CSV file and put these two columns into an output Excel file.
I think that my issue is with putting the data into the Excel file.
The code that I tried to use is below.
import csv
from openpyxl import *
from openpyxl.cell import get_column_letter
# NOTE(review): indentation was lost in this paste; whether the cell-writing
# loops run inside or after the row loop cannot be told from here.
# Opens the semicolon-delimited source file for reading.
file1=open('WebShop_Export_2016_Feb_19.csv','r')
readfile=csv.reader(file1,delimiter=';')
data=[]
wb=Workbook()
# Assigned but not used below: wb.save() is given the literal name instead.
dest_filename = 'Name of OUPUT excel.xlsx'
ws = wb.active
ws.title="Output XLSX"
for row in readfile:
# `data` is rebound to a 2-tuple on every row, so earlier rows are not kept
# in it.
data=row[2],row[6] # take column 3 and 7 from source file
print(data)# this line is just to see the info that will be put in the excel output
# Enumerates the two values in `data`; the inner loop then enumerates each
# value itself (csv.reader yields strings, so that is character by character).
for row_index, item in enumerate(data):
for column_index, cell in enumerate(item):
column_letter = get_column_letter((column_index + 1))
# Addresses the target cell by a coordinate string such as 'A1'.
ws.cell('%s%s'%(column_letter, (row_index + 1))).value = cell
file1.close()
wb.save('Name of OUPUT excel.xlsx')
Thank you in advance!

Python/Excel - Slice extracted excel data - exclude rows maintain structure

So I'm attempting to exclude the top three rows during a data extraction.
# Loops over column indices: each `col` is the list of values of one sheet
# column, starting at row index 3.
for col_num in xrange(sheet.ncols):
col = sheet.col_values(col_num, start_rowx=3, end_rowx=None)
# writerow() emits that column's values as a single CSV row.
writer.writerow(col) #this syntax also may be skewing my results as well
This for loop eliminates the top 3 rows but then turns the rows into columns.
Any advice on how to maintain the data structure but at the same time eliminate rows?
Full script below:
import glob
import os
import xlrd
import csv
# NOTE(review): indentation was lost in this paste; the nesting is inferred.
# Collects every .xlsx file directly inside ROOTDIR.
ROOTDIR = r'C:\Users\username\Desktop\Mults'
wb_pattern = os.path.join(ROOTDIR, '*.xlsx')
workbooks = glob.glob(wb_pattern)
# All workbooks are written into one CSV file.
with open('merged.csv', 'wb') as outcsv:
writer = csv.writer(outcsv)
for wb in workbooks:
# `wb` comes from a glob pattern that already starts with ROOTDIR.
book_path = os.path.join(ROOTDIR, wb)
book = xlrd.open_workbook(book_path)
# Only the first sheet of each workbook is read.
sheet = book.sheet_by_index(0)
# Loops over column indices: each `col` holds one sheet column's values from
# row index 2 onwards, and writerow() emits it as a single CSV row.
for colx in xrange(sheet.ncols):
col = sheet.col_values(colx, start_rowx=2, end_rowx=None)
writer.writerow(col) #this syntax also may be skewing my results
Thank you!
Any help is much appreciated!
If you want row values, why are you pulling the columns to write as rows? Pull the row values and write those:
import glob
import os
import xlrd
import csv
# Merge the first sheet of every workbook in ROOTDIR into merged.csv,
# skipping the leading rows of each sheet and keeping the row layout.
ROOTDIR = r'C:\Users\username\Desktop\Mults'
wb_pattern = os.path.join(ROOTDIR, '*.xlsx')
workbooks = glob.glob(wb_pattern)
start_rownum = 3  # zero-based index of the first row to copy
# 'wb' is the csv-module convention on Python 2, which this script targets
# (it uses xrange).
with open('merged.csv', 'wb') as outcsv:
    writer = csv.writer(outcsv)
    # glob already returns paths that include ROOTDIR, so they are used as-is.
    for book_path in workbooks:
        book = xlrd.open_workbook(book_path)
        sheet = book.sheet_by_index(0)
        # The row count of an xlrd sheet is `nrows` (there is no `numrows`).
        for rownum in xrange(start_rownum, sheet.nrows):
            writer.writerow(sheet.row_values(rownum))

How can I open an Excel file in Python?

How do I open a file that is an Excel file for reading in Python?
I've opened text files, for example, sometextfile.txt with the reading command. How do I do that for an Excel file?
Edit:
In the newer version of pandas, you can pass the sheet name as a parameter.
import pandas as pd

# Example values -- replace them with your own.
file_name = "path/to/file.xlsx"  # path to file + file name
sheet = 0  # sheet name or sheet number or list of sheet numbers and names

# With a single sheet name/number this returns one DataFrame; with a list it
# returns a dict of DataFrames keyed by sheet, so call head() per entry then.
df = pd.read_excel(io=file_name, sheet_name=sheet)
print(df.head(5))  # print first 5 rows of the dataframe
Check the docs for examples on how to pass sheet_name: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html
Old version:
you can use pandas package as well....
When you are working with an excel file with multiple sheets, you can use:
import pandas as pd
# `path` and `filename` are not defined in this snippet; they stand for the
# location of the workbook.
xl = pd.ExcelFile(path + filename)
# Lists the sheet names of the workbook; the next line is the example output
# from an interactive session.
xl.sheet_names
>>> [u'Sheet1', u'Sheet2', u'Sheet3']
# Parses the named sheet into a DataFrame.
df = xl.parse("Sheet1")
df.head()
df.head() will print first 5 rows of your Excel file
If you're working with an Excel file with a single sheet, you can simply use:
import pandas as pd

# `path` and `filename` stand for the location of the workbook.
df = pd.read_excel(path + filename)
# print() with parentheses works on both Python 2 and Python 3.
print(df.head())
Try the xlrd library.
[Edit] - from what I can see from your comment, something like the snippet below might do the trick. I'm assuming here that you're just searching one column for the word 'john', but you could add more or make this into a more generic function.
from xlrd import XL_CELL_TEXT, open_workbook

# on_demand=True loads sheets lazily, so each one is released explicitly
# with unload_sheet() once it has been processed.
book = open_workbook('simple.xls', on_demand=True)
for name in book.sheet_names():
    if name.endswith('2'):
        sheet = book.sheet_by_name(name)

        # Attempt to find a matching row (search the first column for 'john')
        rowIndex = -1
        for index, cell in enumerate(sheet.col(0)):
            # Only text cells can contain the word; numeric or empty cells
            # would make the `in` test raise a TypeError.
            if cell.ctype == XL_CELL_TEXT and 'john' in cell.value:
                rowIndex = index
                break

        # If we found the row, print it
        if rowIndex != -1:
            for cell in sheet.row(rowIndex):
                print(cell.value)

        book.unload_sheet(name)
This isn't as straightforward as opening a plain text file and will require some sort of external module since nothing is built-in to do this. Here are some options:
http://www.python-excel.org/
If possible, you may want to consider exporting the excel spreadsheet as a CSV file and then using the built-in python csv module to read it:
http://docs.python.org/library/csv.html
There's the openpyxl package:
>>> from openpyxl import load_workbook
>>> wb2 = load_workbook('test.xlsx')
>>> print wb2.get_sheet_names()
['Sheet2', 'New Title', 'Sheet1']
>>> worksheet1 = wb2['Sheet1'] # one way to load a worksheet
>>> worksheet2 = wb2.get_sheet_by_name('Sheet2') # another way to load a worksheet
>>> print(worksheet1['D18'].value)
3
>>> for row in worksheet1.iter_rows():
>>> print row[0].value()
You can use xlpython package that requires xlrd only.
Find it here https://pypi.python.org/pypi/xlpython
and its documentation here https://github.com/morfat/xlpython
This may help:
This creates a node that takes a 2D list (a list of lists) and pushes its items into the Excel spreadsheet. Make sure all the IN[]s are present, or it will throw an exception.
This is a rewrite of the Revit Excel Dynamo node for Excel 2013, as the default prepackaged node kept breaking. I also have a similar read node. The Excel syntax in Python is touchy.
thnx #CodingNinja - updated : )
### Export Excel - intended to replace malfunctioning excel node
#
# Dynamo (IronPython 2) node script that writes a 2D list into an Excel
# worksheet through the Office interop assembly.
#
# Inputs (every IN[] port must be connected):
#   IN[0]  file name of the workbook
#   IN[1]  worksheet name
#   IN[2]  row offset (values <= 0 are treated as 0)
#   IN[3]  column offset (values <= 0 are treated as 0)
#   IN[4]  data: a list of rows, each row a list of cell values
#   IN[5]  overwrite flag; when False, existing cell content raises an error
#   IN[6]  whether the Excel window is visible during the operation
import os.path

import clr
clr.AddReferenceByName('Microsoft.Office.Interop.Excel, Version=15.0.0.0, Culture=neutral, PublicKeyToken=71e9bce111e9429c')
## AddReferenceGUID("{00020813-0000-0000-C000-000000000046}") ''Excel C:\Program Files\Microsoft Office\Office15\EXCEL.EXE
## Need to verify interop for version 2015 is 15 and node attachment for it.
from Microsoft.Office.Interop import *  ## Excel

################################ Initialize file path and sheet ID
## Same functionality as the excel node
strFileName = IN[0]  ## Filename
sheetName = IN[1]    ## Sheet
Data = IN[4]         ## Data
Overwrite = IN[5]    ## Check for auto-overwrite
# Only positive offsets are honoured; anything else falls back to 0.
RowOffset = IN[2] if IN[2] > 0 else 0
ColOffset = IN[3] if IN[3] > 0 else 0
# Excel is shown unless IN[6] is explicitly False.
XLVisible = IN[6] != False

xlCellTypeLastCell = 11  # Excel's XlCellType.xlCellTypeLastCell constant

################################ Connect to Excel and open the workbook
xls = Excel.ApplicationClass()  # connect with the application
xls.Visible = XLVisible
xls.DisplayAlerts = False       # suppress Excel's confirmation dialogs

if os.path.isfile(strFileName):
    wb = xls.Workbooks.Open(strFileName, False)
else:
    # Add has to be *called* to create the workbook; a bare `Workbooks.add`
    # only yields a method reference, and SaveAs would then fail.
    wb = xls.Workbooks.Add()
    wb.SaveAs(strFileName)
wb.Application.Visible = XLVisible

try:
    ws = wb.Worksheets(sheetName)  # get the sheet from the workbook
except Exception:
    # The sheet does not exist yet: add it and give it the requested name.
    ws = wb.Sheets.Add()
    ws.Name = sheetName

################################ Extent of the used range
# NOTE(review): lastRow/lastCol are not used by the loops below; kept for
# parity with the read node -- confirm whether they can be dropped.
lastRow = ws.UsedRange.SpecialCells(xlCellTypeLastCell).Row
lastCol = ws.UsedRange.SpecialCells(xlCellTypeLastCell).Column

################################ Refuse to overwrite existing content
if Overwrite == False:
    # Look ahead at every *target cell* (not at the data item, which has no
    # Value2) and stop before anything is written if one is already filled.
    for r, row in enumerate(Data):
        for c, col in enumerate(row):
            existing = ws.Cells[r + 1 + RowOffset, c + 1 + ColOffset].Value2
            if existing not in (None, ""):
                OUT = "ERROR- Cannot overwrite"
                raise ValueError("ERROR- Cannot overwrite")

################################ Write the data
# enumerate() is 0-based while Excel cells are 1-based, hence the +1.
for r, row in enumerate(Data):
    for c, col in enumerate(row):
        ws.Cells[r + 1 + RowOffset, c + 1 + ColOffset].Value2 = str(col)

# NOTE(review): the workbook is not saved or closed after the cells are
# written -- confirm whether a wb.Save() is expected here.
## run macro disabled for debugging excel macro
## xls.Application.Run("Align_data_and_Highlight_Issues")
import os

import pandas as pd

# os.listdir returns names in arbitrary order; sort them so that an index
# always refers to the same file.
files = sorted(os.listdir('path/to/files/directory/'))
i = 0  # index of the file you want to open
desiredFile = files[i]
filePath = 'path/to/files/directory/%s'
Ofile = filePath % desiredFile
# read_excel parses .xls/.xlsx workbooks (read_csv only handles text files).
xls_import = pd.read_excel(Ofile)
Now you can use the power of pandas DataFrames!
This code worked for me with Python 3.5.2. It opens and saves a CSV file that Excel can read. I am currently working on how to save data into the file, but this is the code:
import csv

# On Python 3 the csv module needs a text-mode file opened with newline="";
# a file opened with "wb" makes every writerow() call fail with a TypeError.
# The with-block closes the file, which flushes the rows to disk.
with open("file1.csv", "w", newline="") as csv_file:
    excel = csv.writer(csv_file)
    # Write rows here, while the file is still open.

Categories