Python/Excel - Slice extracted excel data - exclude rows maintain structure - python

So I'm attempting to exclude the top three rows during a data extraction.
for col_num in xrange(sheet.ncols):
col = sheet.col_values(col_num, start_rowx=3, end_rowx=None)
writer.writerow(col) #this syntax also may be skewing my results as well
This for loop eliminates the top 3 rows but then turns the rows into columns.
Any advice on how to maintain the data structure but at the same time eliminate rows?
Full script below:
import glob
import os
import xlrd
import csv
ROOTDIR = r'C:\Users\username\Desktop\Mults'
wb_pattern = os.path.join(ROOTDIR, '*.xlsx')
workbooks = glob.glob(wb_pattern)
with open('merged.csv', 'wb') as outcsv:
writer = csv.writer(outcsv)
for wb in workbooks:
book_path = os.path.join(ROOTDIR, wb)
book = xlrd.open_workbook(book_path)
sheet = book.sheet_by_index(0)
for colx in xrange(sheet.ncols):
col = sheet.col_values(colx, start_rowx=2, end_rowx=None)
writer.writerow(col) #this syntax also may be skewing my results
Thank you!
Any help is much appreciated!

If you want row values, why are you pulling the columns to write as rows? Pull the row values and write those:
import glob
import os
import xlrd
import csv
# Merge the first sheet of every workbook in ROOTDIR into one CSV file,
# skipping the leading rows of each sheet.
ROOTDIR = r'C:\Users\username\Desktop\Mults'
wb_pattern = os.path.join(ROOTDIR, '*.xlsx')
workbooks = glob.glob(wb_pattern)
start_rownum = 3  # zero-based index of the first row to copy (rows 0-2 are skipped)

# The script targets Python 2 (it uses xrange), where csv files are opened
# in binary mode.
with open('merged.csv', 'wb') as outcsv:
    writer = csv.writer(outcsv)
    for wb in workbooks:
        book_path = os.path.join(ROOTDIR, wb)
        book = xlrd.open_workbook(book_path)
        sheet = book.sheet_by_index(0)
        # xlrd exposes the row count as ``nrows``; there is no ``numrows``.
        for rownum in xrange(start_rownum, sheet.nrows):
            # row_values() returns one spreadsheet row, so the CSV keeps the
            # sheet's row/column orientation.
            writer.writerow(sheet.row_values(rownum))

Related

How can we get file name, sheet name, max rows, and max columns for all Excel files in a folder?

I am trying to get the file name, sheet name, max rows, and max columns of each sheet in each Excel file. I did some research today on how to use Python to take an inventory of Excel files in a folder. I put together the code below and it seems to get me the file name and sheet name, but it gets stuck on the rows and columns. As I know, the rows and columns are strings, right. I'm trying to accommodate that requirement, but something seems to be off here. Can someone tell me what's wrong here?
import openpyxl
import glob
import pandas as pd
inventory = []
all_data = pd.DataFrame()
path = '\\Users\\ryans\\OneDrive\\Desktop\\sample\\*.xlsx'
for f in glob.glob(path):
print(f)
inventory.append(f)
theFile = openpyxl.load_workbook(f)
sheetnames = theFile.active
for sheet in sheetnames:
print(sheet)
inventory.append(sheet)
row_count = str(sheet.max_row)
col_count = str(sheet.max_col)
inventory.append(row_count)
inventory.append(col_count)
print(inventory)
To iterate over the worksheets in a workbook, you should use for sheet in theFile.worksheets. theFile.active is a single worksheet, so your current loop is actually iterating over the rows of the active sheet, not over the sheets.
sheet.max_col is also not a valid attribute; use sheet.max_column instead.
So your working code is now:
import openpyxl
import glob
# Build a flat inventory list: file path, then for each of its sheets the
# sheet name, row count and column count (counts stored as strings).
inventory = []
path = '\\Users\\ryans\\OneDrive\\Desktop\\sample\\*.xlsx'
for f in glob.glob(path):
    inventory.append(f)
    theFile = openpyxl.load_workbook(f)
    for sheet in theFile.worksheets:
        # Store the sheet's name rather than the Worksheet object itself, so
        # the inventory holds plain strings only.
        inventory.append(sheet.title)
        inventory.append(str(sheet.max_row))
        inventory.append(str(sheet.max_column))
print(inventory)

Read from multiple Excel files and write to one file

I am trying to read data from multiple xls files and write it to one single file.
My code below is writing only the first file. Not sure what I am missing.
import glob import os import pandas as pd
def list_files(dir):
    """Return the paths of all files found under *dir*, recursively.

    The tree is walked with ``os.walk``; each file name is joined onto the
    directory it was found in. An empty list is returned when no files exist.
    """
    # NOTE: ``dir`` shadows the builtin, but the parameter name is kept so
    # existing keyword callers continue to work.
    return [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(dir)
        for name in files
    ]
files = list_files("C:\\Users\\12345\\BOFS")
for file in files:
df = pd.read_excel(file)
new_header = df.iloc[1]
df = df[2:]
df.columns = new_header
with pd.ExcelWriter("C:\\Users\\12345\\Test\\Test.xls", mode='a') as writer:
df.to_excel(writer,index=False, header=True,)
Documentation says:
ExcelWriter can also be used to append to an existing Excel file:
with pd.ExcelWriter('output.xlsx',
mode='a') as writer:
df.to_excel(writer, sheet_name='Sheet_name_3')
And that probably replaces the given sheet.
But you could use pd.concat(<dataframes>) to concatenate dataframes and write all data at once in a single sheet.
I tested this piece of code; hopefully it works in your case.
import glob
import os

import pandas as pd

os.chdir("D:/Data Science/stackoverflow")

# Read every workbook first and concatenate once. DataFrame.append was
# removed in pandas 2.0, and the original never initialised all_data.
# NOTE: output.xlsx is written into this same folder, so a second run would
# pick it up as an input too.
frames = [pd.read_excel(file) for file in glob.glob("*.xlsx")]
# pd.concat raises on an empty list, so fall back to an empty frame.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# now save the data frame; the context manager writes and closes the file
# (ExcelWriter.save() was removed in pandas 2.0)
with pd.ExcelWriter('output.xlsx') as writer:
    all_data.to_excel(writer, sheet_name='sheet1')

Multiple tabs in single excel

I am using the code below to create a single Excel workbook with one tab per CSV file found in a path. I have two files in that path, but instead of getting two tabs in a single workbook I get one blank tab. Please help me fix this code.
import os
import glob
import xlsxwriter
import csv
import pandas
path='/axp/buanalytics/csgsn/dev/GSN/VGEN_Files/Test/Tulu/VG/Data/'
flist = [os.path.basename(x) for x in glob.glob(os.getcwd() + '/axp/buanalytics/csgsn/dev/GSN/VGEN_Files/Test/Tulu/VG/Data/*.csv')]
workbook = xlsxwriter.Workbook('/axp/buanalytics/csgsn/dev/GSN/VGEN_Files/Test/Tulu/VG/Data/split_book.xlsx')
for sh in flist:
worksheet = workbook.add_worksheet(sh)
with open(sh, 'rb') as f:
reader = csv.reader(f)
for r, row in enumerate(reader):
for c, col in enumerate(row):
worksheet.write(r, c, col)
workbook.close()
Three problems:
1) flist = [os.path.basename(x) for x in glob.glob(os.getcwd() + '/axp/buanalytics/csgsn/dev/GSN/VGEN_Files/Test/Tulu/VG/Data/*.csv')]
Assuming that os.getcwd() is the same as your path, you will end up with the pathname twice. This means that flist will be empty. Since you have gone through the trouble of setting path, why not just
flist = [os.path.basename(x) for x in glob.glob(path + '*.csv')]
2) Same as above
# Build the output path from ``path`` and hand it to xlsxwriter; a bare
# string has no add_worksheet()/close() methods.
workbook = xlsxwriter.Workbook(path + 'split_book.xlsx')
3) The file should be opened as a text file
with open(sh, 'r') as f
Try that and your program should work. You don't need pandas for this - is that for later?
Read the files using pandas and combine all of them
import os
import pandas as pd
source_dir = "Your Directory/"
# get names of csv files in the directory; other file types are skipped so
# they are never handed to read_csv
csv_names = [name for name in os.listdir(source_dir) if name.lower().endswith(".csv")]

# The context manager writes and closes the workbook on exit
# (ExcelWriter.save() was removed in pandas 2.0).
with pd.ExcelWriter('Multiple Workbooks.xlsx', engine='xlsxwriter') as writer:
    for files in csv_names:
        df = pd.read_csv(os.path.join(source_dir, files))  # read csv file
        # splitext drops the extension whatever its length; Excel limits
        # sheet names to 31 characters, so longer names are truncated.
        filename = os.path.splitext(files)[0][:31]
        df.to_excel(writer, sheet_name=filename)  # add to workbook
In short, you can write each dataframe to its own tab using:
# Each to_excel call adds one tab; leaving the ``with`` block writes the
# workbook to disk (without it the file is never saved).
with pd.ExcelWriter('Multiple Workbooks.xlsx', engine='xlsxwriter') as writer:
    df1.to_excel(writer, sheet_name="SheetName")
    df2.to_excel(writer, sheet_name="SheetName2")

stripping data from two XLSX cells to csv

I'm having an issue where I'm trying to take data from two cells in an Excel spreadsheet and put them into a CSV file. The data is lat and lon coordinates, so they have to be side by side to be read by the program. Here is what I have:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xlwt
import xlrd
import csv
import os, openpyxl, glob
from openpyxl import Workbook
with open ('test.csv', 'wb') as csvfile:
spamwriter = csv.writer(csvfile, delimiter=',')
for file in glob.glob ("/test"):
wb = openpyxl.load_workbook('test-data.xlsx')
ws = wb.active
def lat():
for row in ws.iter_rows('Q2:Q65536'):
for cell in row:
lat = cell.value
return lat
def lon():
for row in ws.iter_rows('R2:R65536'):
for cell in row:
lon = cell.value
return lon
cord=lat()+","+lon()
print (lat()+","+lon()) #just to see if its working
#spamwriter.writerow([cord]) uncomment to write to file
However, it only gives me the first row of data, not the rest of the rows (test-data has around 1500 rows). How would I make it go through the whole file?
This may not be the most dynamic way, but I would use pandas for this task. It has built in pd.read_excel() and pd.to_csv() functions.
import pandas as pd
import string
# Zero-based index that corresponds to an Excel column letter (use lower
# case). Only works for single-letter columns (A-Z), not AA, AB, ...
# string.ascii_lowercase exists on both Python 2 and 3; string.lowercase is
# Python 2 only.
latColumn = string.ascii_lowercase.index('q')
longColumn = string.ascii_lowercase.index('r')
# ``usecols`` replaces the removed ``parse_cols`` keyword of read_excel.
data = pd.read_excel('test-data.xlsx', 'Sheet1', usecols=[latColumn, longColumn])
# Total number of rows being read in 65536 - 2 = 65534
csvOut = "foo.csv"
data[:65534].to_csv(csvOut, index=False, header=False)
If you need to append to the file rather than replace it, change the data[:65534].to_csv(....) call to
# Open in append mode so existing rows are kept; ``with`` closes the file.
with open(csvOut, 'a') as f:  # append to the .csv file of your liking
    data[:65534].to_csv(f, index=False, header=False)

How to concatenate three excels files xlsx using python?

Hello, I would like to concatenate three Excel (.xlsx) files using Python.
I have tried using openpyxl, but I don't know which function could help me to append three worksheet into one.
Do you have any ideas how to do that ?
Thanks a lot
Here's a pandas-based approach. (It's using openpyxl behind the scenes.)
import pandas as pd
# Workbooks to stack, in output order.
excel_names = ["xlsx1.xlsx", "xlsx2.xlsx", "xlsx3.xlsx"]

# Open every workbook up front.
excels = list(map(pd.ExcelFile, excel_names))

# Parse the first sheet of each workbook as raw rows: no header row and no
# index column are inferred, so the header stays in the data as row 0.
frames = []
for book in excels:
    first_sheet = book.sheet_names[0]
    frames.append(book.parse(first_sheet, header=None, index_col=None))

# Every frame after the first loses its first row, i.e. the repeated header
# (assumes the header is the first row).
for position in range(1, len(frames)):
    frames[position] = frames[position][1:]

# Stack the frames vertically and write them out without header or index.
combined = pd.concat(frames)
combined.to_excel("c.xlsx", header=False, index=False)
I'd use xlrd and xlwt. Assuming you literally just need to append these files (rather than doing any real work on them), I'd do something like: Open up a file to write to with xlwt, and then for each of your other three files, loop over the data and add each row to the output file. To get you started:
import xlwt
import xlrd
# Append the first sheet of each input workbook, row by row, to one output
# sheet. This is all untested; essentially just pseudocode for concept!
wkbk = xlwt.Workbook()
outsheet = wkbk.add_sheet('Sheet1')
xlsfiles = [r'C:\foo.xlsx', r'C:\bar.xlsx', r'C:\baz.xlsx']
outrow_idx = 0  # next free row in the output sheet
for f in xlsfiles:
    insheet = xlrd.open_workbook(f).sheets()[0]
    row_count, col_count = insheet.nrows, insheet.ncols
    for row_idx in xrange(row_count):
        # Input row N of this file lands N rows below the file's start.
        target_row = outrow_idx + row_idx
        for col_idx in xrange(col_count):
            value = insheet.cell_value(row_idx, col_idx)
            outsheet.write(target_row, col_idx, value)
    # Advance past everything this file contributed.
    outrow_idx += row_count
wkbk.save(r'C:\combined.xls')
If your files all have a header line, you probably don't want to repeat that, so you could modify the code above to look more like this:
# Variant of the copy loop that keeps the header row of the first file only.
firstfile = True  # Is this the first sheet?
for f in xlsfiles:
    insheet = xlrd.open_workbook(f).sheets()[0]
    # Start at row 0 for the first file, at row 1 (past the header) afterwards.
    if firstfile:
        start_row = 0
    else:
        start_row = 1
    for row_idx in xrange(start_row, insheet.nrows):
        pass  # processing; etc
    firstfile = False  # We're done with the first sheet.
When I combine excel files (mydata1.xlsx, mydata2.xlsx, mydata3.xlsx) for data analysis, here is what I do:
import pandas as pd
import numpy as np
import glob
# Read every matching workbook, then concatenate once. DataFrame.append was
# removed in pandas 2.0, and a single concat avoids re-copying the
# accumulated frame on every iteration.
frames = [pd.read_excel(f) for f in glob.glob('myfolder/mydata*.xlsx')]
# pd.concat raises on an empty list, so fall back to an empty frame.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
Then, when I want to save it as one file:
# The context manager writes and closes the workbook on exit
# (ExcelWriter.save() was removed in pandas 2.0).
with pd.ExcelWriter('mycollected_data.xlsx', engine='xlsxwriter') as writer:
    all_data.to_excel(writer, sheet_name='Sheet1')
Solution with openpyxl only (without a bunch of other dependencies).
This script should take care of merging together an arbitrary number of xlsx documents, whether they have one or multiple sheets. It will preserve the formatting.
There's a function to copy sheets in openpyxl, but it is only from/to the same file. There's also a function insert_rows somewhere, but by itself it won't insert any rows. So I'm afraid we are left to deal (tediously) with one cell at a time.
As much as I dislike using for loops and would rather use something compact and elegant like list comprehension, I don't see how to do that here as this is a side-effect show.
Credit to this answer on copying between workbooks.
#!/usr/bin/env python3
#USAGE
#mergeXLSX.py <a bunch of .xlsx files> ... output.xlsx
#
#where output.xlsx is the unified file
#This works FROM/TO the xlsx format. Libreoffice might help to convert from xls.
#localc --headless --convert-to xlsx somefile.xls
import sys
from copy import copy
from openpyxl import load_workbook,Workbook
def createNewWorkbook(manyWb):
    """Copy every sheet of every workbook in manyWb into the global theOne.

    For each source sheet a sheet of the same name is created in ``theOne``
    and filled cell by cell through ``copySheet``.
    """
    for sourceBook in manyWb:
        for title in sourceBook.sheetnames:
            created = theOne.create_sheet(title)
            # Look the new sheet up by the title it actually received.
            # NOTE(review): presumably openpyxl may adjust a duplicate title,
            # which is why the requested name is not reused here - confirm.
            copySheet(sourceBook[title], theOne[created.title])
def copySheet(sourceSheet,newSheet):
    """Copy cell values, and styles where present, from sourceSheet to newSheet.

    Each cell is written to the same row/column position in ``newSheet``.
    Style attributes are only copied for cells whose ``has_style`` is true.
    """
    # Copied in this order, each through copy() so the two cells never end
    # up holding the same style object.
    styleAttrs = ("font", "border", "fill", "number_format",
                  "protection", "alignment")
    for sourceRow in sourceSheet.rows:
        for src in sourceRow:
            dst = newSheet.cell(row=src.row, column=src.col_idx,
                                value=src.value)
            if not src.has_style:
                continue
            for attr in styleAttrs:
                setattr(dst, attr, copy(getattr(src, attr)))
# Script body: every argument but the last is an input workbook; the last
# argument is the path of the merged output file.
if len(sys.argv) < 3:
    # Without this guard, no arguments makes pop() fail on an empty list and
    # a lone output path leaves an empty workbook to save.
    sys.exit("usage: mergeXLSX.py <input.xlsx> [<input.xlsx> ...] output.xlsx")
filesInput = sys.argv[1:]
theOneFile = filesInput.pop(-1)
myfriends = [ load_workbook(f) for f in filesInput ]
#try this if you are bored
#myfriends = [ openpyxl.load_workbook(f) for k in range(200) for f in filesInput ]
theOne = Workbook()
del theOne['Sheet'] #We want our new book to be empty. Thanks.
createNewWorkbook(myfriends)
theOne.save(theOneFile)
Tested with openpyxl 2.5.4, python 3.4.
You can simply use pandas and os library to do this.
import pandas as pd
import os
#create an empty dataframe which will have all the combined data
# Folder that processed workbooks are moved into so a re-run skips them.
processedDir = 'path to some other folder'

# Collect one frame per workbook and concatenate once at the end
# (DataFrame.append was removed in pandas 2.0).
frames = []
for files in os.listdir():
    #make sure you are only reading excel files
    if files.endswith('.xlsx'):
        frames.append(pd.read_excel(files, index_col=None))
        #move the files to other folder so that it does not process multiple times
        # Keep the file name in the destination: renaming every workbook to
        # the same bare target would make the files collide.
        os.rename(files, os.path.join(processedDir, files))

# pd.concat raises on an empty list, so fall back to an empty frame.
mergedData = pd.concat(frames) if frames else pd.DataFrame()
The mergedData DataFrame will have all the combined data, which you can export to a separate Excel or CSV file. The same code will work with CSV files as well; just replace the extension in the if condition.
Just to add to p_barill's answer, if you have custom column widths that you need to copy, you can add the following to the bottom of copySheet:
# Copy only the width. Assigning the source ColumnDimension object itself
# would put one object into both worksheets.
# NOTE(review): assumes newSheet.column_dimensions creates missing entries on
# lookup - confirm against the openpyxl version in use.
for col, sourceDim in sourceSheet.column_dimensions.items():
    newSheet.column_dimensions[col].width = sourceDim.width
I would just post this in a comment on his or her answer but my reputation isn't high enough.

Categories