Excel sniffing, regex, and substring matching - python

I have this code that searches for a "phrase" in a "column" within all of the spreadsheets in a directory, and then writes the matching date, time, and position to an "output.csv" (the position is on the same row as the match, but the date and time are in a row 0-7 rows above the 'phrase' row). I need it to find the "phrase" within a cell, but right now it only works for exact matches. If a cell in column 20 contained "phrase one", the example below wouldn't write anything to the output file.
import os
import csv
import datetime
from datetime import time

import xlrd
from xlrd import open_workbook

# edit these params
outputfile = 'output.csv'
phrase = 'phrase'
column = 20
rootdir = '.'

def writeToCSV(datalist, outputfile):
    with open(outputfile, 'w') as f:
        for sublist in datalist:
            for item in sublist:
                f.write(item + ',')
            f.write('\n')

def getdata(filename, row):
    wb = open_workbook(filename)
    items = []
    for sheet in wb.sheets():
        number_of_rows = sheet.nrows
        # walk upwards from the matched row; the date/time row sits 0-7 rows above
        for row1 in range(row, row - 10, -1):
            if row1 >= 0 and row1 < number_of_rows:
                rowNo = sheet.cell(row1, 2).value
                try:
                    if rowNo != '' and int(rowNo):
                        datetime1 = datetime.datetime(*xlrd.xldate_as_tuple(sheet.cell_value(rowx=row1, colx=0), wb.datemode))
                        date_values = xlrd.xldate_as_tuple(sheet.cell_value(rowx=row1, colx=1), wb.datemode)
                        time_value = time(*date_values[3:])
                        items.append(str(rowNo))
                        items.append(str(datetime1))
                        items.append(str(time_value))
                        break
                except Exception:
                    pass
    return items

def extractData(filename, searchString, column):
    wb = open_workbook(filename)
    dataList = []
    for sheet in wb.sheets():
        number_of_rows = sheet.nrows
        number_of_columns = sheet.ncols
        for row in range(1, number_of_rows):
            for col in range(number_of_columns):
                value = sheet.cell(row, col).value
                if value == searchString:  # exact-match comparison
                    if col == column:
                        data = getdata(filename, row)
                        dataList.append(data)
    return dataList

def main():
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            fullname = os.path.join(subdir, file)
            parts = subdir.split('\\')
            date = ''
            if len(parts) > 2:
                date = parts[1].split('-')[1] + '-' + parts[2]
            if date != '':
                namelist = file.split('-')
                if len(namelist) > 2:
                    if (namelist[0] in date) and (namelist[1] in date):
                        data = extractData(fullname, phrase, column)
                        if len(data) > 0:
                            writeToCSV(data, outputfile)

if __name__ == '__main__':
    main()
I understand that regex can easily find substrings within a string, but I don't see exactly where to make the modification in the code. In another language, or if the code were written differently, I would add an if statement that writes the data to the output file when the string contains "phrase", but I can't determine where this code checks that the phrase matches the cell value. Any insight on this is appreciated.

In the function extractData you make the comparison if value == searchString :. That is where you check whether the string value (from your Excel file) is exactly the same as the searchString (your "phrase").
You can replace that with Python's membership test: searchString in value. The line should then look like if searchString in value:. You do not need regex if you are only looking for substrings.
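A minimal, hypothetical example of the difference (the str() wrapper is my addition, to guard against non-text cells, which would otherwise make the in test raise a TypeError):
value = 'phrase one'  # example value read from column 20
print(value == 'phrase')       # False: exact match fails
print('phrase' in str(value))  # True: substring match succeeds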

There are a few things that seem to be causing issues, but the main one might be that your extractData function finds your search string in a row of one specific sheet, while your getdata function then uses that same row index in all of the sheets in the workbook, without checking that every sheet even has that many rows. It would be better to pass along the sheet in which you found the search string, and have getdata search just that specific sheet, as in the sketch below.
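A minimal sketch of that restructuring, reusing the question's names and folding in the substring match from the answer above (untested against the original workbooks, so treat it as a starting point rather than a drop-in replacement):
import datetime
from datetime import time

import xlrd
from xlrd import open_workbook

def extractData(filename, searchString, column):
    wb = open_workbook(filename)
    dataList = []
    for sheet in wb.sheets():
        for row in range(1, sheet.nrows):
            value = sheet.cell(row, column).value
            if searchString in str(value):  # substring match
                # pass the matched sheet along instead of re-opening the workbook
                dataList.append(getdata(sheet, wb.datemode, row))
    return dataList

def getdata(sheet, datemode, row):
    items = []
    # walk upwards from the matched row, on the same sheet only
    for row1 in range(row, row - 10, -1):
        if 0 <= row1 < sheet.nrows:
            rowNo = sheet.cell(row1, 2).value
            try:
                if rowNo != '' and int(rowNo):
                    dt = datetime.datetime(*xlrd.xldate_as_tuple(sheet.cell_value(row1, 0), datemode))
                    tv = time(*xlrd.xldate_as_tuple(sheet.cell_value(row1, 1), datemode)[3:])
                    items.extend([str(rowNo), str(dt), str(tv)])
                    break
            except (ValueError, TypeError):
                pass
    return items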

Related

TabError: inconsistent use of tabs and spaces in indentation when adding to a dictionary

I am trying to move selected images from nested subdirectories. I am matching SKUs from an excel file to the image names (which are also the SKU numbers). Any that match are then moved into a new folder.
My challenge: when I try to create a dictionary to save my full directory paths, I am faced with the following error message.
File "c:\printing\python\data_clean.py", line 56
fullpath_filelist = {file: os.path.join(root,dirs, file}
^
TabError: inconsistent use of tabs and spaces in indentation
#! python 3
# Create clean version of data file
import openpyxl, webbrowser, sys, re, os, shutil

print('Opening workbook')

#*********************
Main_Searchterm = 'Find'
Sub_Searchterm = 'Marine'
Data_path = 'C:\Printing\Python\data\datafile.xlsx'
Image_folder = 'C:\Printing\Python\data\images'
Sorted_folder = 'C:\Printing\Python\data\sorted'
#**********************

def find_category():
    wb = openpyxl.load_workbook(Data_path)
    sheet = wb['Sheet1']

    # This looks for the main search term and puts it into column 6
    for rowNum in range(2, sheet.max_row + 1):
        category = sheet['E' + str(rowNum)].value  # this controls which column to search
        keywordRegex = re.compile(Main_Searchterm)
        mo = keywordRegex.search(category)
        try:
            if mo.group() == Main_Searchterm:
                sheet.cell(row=rowNum, column=6).value = Main_Searchterm  # this controls which column the new search term is added to
        except:
            pass

    # This looks for the sub search term and puts it into column 7
    for rowNum in range(2, sheet.max_row + 1):
        category = sheet['E' + str(rowNum)].value  # this controls which column to search
        keywordRegex = re.compile(Sub_Searchterm)
        mo = keywordRegex.search(category)
        try:
            if mo.group() == Sub_Searchterm:
                sheet.cell(row=rowNum, column=7).value = Sub_Searchterm  # this controls which column the new search term is added to
        except:
            pass

    wb.save(Data_path)

    wb = openpyxl.load_workbook(Data_path)
    sheet = wb['Sheet1']

    filelist = []  # list of all files in directory and subdirectory
    fullpath_filelist = {}
    for root, dirs, files in os.walk(Image_folder):
        for file in files:
            # append the file name to the list
            filelist.append(file)
            fullpath_filelist = {file: os.path.join(root,dirs, file}

    for filename in filelist:
        for rowNum in range(2, sheet.max_row + 1):
            image = sheet['H' + str(rowNum)].value  # this controls which column to search
            final_path = os.path.join(root, Main_Searchterm, Sub_Searchterm, filename)
            if str(image) == str(filename):
                shutil.move(filename, final_path)

find_category()
Depending on the IDE, use find-and-replace (Ctrl-F) on '\t' and replace it with ' ' (4 spaces), so that the file's indentation is consistent throughout.
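If there are many such lines, the same normalization can be scripted; this is my own suggestion rather than part of the answer above, using the path from the traceback:
# replace every tab with four spaces, in place
path = r'c:\printing\python\data_clean.py'
with open(path) as f:
    src = f.read()
with open(path, 'w') as f:
    f.write(src.replace('\t', '    '))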

Get all PDF files name under same folder and save in excel according to PDF file name

I have PDF files in the same folder. How can I get all the PDF file names and save an excel file named after each PDF file?
This is what I have tried
def get_files(pdf_path):
    import os
    os.chdir(pdf_path)
    files = os.listdir()
    files = [x for x in files if x.endswith(".pdf")]
    return files

files = get_files(pdf_path)
for i in files:
    save_as_excel(pdf_path, i)
As discussed in chat, this is the continuation of your previous question, which I answered. In the previous question I showed how you can extract text from a pdf file that contains multiple data entities. Now you want to extract the text, parse the content, and save the data as csv/xlsx for all pdf files present in the folder.
Please go through all the steps below; all you need to change is the path to your pdf files, path_of_pdf_files.
The assumptions and logic remain the same as in my previous answer.
I have moved the data and methods into a class, PdfExtractor.
Follow the steps below to extract text from the pdfs and save it as xlsx.
Before moving ahead, install the packages pdfplumber and xlsxwriter.
Save the code below with the filename PdfExtractor.py:
import pdfplumber
import xlsxwriter
import re

# regex patterns for keys in line1 of a data entity
my_regex_dict_line1 = {
    'Our Ref' : r'Our Ref :(.*?)Name',
    'Name' : r'Name:(.*?)Ref 1',
    'Ref 1' : r'Ref 1 :(.*?)Ref 2',
    'Ref 2' : r'Ref 2:(.*?)$'
}

# regex patterns for keys in line2 of a data entity
my_regex_dict_line2 = {
    'Amount' : r'Amount:(.*?)Total Paid',
    'Total Paid' : r'Total Paid:(.*?)Balance',
    'Balance' : r'Balance:(.*?)Date of A/C',
    'Date of A/C' : r'Date of A/C:(.*?)Date Received',
    'Date Received' : r'Date Received:(.*?)$'
}

# regex patterns for keys in line3 of a data entity
my_regex_dict_line3 = {
    'Last Paid' : r'Last Paid:(.*?)Amt Last Paid',
    'Amt Last Paid' : r'Amt Last Paid:(.*?)A/C\s+Status',
    'A/C Status': r'A/C\s+Status:(.*?)Collector',
    'Collector' : r'Collector :(.*?)$'
}

class PdfExtractor:
    data_entity_sep_pattern = r'(?=Our Ref.*?Name.*?Ref 1.*?Ref 2)'

    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        self.json_data = {}
        self.pdf_text = ''

    def __preprocess_data(self, data):
        return [el.strip() for el in data.splitlines() if el.strip()]

    def __get_header_data(self, text):
        header_data_list = self.__preprocess_data(text)
        # third line in the header text contains the Date Created field
        self.json_data['Date Created'] = re.search(r'Date Created:(.*?)$', header_data_list[2]).group(1).strip()
        # fourth line contains Number of Pages
        self.json_data['Number of Pages'] = re.search(r'Number of Pages:(.*?)$', header_data_list[3]).group(1).strip()
        # fifth line contains Client Code and ClientName
        self.json_data['Client Code'] = re.search(r'Client Code - (.*?)Client Name', header_data_list[4]).group(1).strip()
        self.json_data['ClientName'] = re.search(r'Client Name - (.*?)$', header_data_list[4]).group(1).strip()

    def __iterate_through_regex_and_populate_dictionaries(self, data_dict, regex_dict, text):
        ''' For the given regex_dict, iterate through each regex pattern and add the key value to the data_dict dictionary '''
        for key, regex in regex_dict.items():
            matched_value = re.search(regex, text)
            if matched_value is not None:
                data_dict[key] = matched_value.group(1).strip()

    def __populate_date_notes(self, data_dict, text):
        ''' Populate Date and Notes from the data chunk, as lists, into the data_dict dictionary '''
        data_dict['Date'] = []
        data_dict['Notes'] = []
        idx = 4
        while idx < len(text):
            date_match = re.search(r'(\d{2}/\d{2}/\d{4})', text[idx])
            data_dict['Date'].append(date_match.group(1).strip())
            notes_match = re.search(r'\d{2}/\d{2}/\d{4}\s*(.*?)$', text[idx])
            data_dict['Notes'].append(notes_match.group(1).strip())
            idx += 1

    def get_pdf_text(self):
        data_index = 1
        with pdfplumber.open(self.pdf_path) as pdf:
            index = 0
            while index < len(pdf.pages):
                page = pdf.pages[index]
                self.pdf_text += '\n' + page.extract_text()
                index += 1
        split_on_data_entity = re.split(self.data_entity_sep_pattern, self.pdf_text.strip())
        # first chunk in the split_on_data_entity list contains the header information
        self.__get_header_data(split_on_data_entity[0])
        while data_index < len(split_on_data_entity):
            data_entity = {}
            data_processed = self.__preprocess_data(split_on_data_entity[data_index])
            self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line1, data_processed[0])
            self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line2, data_processed[1])
            self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line3, data_processed[2])
            if len(data_processed) > 3 and data_processed[3] != None and 'Date' in data_processed[3] and 'Notes' in data_processed[3]:
                self.__populate_date_notes(data_entity, data_processed)
            self.json_data['data_entity' + str(data_index)] = data_entity
            data_index += 1
        return self.json_data

    def save_as_xlsx(self, file_name):
        if not self.json_data:
            print("Data was not read from PDF")
            return
        workbook = xlsxwriter.Workbook(file_name)
        worksheet = workbook.add_worksheet("Sheet 1")
        row = 0
        col = 0
        # write the column headers
        columns = ['Account History Report', 'All Notes'] + [key for key in self.json_data.keys() if 'data_entity' not in key] + list(self.json_data['data_entity1'].keys())
        worksheet.write_row(row, col, tuple(columns))
        row += 1
        column_index_map = {}
        for index, col in enumerate(columns):
            column_index_map[col] = index
        # write the header values
        worksheet.write(row, column_index_map['Date Created'], self.json_data['Date Created'])
        worksheet.write(row, column_index_map['Number of Pages'], self.json_data['Number of Pages'])
        worksheet.write(row, column_index_map['Client Code'], self.json_data['Client Code'])
        worksheet.write(row, column_index_map['ClientName'], self.json_data['ClientName'])
        data_entity_index = 1
        # iterate through each data entity and, for each key, insert the values in the sheet
        while True:
            data_entity_key = 'data_entity' + str(data_entity_index)
            row_size = 1
            if self.json_data.get(data_entity_key) != None:
                for key, value in self.json_data.get(data_entity_key).items():
                    if type(value) == list:
                        worksheet.write_column(row, column_index_map[key], tuple(value))
                        row_size = len(value)
                    else:
                        worksheet.write(row, column_index_map[key], value)
            else:
                break
            data_entity_index += 1
            row += row_size
        workbook.close()
        print(file_name + " saved successfully")
Execute the code below; it reads all the pdf files inside the folder path_of_pdf_files and saves the data as an xlsx file in the same directory. Note that it should be run from the same folder where you saved PdfExtractor.py:
import os
from PdfExtractor import PdfExtractor

path_of_pdf_files = r'C:\Users\hpoddar\Desktop\Temp'  # directory path for your pdf files

files = os.listdir(path_of_pdf_files)
for file in files:
    if not file.endswith(".pdf"):
        continue
    filename = os.path.splitext(file)[0]
    pdf_obj = PdfExtractor(os.path.join(path_of_pdf_files, file))
    pdf_text = pdf_obj.get_pdf_text()
    pdf_obj.save_as_xlsx(os.path.join(path_of_pdf_files, filename + '.xlsx'))
Output:
C:\Users\hpoddar\Desktop\Temp\sample.xlsx saved successfully
C:\Users\hpoddar\Desktop\Temp\sample2.xlsx saved successfully
C:\Users\hpoddar\Desktop\Temp\sample3.xlsx saved successfully
Let's say you have the pdf files sample.pdf, sample2.pdf and sample3.pdf in the directory: the xlsx files will be created in the same folder with the filenames sample.xlsx, sample2.xlsx and sample3.xlsx.
Let me know if you have any doubts about the above code.
If you mean saving each filename as an empty excel file, try this:
import os
import openpyxl

pdf_path = '.'

def get_files(pdf_path):
    os.chdir(pdf_path)
    files = os.listdir()
    files = [x for x in files if x.endswith(".pdf")]
    return files

files = get_files(pdf_path)

# create an empty workbook (excel file)
wb = openpyxl.workbook.Workbook()

for i in files:
    output_path = os.path.join(pdf_path, i).replace('.pdf', '.xlsx')
    # save it as an excel file named after the pdf
    wb.save(output_path)
    print(output_path)

How do I get user input into an excel spreadsheet via input() either in a csv or xlsx spreadsheet?

So far, I have been able to access csv and xlsx files in python, but I am unsure how to use input() to add user data to the spreadsheet.
I would also want this input() to be enterable only once per day, but for different columns in my spreadsheet (this is a separate issue).
Here is my code so far, first for csv, second for xlsx; I don't need both, either will do:
# writing to a CSV file
import csv

def main():
    filename = "EdProjDBeg.csv"
    header = ("Ans1", "Ans2", "Ans3")
    data = [(0, 0, 0)]
    writer(header, data, filename, "write")
    updater(filename)

def writer(header, data, filename, option):
    with open(filename, "w", newline="") as csvfile:
        if option == "write":
            clidata = csv.writer(csvfile)
            clidata.writerow(header)
            for x in data:
                clidata.writerow(x)
        elif option == "update":
            writer = csv.DictWriter(csvfile, fieldnames=header)
            writer.writeheader()
            writer.writerows(data)
        else:
            print("Option is not known")

# Updating the CSV files with new data
def updater(filename):
    with open(filename, newline="") as file:
        readData = [row for row in csv.DictReader(file)]
        readData[0]['Ans2'] = 0
        readHeader = readData[0].keys()
        writer(readHeader, readData, filename, "update")

# Reading and updating xlsx files
import openpyxl

theFile = openpyxl.load_workbook(r'C:\Users\joe_h\OneDrive\Documents\Data Analysis STUDYING\Excel\EdProjDBeg.xlsx')
print(theFile.sheetnames)
currentsheet = theFile['Customer1']
print(currentsheet['B3'].value)

wb = openpyxl.load_workbook(r'C:\Users\joe_h\OneDrive\Documents\Data Analysis STUDYING\Excel\EdProjDBeg.xlsx')
ws = wb.active

i = 0
cell_val = ''
# Finds which row is blank first
while cell_val != '':
    cell_val = ws['A' + i].value
    i += 1

# Modify Sheet, Starting With Row i

wb.save(r'C:\Users\joe_h\OneDrive\Documents\Data Analysis STUDYING\Excel\EdProjDBeg.xlsx')

x = input('Prompt: ')
This works for inputting data into an xlsx file.
Just use:
ws['A1'] = "data"
to input into cell A1
See code below for example using your original code:
import openpyxl

wb = openpyxl.load_workbook('sample.xlsx')
print(wb.sheetnames)
currentsheet = wb['Sheet']
ws = currentsheet
#ws = wb.active <-- defaults to first sheet

i = 0
cell_val = ''
# finds which row is blank first
while cell_val != None:
    i += 1
    cell_val = ws['A' + str(i)].value
    print(cell_val)

x = input('Prompt: ')

# sets the A column of the first blank row to the user input
ws['A' + str(i)] = x
# saves the spreadsheet
wb.save("sample.xlsx")
I also made a few edits to your original while loop in the above code:
When a cell is blank, None is returned, so the loop now tests against None instead of ''.
A1 is the first cell on the left, not A0 (so i += 1 was moved above the read of the cell value).
The variable i is converted to a string when accessing the cell.
See https://openpyxl.readthedocs.io/en/stable/ for the full documentation.

all data variables in the same row CSV with Python

# coding=utf-8
# Python's RegEx library.
import re
# Library for paths.
import os
import csv

# function between: return the value between two words a and b
def between(value, a, b):
    pos_a = value.find(a)  # Find and validate before-part.
    if pos_a == -1: return ""
    pos_b = value.rfind(b)  # Find and validate after-part.
    if pos_b == -1: return ""
    adjusted_pos_a = pos_a + len(a)  # Return middle part.
    if adjusted_pos_a >= pos_b: return ""
    return value[adjusted_pos_a:pos_b]

# function to scan the folder DiarioOficial
def scan_folder():
    # directory 'path'
    path = '/Users/anna/PycharmProjects/extractData/DiarioOficial'
    # file counter for the path
    count = 0
    # create the csv as csvFile
    with open('All_Companies1.csv', 'a') as csvFile:
        # iterate over all paths in the folder DiarioOficial
        for (path, dirnames, file_names) in os.walk(path):
            # iterate over all the files in the path (+ file_name)
            for file_name in file_names:
                # only the extension that is required
                if file_name.endswith(".txt"):
                    # running count of files in the DiarioOficial folder
                    count = count + 1
                    # join path + file name
                    file_path = os.path.join(path, file_name)
                    # open and read the file path
                    mensaje = open(file_path).read()
                    # remove newlines
                    mensaje = mensaje.replace("\n", "")
                    # Company Name
                    keywords_cap = ['SpA', 'SPA', 'LIMITADA', 'LTDA', 'S.A.', 'E.I.R.L.', 'S.L.']
                    # re.escape to handle metacharacters in the keywords
                    keywords_cap = list(map(re.escape, keywords_cap))
                    # sort the items by length in descending order
                    keywords_cap.sort(key=len, reverse=True)
                    obj = re.compile(r'[:,;.]\s*"?([^:,;.]*?(?<!\w)(?:{}))'.format('|'.join(keywords_cap)))
                    if obj:
                        # to obtain the first match: obj.search(mensaje).group(1)
                        company_name = obj.search(mensaje)
                    else:
                        company_name = "None"
                    # CVE number of the file
                    regex = r"\s*CVE\s+([^|]*)"
                    matches = re.search(regex, mensaje)
                    if matches:
                        company_cve = matches.group(1).strip()
                    else:
                        company_cve = "None"
                    # Section of diariooficial.interior.gob.cl
                    company_sect = between(mensaje, 'SECCIÓN', 'Núm.')
                    if not company_sect:
                        company_sect = "None"
                    # Name of the person who constitutes the company
                    company_ceo = re.search(r'\sante mí,\s+([^,]*)', mensaje)
                    if company_ceo:
                        company_ceo = company_ceo.group(1)
                    else:
                        company_ceo = "None"
                    # File number from the Section
                    num_reg = r'\sNúm.\s+([^|]*)'
                    match_num = re.search(num_reg, mensaje)
                    if match_num:
                        company_numsect = match_num.group(1)
                    else:
                        company_numsect = "None"
                    # Social capital ($)
                    cap = r"\s*(CAPITAL:\s+([^-]*)|Capital social:\s+([^-]*)|Capital:\s+([^-]*)|Capital:\s+([^,]*))"
                    caps = re.search(cap, mensaje)
                    if caps:
                        company_capital = caps.group()
                    else:
                        company_capital = 'None'
                    csvData = [company_name, company_cve, company_sect, company_ceo, company_numsect, company_capital]
                    headers = ['COMPANY NAME', 'CVE', 'SECTION', 'CEO NAME', 'NUMBER SECTOR', 'COMPANY CAPITAL']
                    writer = csv.writer(csvFile, delimiter=',')  # create a csv delimited by commas
                    writer.writerow(headers)  # write the header row
                    writer.writerow(csvData)  # write the data row
    # number of txt files
    print(count)

scan_folder()
I have this script that creates a csv from data extracted from the text files in a specific path. Leaving aside any errors there may be in the RegEx, it mainly extracts parts of the text, keeps them in variables, and writes them to a csv. Each company must have a single line in this csv, so that when the csv is opened, the number of companies and all of the information per variable can be seen.
My problem is that when I look at the CSV (in this case All_Companies1), the data is not placed in the same row; it jumps around.
Also, the header titles are repeated, and I do not want them to repeat.
First try changing the mode of the csvFile from a (append) to w (write). Also check whether the editor you're using actually treats the comma as the column delimiter for csv files, since in your screenshot the comma seems to be shown by the editor as an ordinary character.
Also remove any newline or carriage return characters (\n, \r) from your strings before writing them; this can be done with the following line:
csvData = [str(data).replace('\n', '').replace('\r', '') for data in csvData]
Note: if this works, there might still be a problem with empty rows appearing in the csv file between every two entries; this can be fixed by changing with open('All_Companies1.csv', 'a') as csvFile to with open('All_Companies1.csv', 'a', newline='') as csvFile.
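Putting those pieces together, here is a minimal sketch with the header written exactly once, outside the per-file loop (extracted_rows is a hypothetical stand-in for the per-company lists the script above builds):
import csv

headers = ['COMPANY NAME', 'CVE', 'SECTION', 'CEO NAME', 'NUMBER SECTOR', 'COMPANY CAPITAL']
# hypothetical stand-in for the rows extracted from the txt files
extracted_rows = [['ACME SpA', 'CVE 123', 'I', 'Jane Doe', 'Núm. 1', 'Capital: 1000']]

with open('All_Companies1.csv', 'w', newline='') as csvFile:  # 'w', not 'a'
    writer = csv.writer(csvFile, delimiter=',')
    writer.writerow(headers)  # header row, written exactly once
    for csvData in extracted_rows:
        # strip newlines/carriage returns so each company stays on one row
        csvData = [str(d).replace('\n', '').replace('\r', '') for d in csvData]
        writer.writerow(csvData)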

Trying to iterate through CSV files in a directory and grab a particular cell then post results to a single csv

I am trying to iterate through several CSV files in a directory, grab a particular cell (the same cell location) from each CSV file (cell location found when opened in Excel), and then post all of those cells in a single CSV or xls file, one after the other.
I have written the code below (with some researched help), but I am just iterating over the first csv file in my list, printing that same value once for each CSV file in the list. Could anybody point me in the right direction?
Here's my poor attempt!
import xlwt
import xlrd
import csv
import glob
import os

files = ['1_IQ_QTA.csv', '2_IQ_QTA.csv', '3_IQ_QTA.csv', '4_IQ_QTA.csv']
n = 0
row = 0
filename = ('outputList.csv', 'a')
fname = files[n]
workbookr = xlrd.open_workbook(fname)
sheetr = workbookr.sheet_by_index(0)
workbookw = xlwt.Workbook()
sheetw = workbookw.add_sheet('test')
while n < len(files):
    fname = files[n]
    workbookr = xlrd.open_workbook(fname[n])
    data = [sheetr.cell_value(12, 1) for col in range(sheetr.ncols)]
    for index, value in enumerate(data):
        sheetw.write(row, index, value)
    workbookw.save('outputList.csv')
    row = row + 1
    n = n + 1
workbookw.save('outputList.csv')
My code is still a bit messy, I may have leftover code from my various attempts!
Thanks
MikG
Assuming you are just trying to make a CSV file of the same cell from each file: if you had 4 files, your output file will have 4 entries.
files = ['1_IQ_QTA.csv', '2_IQ_QTA.csv', '3_IQ_QTA.csv', '4_IQ_QTA.csv']
n = 0
row = 0
outputfile = open('outputList.csv', 'w')
cellrow = 12  # collect the cell (12, 1) from each file and put it in the output list
cellcolumn = 1
while n < len(files):
    fname = files[n]
    currentfile = open(fname, 'r')
    for i in range(cellrow):
        currentrow = currentfile.readline()
        # print currentrow #for testing
    columncnt = 0
    currentcell = ''
    openquote = False
    for char in currentrow:
        if char == '"' and not openquote:
            openquote = True
        elif char == '"' and openquote:
            openquote = False
        elif char == ',' and not openquote:
            columncnt += 1
            if columncnt == cellcolumn:
                cellvalue = currentcell
                # print cellvalue #for testing
            currentcell = ''
        else:
            currentcell += char
    outputfile.write(cellvalue + ',')
    currentfile.close()
    n += 1
outputfile.close()
It seemed to me that since you already had CSV files, it would be easier to treat them as regular text files and parse through them to find the right information, with nothing extra to import. Happy coding!
I think you have an error at this line in the while loop:
workbookr = xlrd.open_workbook(fname[n])
It must be:
workbookr = xlrd.open_workbook(fname)
Otherwise your workbookr remains as you set it before, outside the loop:
fname = files[n]
workbookr = xlrd.open_workbook(fname)
which is the first file in your list. A sketch of the corrected loop follows.
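A minimal sketch of the loop with both the workbook and the sheet re-read for each file (my assumption: sheetr also has to be refreshed inside the loop, or it keeps pointing at the first file's sheet; this keeps the question's premise that xlrd can open these particular files):
import xlrd
import xlwt

files = ['1_IQ_QTA.csv', '2_IQ_QTA.csv', '3_IQ_QTA.csv', '4_IQ_QTA.csv']
workbookw = xlwt.Workbook()
sheetw = workbookw.add_sheet('test')
for row, fname in enumerate(files):
    workbookr = xlrd.open_workbook(fname)  # fname, not fname[n]
    sheetr = workbookr.sheet_by_index(0)   # re-read the sheet for each file
    sheetw.write(row, 0, sheetr.cell_value(12, 1))
workbookw.save('outputList.xls')  # xlwt writes .xls regardless of the extension given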
Since they are just csv files, there is no need for the excel libraries.
#!/usr/bin/env python
import argparse, csv

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='merge csv files on field')
    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
    parser.add_argument('infile', nargs='+', type=str, help='list of input files')
    parser.add_argument('--col', type=int, default=0, help='Column to grab')
    parser.add_argument('--row', type=int, default=0, help='Row to grab')
    parser.add_argument('--out', type=str, default='temp.csv', help='name of output file')
    args = parser.parse_args()

    data = []
    for fname in args.infile:
        with open(fname, newline='') as df:
            reader = csv.reader(df)
            for index, line in enumerate(reader):
                if index == args.row:
                    # keep each grabbed cell as its own one-column row
                    data.append([line[args.col]])
                    break

    with open(args.out, 'w', newline='') as out:
        writer = csv.writer(out, dialect='excel')
        writer.writerows(data)
