All data variables in the same row CSV with Python

# coding=utf-8
# Python's regex library.
import re
# Library for paths.
import os
import csv

# between(): return the substring of `value` between markers a and b
def between(value, a, b):
    pos_a = value.find(a)  # Find and validate the before-part.
    if pos_a == -1: return ""
    pos_b = value.rfind(b)  # Find and validate the after-part.
    if pos_b == -1: return ""
    adjusted_pos_a = pos_a + len(a)
    if adjusted_pos_a >= pos_b: return ""
    return value[adjusted_pos_a:pos_b]  # Return the middle part.

# Scan the DiarioOficial folder
def scan_folder():
    # directory 'path'
    path = '/Users/anna/PycharmProjects/extractData/DiarioOficial'
    # counter of files under the path
    count = 0
    # create the csv as csvFile
    with open('All_Companies1.csv', 'a') as csvFile:
        # iterate over every directory below DiarioOficial
        for (path, dirnames, file_names) in os.walk(path):
            # iterate over all the files in the path (+ file_name)
            for file_name in file_names:
                # keep only files with the required extension
                if file_name.endswith(".txt"):
                    # count the txt files in the DiarioOficial folder
                    count = count + 1
                    # concatenate path + file name
                    file_path = os.path.join(path, file_name)
                    # open and read the file
                    mensaje = open(file_path).read()
                    # replace newlines with nothing
                    mensaje = mensaje.replace("\n", "")
                    # Company name
                    keywords_cap = ['SpA', 'SPA', 'LIMITADA', 'LTDA', 'S.A.', 'E.I.R.L.', 'S.L.']
                    # re.escape neutralizes metacharacters in the keywords
                    keywords_cap = list(map(re.escape, keywords_cap))
                    # sort the items by length in descending order
                    keywords_cap.sort(key=len, reverse=True)
                    obj = re.compile(r'[:,;.]\s*"?([^:,;.]*?(?<!\w)(?:{}))'.format('|'.join(keywords_cap)))
                    # take the first match, if any
                    name_match = obj.search(mensaje)
                    if name_match:
                        company_name = name_match.group(1)
                    else:
                        company_name = "None"
                    # CVE number of the file
                    regex = r"\s*CVE\s+([^|]*)"
                    matches = re.search(regex, mensaje)
                    if matches:
                        company_cve = matches.group(1).strip()
                    else:
                        company_cve = "None"
                    # Section of diariooficial.interior.gob.cl
                    company_sect = between(mensaje, 'SECCIÓN', 'Núm.')
                    if not company_sect:
                        company_sect = "None"
                    # Name of the person who constitutes the company
                    company_ceo = re.search(r'\sante mí,\s+([^,]*)', mensaje)
                    if company_ceo:
                        company_ceo = company_ceo.group(1)
                    else:
                        company_ceo = "None"
                    # File number from the section
                    num_reg = r'\sNúm.\s+([^|]*)'
                    match_num = re.search(num_reg, mensaje)
                    if match_num:
                        company_numsect = match_num.group(1)
                    else:
                        company_numsect = "None"
                    # Social capital ($)
                    cap = r"\s*(CAPITAL:\s+([^-]*)|Capital social:\s+([^-]*)|Capital:\s+([^-]*)|Capital:\s+([^,]*))"
                    caps = re.search(cap, mensaje)
                    if caps:
                        company_capital = caps.group()
                    else:
                        company_capital = 'None'
                    csvData = [company_name, company_cve, company_sect, company_ceo, company_numsect, company_capital]
                    headers = ['COMPANY NAME', 'CVE', 'SECTION', 'CEO NAME', 'NUMBER SECTOR', 'COMPANY CAPITAL']
                    writer = csv.writer(csvFile, delimiter=',')  # create a comma-delimited csv
                    writer.writerow(headers)  # write the header row
                    writer.writerow(csvData)  # write the data row
    # number of txt files
    print(count)

scan_folder()
I have this script that creates a CSV with data extracted from the text files in a specific path. Leaving aside any errors in the regexes, it mainly extracts parts of the text, stores them in variables, and writes them to a CSV. Each company must have a single line in this CSV, so that when the CSV is opened, the number of companies and all of their information can be viewed variable by variable.
My problem is that when I open the resulting CSV (in this case, All_Companies1), the data for one company is not kept in the same row; it jumps across several rows.
Also, the header titles are repeated for every file, and I do not want them repeated.

First, try changing the mode for csvFile from a (append) to w (write). Also check whether the editor you're using actually treats the comma as the column delimiter for CSV files, since in your screenshot the comma seems to be treated as an ordinary character.
Also remove any carriage return or newline characters (\n, \r) from your strings before writing them, which can be done with the following code.
csvData = [str(data).replace('\n', '').replace('\r', '') for data in csvData]
Note:
If this works, you might still get an empty row in the csv file between every two entries; this can be fixed by changing with open('All_Companies1.csv', 'a') as csvFile to with open('All_Companies1.csv', 'a', newline='') as csvFile.
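Putting those pieces together, here is a minimal sketch of the corrected writing loop, assuming the folder layout from the question; extract_fields is a hypothetical stand-in for the extraction logic above:

import csv
import os

path = '/Users/anna/PycharmProjects/extractData/DiarioOficial'  # from the question
headers = ['COMPANY NAME', 'CVE', 'SECTION', 'CEO NAME', 'NUMBER SECTOR', 'COMPANY CAPITAL']

with open('All_Companies1.csv', 'w', newline='') as csvFile:
    writer = csv.writer(csvFile, delimiter=',')
    writer.writerow(headers)  # header row written once, outside the loop
    for root, dirnames, file_names in os.walk(path):
        for file_name in file_names:
            if not file_name.endswith('.txt'):
                continue
            mensaje = open(os.path.join(root, file_name)).read()
            csvData = extract_fields(mensaje)  # hypothetical: returns the six fields
            # strip stray line breaks so each company stays on one row
            csvData = [str(d).replace('\r', '').replace('\n', '') for d in csvData]
            writer.writerow(csvData)

Writing the header before the os.walk() loop is what stops the titles repeating; newline='' plus the replace() cleanup keeps each company on a single row.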

Related

TabError: inconsistent use of tabs and spaces in indentation when adding to a dictionary

I am trying to move selected images out of nested subdirectories. I am matching SKUs from an Excel file to the image names (which are also the SKU numbers). Any that match are then moved into a new folder.
My challenge is that when I try to create a dictionary to save the full directory paths, I am faced with the following error message.
File "c:\printing\python\data_clean.py", line 56
fullpath_filelist = {file: os.path.join(root,dirs, file}
^
TabError: inconsistent use of tabs and spaces in indentation
#! python 3
# Create clean version of data file
import openpyxl, webbrowser, sys, re, os, shutil

print('Opening workbook')
#*********************
Main_Searchterm = 'Find'
Sub_Searchterm = 'Marine'
Data_path = 'C:\Printing\Python\data\datafile.xlsx'
Image_folder = 'C:\Printing\Python\data\images'
Sorted_folder = 'C:\Printing\Python\data\sorted'
#**********************

def find_category():
    wb = openpyxl.load_workbook(Data_path)
    sheet = wb['Sheet1']
    # This looks for the main search term and puts it into column 6
    for rowNum in range(2, sheet.max_row + 1):
        category = sheet['E' + str(rowNum)].value  # This controls which column to search from
        keywordRegex = re.compile(Main_Searchterm)
        mo = keywordRegex.search(category)
        try:
            if mo.group() == Main_Searchterm:
                sheet.cell(row=rowNum, column=6).value = Main_Searchterm  # This controls which column to add the new search term to
        except:
            pass
    # This looks for the sub search term and puts it into column 7
    for rowNum in range(2, sheet.max_row + 1):
        category = sheet['E' + str(rowNum)].value  # This controls which column to search from
        keywordRegex = re.compile(Sub_Searchterm)
        mo = keywordRegex.search(category)
        try:
            if mo.group() == Sub_Searchterm:
                sheet.cell(row=rowNum, column=7).value = Sub_Searchterm  # This controls which column to add the new search term to
        except:
            pass
    wb.save(Data_path)

    wb = openpyxl.load_workbook(Data_path)
    sheet = wb['Sheet1']
    filelist = []  # List of all files in directory and subdirectory
    fullpath_filelist = {}
    for root, dirs, files in os.walk(Image_folder):
        for file in files:
            # append the file name to the list
            filelist.append(file)
            fullpath_filelist = {file: os.path.join(root,dirs, file}
    for filename in filelist:
        for rowNum in range(2, sheet.max_row + 1):
            image = sheet['H' + str(rowNum)].value  # This controls which column to search from
            final_path = os.path.join(root, Main_Searchterm, Sub_Searchterm, filename)
            if str(image) == str(filename):
                shutil.move(filename, final_path)

find_category()
Depending on the IDE, Ctrl-F for '\t' and replace with '    ' (4 spaces).
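Beyond the tabs, the line the traceback points at also has an unbalanced brace, and dirs (a list) can't be passed to os.path.join. A sketch of one plausible fix, assuming the goal is to map each file name to its full path:

import os

Image_folder = r'C:\Printing\Python\data\images'  # from the question
filelist = []
fullpath_filelist = {}
for root, dirs, files in os.walk(Image_folder):
    for file in files:
        filelist.append(file)
        # root is the directory that actually contains this file
        fullpath_filelist[file] = os.path.join(root, file)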

Get all PDF files name under same folder and save in excel according to PDF file name

I have PDF files in the same folder. How do I get all the PDF file names and save an Excel file named after each PDF file?
This is what I have tried:
def get_files(pdf_path):
    import os
    os.chdir(pdf_path)
    files = os.listdir()
    files = [x for x in files if x.endswith(".pdf")]
    return files

files = get_files(pdf_path)
for i in files:
    save_as_excel(pdf_path, i)
As discussed in chat, this is the continuation of your previous question, which I answered. In the previous question I explained how you can extract text from a pdf file that contains multiple data entities. Now you want to extract the text, parse the content, and save the data as csv/xlsx for all pdf files present in the folder.
Please go through all the steps below; all you need to change is the path to your pdf files, path_of_pdf_files.
The assumptions and logic remain the same as in my previous answer.
I have moved the data and methods into a class, PdfExtractor.
Please follow the steps below to extract text from the pdfs and save it as xlsx.
Before moving ahead, install the packages pdfplumber and xlsxwriter.
Save the code below with the filename PdfExtractor.py.
import pdfplumber
import xlsxwriter
import re

# regex patterns for keys in line 1 of a data entity
my_regex_dict_line1 = {
    'Our Ref' : r'Our Ref :(.*?)Name',
    'Name' : r'Name:(.*?)Ref 1',
    'Ref 1' : r'Ref 1 :(.*?)Ref 2',
    'Ref 2' : r'Ref 2:(.*?)$'
}

# regex patterns for keys in line 2 of a data entity
my_regex_dict_line2 = {
    'Amount' : r'Amount:(.*?)Total Paid',
    'Total Paid' : r'Total Paid:(.*?)Balance',
    'Balance' : r'Balance:(.*?)Date of A/C',
    'Date of A/C' : r'Date of A/C:(.*?)Date Received',
    'Date Received' : r'Date Received:(.*?)$'
}

# regex patterns for keys in line 3 of a data entity
my_regex_dict_line3 = {
    'Last Paid' : r'Last Paid:(.*?)Amt Last Paid',
    'Amt Last Paid' : r'Amt Last Paid:(.*?)A/C\s+Status',
    'A/C Status': r'A/C\s+Status:(.*?)Collector',
    'Collector' : r'Collector :(.*?)$'
}

class PdfExtractor:
    data_entity_sep_pattern = r'(?=Our Ref.*?Name.*?Ref 1.*?Ref 2)'

    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        self.json_data = {}
        self.pdf_text = ''

    def __preprocess_data(self, data):
        return [el.strip() for el in data.splitlines() if el.strip()]

    def __get_header_data(self, text):
        header_data_list = self.__preprocess_data(text)
        # third line in the header text contains the Date Created field
        self.json_data['Date Created'] = re.search(r'Date Created:(.*?)$', header_data_list[2]).group(1).strip()
        # fourth line contains Number of Pages
        self.json_data['Number of Pages'] = re.search(r'Number of Pages:(.*?)$', header_data_list[3]).group(1).strip()
        # fifth line contains Client Code and ClientName
        self.json_data['Client Code'] = re.search(r'Client Code - (.*?)Client Name', header_data_list[4]).group(1).strip()
        self.json_data['ClientName'] = re.search(r'Client Name - (.*?)$', header_data_list[4]).group(1).strip()

    def __iterate_through_regex_and_populate_dictionaries(self, data_dict, regex_dict, text):
        ''' For the given regex_dict, iterate through each regex pattern and add the matched key value to data_dict '''
        for key, regex in regex_dict.items():
            matched_value = re.search(regex, text)
            if matched_value is not None:
                data_dict[key] = matched_value.group(1).strip()

    def __populate_date_notes(self, data_dict, text):
        ''' Populate Date and Notes from the data chunk, as lists, into data_dict '''
        data_dict['Date'] = []
        data_dict['Notes'] = []
        iter = 4
        while(iter < len(text)):
            date_match = re.search(r'(\d{2}/\d{2}/\d{4})', text[iter])
            data_dict['Date'].append(date_match.group(1).strip())
            notes_match = re.search(r'\d{2}/\d{2}/\d{4}\s*(.*?)$', text[iter])
            data_dict['Notes'].append(notes_match.group(1).strip())
            iter += 1

    def get_pdf_text(self):
        data_index = 1
        with pdfplumber.open(self.pdf_path) as pdf:
            index = 0
            while(index < len(pdf.pages)):
                page = pdf.pages[index]
                self.pdf_text += '\n' + page.extract_text()
                index += 1
        split_on_data_entity = re.split(self.data_entity_sep_pattern, self.pdf_text.strip())
        # the first entry of split_on_data_entity contains the header information
        self.__get_header_data(split_on_data_entity[0])
        while(data_index < len(split_on_data_entity)):
            data_entity = {}
            data_processed = self.__preprocess_data(split_on_data_entity[data_index])
            self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line1, data_processed[0])
            self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line2, data_processed[1])
            self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line3, data_processed[2])
            if(len(data_processed) > 3 and data_processed[3] != None and 'Date' in data_processed[3] and 'Notes' in data_processed[3]):
                self.__populate_date_notes(data_entity, data_processed)
            self.json_data['data_entity' + str(data_index)] = data_entity
            data_index += 1
        return self.json_data

    def save_as_xlsx(self, file_name):
        if(not self.json_data):
            print("Data was not read from PDF")
            return
        workbook = xlsxwriter.Workbook(file_name)
        worksheet = workbook.add_worksheet("Sheet 1")
        row = 0
        col = 0
        # write the column names
        columns = ['Account History Report', 'All Notes'] + [key for key in self.json_data.keys() if 'data_entity' not in key] + list(self.json_data['data_entity1'].keys())
        worksheet.write_row(row, col, tuple(columns))
        row += 1
        column_index_map = {}
        for index, col in enumerate(columns):
            column_index_map[col] = index
        # write the header
        worksheet.write(row, column_index_map['Date Created'], self.json_data['Date Created'])
        worksheet.write(row, column_index_map['Number of Pages'], self.json_data['Number of Pages'])
        worksheet.write(row, column_index_map['Client Code'], self.json_data['Client Code'])
        worksheet.write(row, column_index_map['ClientName'], self.json_data['ClientName'])
        data_entity_index = 1
        # iterate through each data entity and, for each key, insert the values in the sheet
        while True:
            data_entity_key = 'data_entity' + str(data_entity_index)
            row_size = 1
            if(self.json_data.get(data_entity_key) != None):
                for key, value in self.json_data.get(data_entity_key).items():
                    if(type(value) == list):
                        worksheet.write_column(row, column_index_map[key], tuple(value))
                        row_size = len(value)
                    else:
                        worksheet.write(row, column_index_map[key], value)
            else:
                break
            data_entity_index += 1
            row += row_size
        workbook.close()
        print(file_name + " saved successfully")
Execute the code below; it reads all the pdf files inside the folder path_of_pdf_files and saves the data in an xlsx file in the same directory. Note that the code below should be executed in the same folder where you saved PdfExtractor.py.
import os
from PdfExtractor import PdfExtractor

path_of_pdf_files = r'C:\Users\hpoddar\Desktop\Temp'  # Directory path for your pdf files
files = os.listdir(path_of_pdf_files)
for file in files:
    if(not file.endswith(".pdf")):
        continue
    filename = os.path.splitext(file)[0]
    pdf_obj = PdfExtractor(os.path.join(path_of_pdf_files, file))
    pdf_text = pdf_obj.get_pdf_text()
    pdf_obj.save_as_xlsx(os.path.join(path_of_pdf_files, filename + '.xlsx'))
Output :
C:\Users\hpoddar\Desktop\Temp\sample.xlsx saved successfully
C:\Users\hpoddar\Desktop\Temp\sample2.xlsx saved successfully
C:\Users\hpoddar\Desktop\Temp\sample3.xlsx saved successfully
Let's say you have the following pdf files in the directory: sample.pdf, sample2.pdf, sample3.pdf. The xlsx files will be created in the same folder with the filenames sample.xlsx, sample2.xlsx, sample3.xlsx.
Let me know if you have any doubts in the above code.
If you mean saving each filename as an empty excel file, try this:

import os
import openpyxl

pdf_path = '.'

def get_files(pdf_path):
    os.chdir(pdf_path)
    files = os.listdir()
    files = [x for x in files if x.endswith(".pdf")]
    return files

files = get_files(pdf_path)
# create an empty workbook (excel file)
wb = openpyxl.workbook.Workbook()
for i in files:
    output_path = os.path.join(pdf_path, i).replace('.pdf', '.xlsx')
    # save as an excel file with that filename
    wb.save(output_path)
    print(output_path)

Excel sniffing, regex, and substring matching

I have this code that searches for a "phrase" in a given "column" within all of the spreadsheets in a directory, and then outputs the matching date, time, and position into an "output.csv" (the position is on the same line, but the date and time are in a row 0-7 rows above the 'phrase' row). I need it to be able to find the "phrase" within a cell, but right now it only works for exact matches. If a cell in column 20 contained "phrase one", the example below wouldn't write that row to the output file.
import os
import xlrd
from xlrd import open_workbook
import datetime
from datetime import time
import csv

# edit these params
outputfile = 'output.csv'
phrase = 'phrase'
column = 20
rootdir = '.'

def writeToCSV(datalist, outputfile):
    with open(outputfile, 'w') as f:
        for sublist in datalist:
            for item in sublist:
                f.write(item + ',')
            f.write('\n')

def getdata(filename, row):
    wb = open_workbook(filename)
    items = []
    for sheet in wb.sheets():
        number_of_rows = sheet.nrows
        number_of_columns = sheet.ncols
        rows = []
        for row1 in range(row, row - 10, -1):
            if row1 >= 0 and row1 < number_of_rows:
                rowNo = sheet.cell(row1, 2).value
                try:
                    if rowNo != '' and int(rowNo):
                        datetime1 = datetime.datetime(*xlrd.xldate_as_tuple(sheet.cell_value(rowx=row1, colx=0), wb.datemode))
                        date_values = xlrd.xldate_as_tuple(sheet.cell_value(rowx=row1, colx=1), wb.datemode)
                        time_value = time(*date_values[3:])
                        items.append(str(rowNo))
                        items.append(str(datetime1))
                        items.append(str(time_value))
                        break
                except Exception as e:
                    pass
    return items

def extractData(filename, searchString, column):
    wb = open_workbook(filename)
    dataList = []
    for sheet in wb.sheets():
        number_of_rows = sheet.nrows
        number_of_columns = sheet.ncols
        items = []
        rows = []
        for row in range(1, number_of_rows):
            rowdata = []
            for col in range(number_of_columns):
                value = sheet.cell(row, col).value
                if value == searchString:
                    if col == column:
                        data = getdata(filename, row)
                        dataList.append(data)
    return dataList

def main():
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            fullname = os.path.join(subdir, file)
            list = subdir.split('\\')
            date = ''
            if len(list) > 2:
                date = list[1].split('-')[1] + '-' + list[2]
            if date != '':
                namelist = file.split('-')
                if len(namelist) > 2:
                    if (namelist[0] in date) and (namelist[1] in date):
                        data = extractData(fullname, phrase, column)
                        if len(data) > 0:
                            writeToCSV(data, outputfile)

if __name__ == '__main__':
    main()  # call main method
I understand that regex can easily find substrings within a string, but I don't see exactly where to make the modification in the code. In a different language, or if the code were written differently, I would add an if statement that writes the data to the output file when the string contains "phrase", but I can't determine where the code checks that the phrase matches the cell value. Any insight on this is appreciated.
In the function extractData you make the comparison if value == searchString:. That is where you check whether the string value (from your Excel file) is the same as searchString (your "phrase").
You can replace that with Python's in operator; the line should look like if searchString in value:. You do not need regex if you are only looking for substrings.
There are a few things that seem to be causing issues, but the main one might be that your extractData function finds your search string in a row of a specific sheet, while your getdata function then uses that same row number in all of the sheets in the workbook, without validating that every sheet even goes up to that row. It would be better to pass along which sheet you found the search string in, and have getdata search just that sheet.
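A sketch combining both suggestions, assuming the xlrd-based structure from the question (substring match, and getdata restricted to the sheet where the match occurred):

import datetime
from datetime import time
import xlrd
from xlrd import open_workbook

def getdata(wb, sheet, row):
    # walk up to 10 rows above the match, within the matching sheet only
    items = []
    for row1 in range(row, row - 10, -1):
        if 0 <= row1 < sheet.nrows:
            rowNo = sheet.cell(row1, 2).value
            try:
                if rowNo != '' and int(rowNo):
                    datetime1 = datetime.datetime(*xlrd.xldate_as_tuple(sheet.cell_value(rowx=row1, colx=0), wb.datemode))
                    date_values = xlrd.xldate_as_tuple(sheet.cell_value(rowx=row1, colx=1), wb.datemode)
                    items += [str(rowNo), str(datetime1), str(time(*date_values[3:]))]
                    break
            except Exception:
                pass
    return items

def extractData(filename, searchString, column):
    wb = open_workbook(filename)
    dataList = []
    for sheet in wb.sheets():
        for row in range(1, sheet.nrows):
            value = sheet.cell(row, column).value
            # substring match instead of exact equality
            if searchString in str(value):
                dataList.append(getdata(wb, sheet, row))
    return dataList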

Python -- How to split headers/chapters into separate files automatically

I'm converting text directly to epub, and I'm having a problem automatically splitting the HTML book file into separate header/chapter files. At the moment the code below partially works, but only creates every other chapter file, so half the header/chapter files are missing from the output. Here is the code:
def splitHeaderstoFiles(fpath):
    infp = open(fpath, 'rt', encoding=('utf-8'))
    for line in infp:
        # format and split headers to files
        if '<h1' in line:
            #-----------format header file names and other stuff ------------#
            # create a new file for the header/chapter section
            path = os.getcwd() + os.sep + header
            with open(path, 'wt', encoding=('utf-8')) as outfp:
                # write html top meta headers
                outfp = addMetaHeaders(outfp)
                # add the header
                outfp = outfp.write(line)
                # add the chapter/header bodytext
                for line in infp:
                    if '<h1' not in line:
                        outfp.write(line)
                    else:
                        outfp.write('</body>\n</html>')
                        break
        else:
            continue
    infp.close()
The problem occurs in the second for loop, at the bottom of the code, when I look for the next h1 tag to stop the split. I cannot use seek() or tell() to rewind or move back one line so the program can find the next header/chapter on the next iteration; apparently you cannot use these inside a for loop that holds an active iterator over the file. It just gives a 'can't do nonzero cur-relative seeks' error.
I've also tried a while line != '' plus readline() combination, which gives the same error as above.
Does anyone know an easy way to split HTML headers/chapters of varying lengths into separate files in Python? Are there any special Python modules (such as pickle) that could make this task easier?
I'm using Python 3.4.
My grateful thanks in advance for any solutions to this problem.
I ran into a similar problem a while ago; here is a simplified solution:

from itertools import count

chapter_number = count(1)

output_file = open('000-intro.html', 'w')
with open('index.html', 'rt') as input_file:
    for line in input_file:
        if '<h1' in line:
            output_file.close()
            output_file = open('{:03}-chapter.html'.format(next(chapter_number)), 'w')
        output_file.write(line)
output_file.close()

In this approach, the block of text leading up to the first h1 tag is written into 000-intro.html, the first chapter is written into 001-chapter.html, and so on. Please modify it to taste.
The solution is a simple one: upon encountering the h1 tag, close the last output file and open a new one.
You are looping over your input file twice, which is likely causing your problems:

for line in infp:
    ...
    with open(path, 'wt', encoding=('utf-8')) as outfp:
        ...
        for line in infp:
            ...

Both for loops advance the same file iterator, so the inner loop consumes lines (including the next <h1>) that the outer loop then never sees.
You might try transforming your for loops into while loops so you're not relying on two nested for iterations:

while True:
    line = infp.readline()
    if not line:
        break
    if '<h1' in line:
        with open(...) as outfp:
            while True:
                line = infp.readline()
                if not line or '<h1' in line:
                    break
                outfp.write(line)
Alternatively, you may wish to use an HTML parser (e.g., BeautifulSoup). Then you can do something like what is described here: https://stackoverflow.com/a/8735688/65295.
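For the parser route, a minimal sketch with BeautifulSoup, under the assumption that bs4 is installed and each chapter's <h1> and body are siblings at the same level of the document:

from bs4 import BeautifulSoup

with open('index.html', 'rt', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

for number, h1 in enumerate(soup.find_all('h1'), 1):
    parts = [str(h1)]
    # collect everything up to (but not including) the next <h1>
    for sibling in h1.find_next_siblings():
        if sibling.name == 'h1':
            break
        parts.append(str(sibling))
    with open('{:03}-chapter.html'.format(number), 'wt', encoding='utf-8') as out:
        out.write('<html>\n<body>\n' + '\n'.join(parts) + '\n</body>\n</html>')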
Update from comment: essentially, read the entire file at once so you can freely move back or forward as necessary. This probably won't be a performance issue unless you have a really, really big file (or very little memory).
lines = infp.readlines()  # read the entire file
i = 0
while i < len(lines):
    if '<h1' in lines[i]:
        with open(...) as outfp:
            j = i + 1
            while j < len(lines):
                if '<h1' in lines[j]:
                    break
                outfp.write(lines[j])
                j += 1
        # line j has an <h1>, so set i to j and detect it at the
        # top of the next loop iteration.
        i = j
    else:
        i += 1
I eventually found the answer to the above problem. The code below does a lot more than just get the file header: it simultaneously loads two parallel lists with formatted file names (with extension) and pure header names respectively, so I can use those lists to fill in the title and formatted filename extension in these html files within a single while loop. The code now works well and is shown below.
def splitHeaderstoFiles(dir, inpath):
    count = 1
    t_count = 0
    out_path = ''
    header = ''
    write_bodytext = False
    file_path_names = []
    pure_header_names = []
    inpath = dir + os.sep + inpath
    with open(inpath, 'rt', encoding=('utf-8')) as infp:
        for line in infp:
            if '<h1' in line:
                # strip html tags, convert to start caps
                p = re.compile(r'<.*?>')
                header = p.sub('', line)
                header = capwords(header)
                line_save = header
                # Add 0 for count below 10
                if count < 10:
                    header = '0' + str(count) + '_' + header
                else:
                    header = str(count) + '_' + header
                # remove all spaces + add extension in header
                header = header.replace(' ', '_')
                header = header + '.xhtml'
                count = count + 1
                # create two parallel lists used later
                out_path = dir + os.sep + header
                outfp = open(out_path, 'wt', encoding=('utf-8'))
                file_path_names.insert(t_count, out_path)
                pure_header_names.insert(t_count, line_save)
                t_count = t_count + 1
                # Add html meta headers and write it
                outfp = addMainHeaders(outfp)
                outfp.write(line)
                write_bodytext = True
            # add header bodytext
            elif write_bodytext == True:
                outfp.write(line)
    # now add html titles and close the html tails on all files
    max_num_files = len(file_path_names)
    tmp = dir + os.sep + 'temp1.tmp'
    i = 0
    while i < max_num_files:
        outfp = open(tmp, 'wt', encoding=('utf-8'))
        infp = open(file_path_names[i], 'rt', encoding=('utf-8'))
        for line in infp:
            if '<title>' in line:
                line = line.strip(' ')
                line = line.replace('<title></title>', '<title>' + pure_header_names[i] + '</title>')
                outfp.write(line)
            else:
                outfp.write(line)
        # add the html tail
        if '</body>' in line or '</html>' in line:
            pass
        else:
            outfp.write(' </body>' + '\n</html>')
        # clean up
        infp.close()
        outfp.close()
        shutil.copy2(tmp, file_path_names[i])
        os.remove(tmp)
        i = i + 1
    # now rename just the title page
    if os.path.isfile(file_path_names[0]):
        title_page_name = file_path_names[0]
        new_title_page_name = dir + os.sep + '01_Title.xhtml'
        os.rename(title_page_name, new_title_page_name)
        file_path_names[0] = '01_Title.xhtml'
    else:
        logmsg27(DEBUG_FLAG)
        os._exit(0)
    # xhtml file is no longer needed
    if os.path.isfile(inpath):
        os.remove(inpath)
    # returned list values are also used
    # later to create epub opf and ncx files
    return (file_path_names, pure_header_names)
@Hai Vu and @Seth -- thanks for all your help.

Trying to iterate through CSV files in a directory and grab a particular cell then post results to a single csv

I am trying to iterate through several CSV files in a directory, grab a particular cell (the same cell location) from each CSV file (cell location found when opened in Excel), and then post all of the collected cells in a single CSV or xls file, one after the other.
I have written the code below (with some researched help), but I am just iterating over the first csv file in my list and printing the same value each time, once per CSV file in my list. Could anybody point me in the right direction?
Here's my poor attempt!
import xlwt
import xlrd
import csv
import glob
import os

files = ['1_IQ_QTA.csv', '2_IQ_QTA.csv', '3_IQ_QTA.csv', '4_IQ_QTA.csv']
n = 0
row = 0
filename = ('outputList.csv', 'a')
fname = files[n]
workbookr = xlrd.open_workbook(fname)
sheetr = workbookr.sheet_by_index(0)
workbookw = xlwt.Workbook()
sheetw = workbookw.add_sheet('test')
while n < len(files):
    fname = files[n]
    workbookr = xlrd.open_workbook(fname[n])
    data = [sheetr.cell_value(12, 1) for col in range(sheetr.ncols)]
    for index, value in enumerate(data):
        sheetw.write(row, index, value)
    workbookw.save('outputList.csv')
    row = row + 1
    n = n + 1
workbookw.save('outputList.csv')
My code is still a bit messy; I may have leftover code from my various attempts!
Thanks
MikG
Assuming you are just trying to make a CSV file containing the same cell from each file: if you had 4 files, your output file will have 4 entries.
files = ['1_IQ_QTA.csv', '2_IQ_QTA.csv', '3_IQ_QTA.csv', '4_IQ_QTA.csv']
n = 0
row = 0
outputfile = open('outputList.csv', 'w')
cellrow = 12  # collect the cell (12, 1) from each file and put it in the output list
cellcolumn = 1
while n < len(files):
    fname = files[n]
    currentfile = open(fname, 'r')
    for i in range(cellrow):
        currentrow = currentfile.readline()
    # print currentrow  # for testing
    columncnt = 0
    currentcell = ''
    openquote = False
    for char in currentrow:
        if char == '"' and not openquote:
            openquote = True
        elif char == '"' and openquote:
            openquote = False
        elif char == ',' and not openquote:
            columncnt += 1
            if columncnt == cellcolumn:
                cellvalue = currentcell
                # print cellvalue  # for testing
            currentcell = ''
        else:
            currentcell += char
    outputfile.write(cellvalue + ',')
    currentfile.close()
    n += 1
outputfile.close()
It seemed to me that since you already had CSV files, it would be easier to deal with them as regular files and parse through to find the right information; plus, there's nothing to import. Happy coding!
I think you have an error at this line in the while loop:

workbookr = xlrd.open_workbook(fname[n])

It must be:

workbookr = xlrd.open_workbook(fname)

Otherwise your workbookr remains as you set it before the loop:

fname = files[n]
workbookr = xlrd.open_workbook(fname)

which is the first file in your list.
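A sketch of the loop with that fix applied, keeping the question's xlrd/xlwt approach (note it also re-reads sheetr from each newly opened workbook, an addition beyond the fix above, since the cell values come from sheetr, not workbookr; the output is saved as .xls because xlwt writes Excel files, not CSV):

import xlrd
import xlwt

files = ['1_IQ_QTA.csv', '2_IQ_QTA.csv', '3_IQ_QTA.csv', '4_IQ_QTA.csv']
workbookw = xlwt.Workbook()
sheetw = workbookw.add_sheet('test')
for row, fname in enumerate(files):
    workbookr = xlrd.open_workbook(fname)   # open this iteration's file
    sheetr = workbookr.sheet_by_index(0)    # and re-read its first sheet
    sheetw.write(row, 0, sheetr.cell_value(12, 1))  # cell (12, 1), one per row
workbookw.save('outputList.xls')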
Since they are just csv files, there is no need for the excel libraries.
#!/usr/bin/env python
import argparse, csv

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='merge csv files on field')
    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
    parser.add_argument('infile', nargs='+', type=str, help='list of input files')
    parser.add_argument('--col', type=int, default=0, help='Column to grab')
    parser.add_argument('--row', type=int, default=0, help='Row to grab')
    parser.add_argument('--out', type=str, default='temp.csv', help='name of output file')
    args = parser.parse_args()

    data = []
    for fname in args.infile:
        with open(fname, 'r', newline='') as df:
            reader = csv.reader(df)
            for index, line in enumerate(reader):
                if index == args.row:
                    data.append([line[args.col]])  # one grabbed cell per output row
                    break

    with open(args.out, 'w', newline='') as out:
        writer = csv.writer(out, dialect='excel')
        writer.writerows(data)
