I found the following code on Stack Overflow (but forgot where, sorry):
#!/usr/local/bin/python

import string
import sys
import getopt
import re
import os
import os.path
import csv
from pyExcelerator import *

def usage():
    """ Display the usage """
    print "Usage: " + sys.argv[0] + " [OPTIONS] csvfile"
    print "OPTIONS:"
    print "--title|-t : If set, the first line is the title line"
    print "--lines|-l n : Split output into files of n lines or less each"
    print "--sep|-s c [def:,] : The character to use for field delimiter"
    print "--output|-o : output file name/pattern"
    print "--help|-h : print this information"
    sys.exit(2)

def openExcelSheet(outputFileName):
    """ Opens a reference to an Excel WorkBook and Worksheet objects """
    workbook = Workbook()
    worksheet = workbook.add_sheet("Sheet 1")
    return workbook, worksheet

def writeExcelHeader(worksheet, titleCols):
    """ Write the header line into the worksheet """
    cno = 0
    for titleCol in titleCols:
        worksheet.write(0, cno, titleCol)
        cno = cno + 1

def writeExcelRow(worksheet, lno, columns):
    """ Write a non-header row into the worksheet """
    cno = 0
    for column in columns:
        worksheet.write(lno, cno, column)
        cno = cno + 1

def closeExcelSheet(workbook, outputFileName):
    """ Saves the in-memory WorkBook object into the specified file """
    workbook.save(outputFileName)

def getDefaultOutputFileName(inputFileName):
    """ Returns the name of the default output file based on the value
    of the input file. The default output file is always created in
    the current working directory. This can be overridden using the
    -o or --output option to explicitly specify an output file """
    baseName = os.path.basename(inputFileName)
    rootName = os.path.splitext(baseName)[0]
    return string.join([rootName, "xls"], '.')

def renameOutputFile(outputFileName, fno):
    """ Renames the output file name by appending the current file number
    to it """
    dirName, baseName = os.path.split(outputFileName)
    rootName, extName = os.path.splitext(baseName)
    backupFileBaseName = string.join([string.join([rootName, str(fno)], '-'), extName], '')
    backupFileName = os.path.join(dirName, backupFileBaseName)
    try:
        os.rename(outputFileName, backupFileName)
    except OSError:
        print "Error renaming output file:", outputFileName, "to", backupFileName, "...aborting"
        sys.exit(-1)

def validateOpts(opts):
    """ Returns option values specified, or the default if none """
    titlePresent = False
    linesPerFile = -1
    outputFileName = ""
    sepChar = ","
    for option, argval in opts:
        if (option in ("-t", "--title")):
            titlePresent = True
        if (option in ("-l", "--lines")):
            linesPerFile = int(argval)
        if (option in ("-s", "--sep")):
            sepChar = argval
        if (option in ("-o", "--output")):
            outputFileName = argval
        if (option in ("-h", "--help")):
            usage()
    return titlePresent, linesPerFile, sepChar, outputFileName

def main():
    """ This is how we are called """
    try:
        opts, args = getopt.getopt(sys.argv[1:], "tl:s:o:h", ["title", "lines=", "sep=", "output=", "help"])
    except getopt.GetoptError:
        usage()
    if (len(args) != 1):
        usage()
    inputFileName = args[0]
    try:
        inputFile = open(inputFileName, 'r')
    except IOError:
        print "File not found:", inputFileName, "...aborting"
        sys.exit(-1)
    titlePresent, linesPerFile, sepChar, outputFileName = validateOpts(opts)
    if (outputFileName == ""):
        outputFileName = getDefaultOutputFileName(inputFileName)
    workbook, worksheet = openExcelSheet(outputFileName)
    fno = 0
    lno = 0
    titleCols = []
    reader = csv.reader(inputFile, delimiter=sepChar)
    for line in reader:
        if (lno == 0 and titlePresent):
            if (len(titleCols) == 0):
                titleCols = line
            writeExcelHeader(worksheet, titleCols)
        else:
            writeExcelRow(worksheet, lno, line)
        lno = lno + 1
        if (linesPerFile != -1 and lno >= linesPerFile):
            closeExcelSheet(workbook, outputFileName)
            renameOutputFile(outputFileName, fno)
            fno = fno + 1
            lno = 0
            workbook, worksheet = openExcelSheet(outputFileName)
    inputFile.close()
    closeExcelSheet(workbook, outputFileName)
    if (fno > 0):
        renameOutputFile(outputFileName, fno)

if __name__ == "__main__":
    main()
My problem is, when using it to convert a UTF-8 file (containing e.g. 'LATIN SMALL LETTER O WITH ACUTE' (U+00F3), "ó"), it produces the byte sequence 0x43 0x04, which is rendered by both OpenOffice and MS Excel as a "y" (the 0x43, while just dropping the 0x04).
Does anyone know what I or pyExcelerator are doing wrong?
You should manually encode/decode the data from the UTF-8 file:
reader = csv.reader(inputFile, delimiter=sepChar)
for line in reader:
    if (lno == 0 and titlePresent):
        if (len(titleCols) == 0):
            titleCols = line
        writeExcelHeader(worksheet, titleCols)
    else:
        # writeExcelRow(worksheet, lno, line)
        # the source file's encoding has to be passed to unicode()
        writeExcelRow(worksheet, lno, [unicode(cell, 'utf-8') for cell in line])

def writeExcelRow(worksheet, lno, columns):
    """ Write a non-header row into the worksheet """
    cno = 0
    for column in columns:
        # worksheet.write(lno, cno, column)
        worksheet.write(lno, cno, column.encode('utf-8'))
        cno = cno + 1
Check examples here (unicode_csv_reader, utf_8_encoder): http://docs.python.org/2/library/csv.html#examples
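For reference, the recipe those docs describe looks roughly like this (a sketch of the Python 2 pattern: csv.reader cannot consume unicode directly, so the input is re-encoded to UTF-8 and each cell is decoded back afterwards):

import csv

def utf_8_encoder(unicode_csv_data):
    # feed the reader UTF-8 bytes, line by line
    for line in unicode_csv_data:
        yield line.encode('utf-8')

def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
    # csv.py doesn't do Unicode; encode temporarily as UTF-8
    csv_reader = csv.reader(utf_8_encoder(unicode_csv_data),
                            dialect=dialect, **kwargs)
    for row in csv_reader:
        # decode UTF-8 back to Unicode, cell by cell
        yield [unicode(cell, 'utf-8') for cell in row]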
Related
I'm taking CS50 and got stuck on pset6.
I made this code and it's working fine for the given 'small' database.
On the 'large' one I get wrong values in my DNA sequence.
Like, using debug50 I saw that the Albus sequence should be 15, 49, 38... and my seq is 21, 55, 64...
What's wrong? And why does it work fine on the small database and not on the large one?
Thanks for the help!
# Import ARGV and CSV library
from sys import argv, exit
import pandas as pd
import csv

# Check if argv has 3 arguments (program name, csv file and dna sequence)
while True:
    if len(argv) != 3:
        print("Usage: python dna.py data.csv sequence.txt")
        exit(1)
    else:
        break

with open(argv[2], 'r', encoding="UTF-8") as txt:
    dna_seq = txt.read()

# Find the number of STR - AGATC,TTTTTTCT,AATG,TCTAG,GATA,TATC,GAAA,TCTG
AGATC = dna_seq.count("AGATC")
TTTTTTCT = dna_seq.count("TTTTTTCT")
AATG = dna_seq.count("AATG")
TCTAG = dna_seq.count("TCTAG")
GATA = dna_seq.count("GATA")
TATC = dna_seq.count("TATC")
GAAA = dna_seq.count("GAAA")
TCTG = dna_seq.count("TCTG")

name = 0
if argv[1] == "databases/small.csv":
    with open(argv[1], 'r') as csv_file:
        reader = csv.DictReader(csv_file)
        for row in reader:
            dna_db = row['name'], row['AGATC'], row['AATG'], row['TATC']
            dna_db = list(dna_db)
            seq = [AGATC, AATG, TATC]
            seq = [str(x) for x in seq]
            if dna_db[1:4] == seq:
                name = dna_db[:1]
                break
            else:
                name = "No match"
elif argv[1] == "databases/large.csv":
    with open(argv[1], 'r') as csv_file:
        reader = csv.DictReader(csv_file)
        for row in reader:
            dna_db = (row['name'], row['AGATC'], row['TTTTTTCT'], row['AATG'], row['TCTAG'],
                      row['GATA'], row['TATC'], row['GAAA'], row['TCTG'])
            dna_db = list(dna_db)
            seq = [AGATC, TTTTTTCT, AATG, TCTAG, GATA, TATC, GAAA, TCTG]
            seq = [str(x) for x in seq]
            if dna_db[1:9] == seq:
                name = dna_db[:1]
                break
            else:
                name = "No match"
print(name)
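One likely culprit, assuming this is CS50's DNA pset: str.count tallies every occurrence of an STR anywhere in the sequence, while the problem asks for the longest run of consecutive repeats, so the two can agree on some small inputs and diverge on the large database. A sketch of counting the longest consecutive run (the helper name longest_run is mine):

def longest_run(sequence, str_pattern):
    """Length of the longest run of back-to-back repeats of str_pattern."""
    run = best = 0
    i = 0
    while i < len(sequence):
        if sequence.startswith(str_pattern, i):
            run += 1
            i += len(str_pattern)  # stay inside the current run
        else:
            best = max(best, run)
            run = 0
            i += 1
    return max(best, run)

# e.g. AGATC = longest_run(dna_seq, "AGATC") instead of dna_seq.count("AGATC")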
I have a table that contains a few categories, and two of them are: MAC address and device name. I had the list of my MAC addresses hardcoded in my code with their corresponding device names (i.e. deviceDict['00:00:00:00:00:00'] = name).
Now I moved those MAC addresses and device names to a text file to be read by that same Python code and parsed into my table. The code currently recognizes the text file but it is not parsing that information into the table.
Here is the code:
# File: WapLogParser.py
# Desc: Parses a WAP log file and pulls out information relating to connected clients
# Usage: python WapLogParser.py [file glob]

import re
import sys
import glob
import os

deviceDict = dict()

# Base table for storing client info
# All names must match what is in the Wap Log file
# Exceptions: Date, Wap Name, Device Name - which are provided outside of the result parsing
table = [["Ssid", "Vlan", "Mac Address", "Connected Time", "Ip Address", "Rssi", "Date", "Wap Name", "Device Name"]]

def ParseResult(result, date, wapName):
    lines = result.split('\n')
    lines = list(filter(None, lines))
    # Any useful info will be at least 2 lines long
    if len(lines) == 1:
        return
    # create empty row
    data = [""] * len(table[0])
    # for each item in the result place it in the correct spot in the row
    for line in lines:
        if line != "":
            # Parse the key/value pair
            m = re.match(r"(.*):\s\.*\s?(.*)", line)
            if m is not None:
                for idx in range(len(table[0])):
                    if table[0][idx].lower() == m[1].lower():
                        data[idx] = m[2]
            else:
                break
    # Remove the '(dBm)' from the RSSI value
    data[5] = data[5].split()[0]
    # Append WAP specific items to row
    data[6] = date
    data[7] = wapName
    data[8] = GetDeviceName(data[2].upper())
    # Add row to table
    table.append(data)

def ParseFile(path):
    with open(path) as f:
        lines = f.readlines()
    result = ""
    command = ""
    date = ""
    # WAP name is always on the first line 16 characters in with 4
    # unnecessary characters trailing
    wapName = lines[0].strip()[16:-4]
    for line in lines:
        line = line.strip()
        # Is an issued command?
        if line.startswith("/#"):
            if command != "":
                ParseResult(result, date, wapName)
                command = ""
            # reset the result for the new command
            result = ""
            m = re.match(r"^/#.*show\sclient.*stats$", line)
            if m is not None:
                command = line
        # Anything that is not a command add to the result
        else:
            result += line + "\n"
            # Do we have the date?
            if line.startswith("Current date:"):
                date = line.replace("Current date: ", "")

# Print output to stderr
def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)

# Print a 2d array in a csv format
def PrintAsCsv(table):
    for row in table:
        print(",".join(row))

def Main():
    InitDeviceDict()
    numArgs = len(sys.argv)
    for filename in glob.iglob(sys.argv[numArgs - 1], recursive=True):
        # Globs get directories too
        if os.path.isfile(filename):
            eprint("Parsing " + filename)
            try:
                ParseFile(filename)
            except Exception as e:  # Mainly for if we see a binary file
                eprint("Bad file: " + str(e))
    # Print in a format we can use
    PrintAsCsv(table)

def GetDeviceName(macAddress):
    if macAddress in deviceDict:
        return deviceDict[macAddress]
    manufacturerPart = macAddress[:8]
    if manufacturerPart in deviceDict:
        return deviceDict[manufacturerPart]
    return 'Unknown Device'

def InitDeviceDict():
    with open('try.txt', 'r') as fo:
        for line in fo:
            deviceDict = {}
            line = line.split(',')
            macAddress = line[0].strip()
            manufacturerPart = line[1].strip()
            if macAddress in deviceDict:
                deviceDict[macAddress].append(manufacturerPart)
            else:
                deviceDict[macAddress] = (manufacturerPart)
    print(deviceDict)

# entry point
# script arguments:
#   WapLogParser.py [file glob]
if __name__ == "__main__":
    Main()
The issue is in the functions GetDeviceName and InitDeviceDict. When I run the code and then a batch file to display my info in Excel, I keep getting "Unknown Device" (as if it is not recognizing the MAC address I entered to produce the device name).
Any way I can correct this? Thank you.
The deviceDict that is populated in InitDeviceDict is not the global deviceDict; you are only modifying a function-local dictionary (and resetting it on every line as well). Remove deviceDict = {} from that function and, at the top of the function, use global deviceDict to declare that you are modifying the global:
def InitDeviceDict():
    global deviceDict
    with open('try.txt', 'r') as fo:
        for line in fo:
            line = line.split(',')
            macAddress = line[0].strip()
            manufacturerPart = line[1].strip()
            if macAddress in deviceDict:
                deviceDict[macAddress].append(manufacturerPart)
            else:
                deviceDict[macAddress] = [manufacturerPart]
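If you would rather avoid the global statement entirely, mutating the module-level dict in place works too, since Python only treats a name as local when you assign (rebind) it. A sketch of that variant:

def InitDeviceDict():
    # Mutating deviceDict in place needs no 'global'; only rebinding would
    with open('try.txt', 'r') as fo:
        for line in fo:
            parts = line.split(',')
            macAddress = parts[0].strip()
            manufacturerPart = parts[1].strip()
            deviceDict.setdefault(macAddress, []).append(manufacturerPart)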
So, toward the end of my first file, which we'll call file.py:
def get_excel_data(self):
    """Places excel data into pandas dataframe"""
    # excel_data = pandas.read_excel(self.find_file())
    for extracted_archive in self.find_file():
        excel_data = pandas.read_excel(extracted_archive)
        # print(excel_data)
        columns = pandas.DataFrame(columns=excel_data.columns.tolist())
        excel_data = pandas.concat([excel_data, columns])
        excel_data.columns = excel_data.columns.str.strip()
        excel_data.columns = excel_data.columns.str.replace("/", "_")
        excel_data.columns = excel_data.columns.str.replace(" ", "_")
        total_records = 0
        num_valid_records = 0
        num_invalid_records = 0
        for row in excel_data.itertuples():
            mrn = row.MRN
            total_records += 1
            if mrn in ("", " ", "N/A", "NaT", "NaN", None) or math.isnan(mrn):
                # print(f"Invalid record: {row}")
                num_invalid_records += 1
                # total_invalid = num_invalid_records + dup_count
                excel_data = excel_data.drop(excel_data.index[row.Index])
                # continue
            else:
                # print(mrn)  # outputs all MRN ids
                for row in excel_data.itertuples():
                    num_valid_records += 1
                    continue
        with open("./logs/metrics.csv", "a", newline="\n") as f:
            csv_writer = DictWriter(f, ['date', 'total_records', 'processed', 'skipped', 'success_rate'])
            # csv_writer.writeheader()
            currentDT = datetime.datetime.now()
            success_rate = num_valid_records / total_records * 100
            csv_writer.writerow(dict(date=currentDT,
                                     total_records=total_records,
                                     processed=num_valid_records,
                                     skipped=num_invalid_records,
                                     success_rate=num_valid_records / total_records * 100))
        return self.clean_data_frame(excel_data)

def clean_data_frame(self, data_frame):
    """Cleans up dataframes"""
    for col in data_frame.columns:
        if "date" in col.lower():
            data_frame[col] = pandas.to_datetime(data_frame[col],
                                                 errors='coerce', infer_datetime_format=True)
            data_frame[col] = data_frame[col].dt.date
    data_frame['MRN'] = data_frame['MRN'].astype(int).astype(str)
    return data_frame

def get_mapping_data(self):
    map_data = pandas.read_excel(config.MAPPING_DOC, sheet_name='main')
    columns = pandas.DataFrame(columns=map_data.columns.tolist())
    return pandas.concat([map_data, columns])
In my second file, second_file.py, I would like to keep that end state and, for instance, do another iteration:
def process_records(self, records, map_data, completed=None, errors=None):
    """Code to execute after webdriver initialization."""
    series_not_null = False
    try:
        num_attempt = 0
        for record in data_frame.itertuples():  # not working
            print(record)
            series_not_null = True
            mrn = record.MRN
            self.navigate_to_search(num_attempt)
            self.navigate_to_member(mrn)
            self.navigate_to_assessment()
            self.add_assessment(record, map_data)
            self.driver.switch_to.parent_frame()  # not working
            sleep(.5)
            error_flag = self.close_member_tab(self.driver, mrn, error_flag)
    except Exception as exc:
        if series_not_null:
            errors = self.process_series_error(exc)
    return completed, error
Both files have import pandas.
You can save your dataframe in a pickle file like this. It is also worth noting that you can store almost anything in a pickle file. Here is a link to some info: pickle info
import pandas as pd
import pickle

x = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})

# this will create a file called pickledata.p that will store the data frame
with open('pickledata.p', 'wb') as fh:  # notice that you need the 'wb' for the dump
    pickle.dump(x, fh)

# to load the file do this
with open('pickledata.p', 'rb') as fh:  # you need to use 'rb' to read
    df = pickle.load(fh)

# you can now use df like a normal dataframe
print(df)
You don't actually need the '.p' extension for a pickle file, I just like it.
So you save your dataframe at the end of script one, and then load it in at the start of script two.
Use DataFrame.to_pickle and pandas.read_pickle:
To persist:
df.to_pickle('./dataframe.pkl')
To load:
df = pd.read_pickle('./dataframe.pkl')
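A minimal sketch of how that ties the two scripts together (the loader instance and file names here are illustrative, not from the original code):

# end of file.py: persist the cleaned frame
data_frame = loader.get_excel_data()  # 'loader' is a hypothetical instance of the class above
data_frame.to_pickle('./dataframe.pkl')

# start of second_file.py: restore the exact end state
import pandas as pd

data_frame = pd.read_pickle('./dataframe.pkl')
for record in data_frame.itertuples():  # iterate over the restored frame
    print(record.MRN)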
When the code below is executed with Python 3.5 and xlwt 1.2.0, the following error is generated:
"Cannot convert byte objects to str implicitly"
The code works fine with Python 2.7.
Could anyone let me know what the problem might be?
Thanks in advance!
import xlwt
import re
import os

wb = xlwt.Workbook()
ws = wb.add_sheet('A Test Sheet')
ws_1 = wb.add_sheet('A Test Sheet_B')
cnt_row = 0
cnt_col_1 = 0
cnt_col_2 = 0

path = "E:\Python_Scripts"
files = os.listdir("E:\Python_Scripts")
for filename in files:
    if filename.endswith(".ptu"):
        fo = open(os.path.join(path, filename), 'r')
        while(1):
            str = fo.readline()
            if (str == ""):
                print("file finished")
                break
            else:
                matchObj = re.match(r'\s* TEST (.*?).*', str)
                if (matchObj):
                    str = str.split('TEST', 1)[1]
                    ws.write(cnt_row, 0, str)
                matchObj_Author = re.match(r' (.*) Author (.*?).*', str)
                if (matchObj_Author):
                    str = str.split('(', 1)[1]
                    str = str.rsplit(')', 1)
                    ws.write(cnt_row, 1, str)
                    cnt_row = cnt_row + 1
        fo.close()
wb.save('example.xls')
Your input data has changed, and one or more of its lines contains multiple strings.
If you're reading a file where a line has multiple entries, then your str will be a list, not a string. If it is a list, this will cause the error when invoking wb.save('example.xls'): TypeError: must be str, not bytes
Here's a pared down version of your program that I used to test this out:
import xlwt

wb = xlwt.Workbook()
ws = wb.add_sheet('A Test Sheet')
ws_1 = wb.add_sheet('A Test Sheet_B')
cnt_row = 0
cnt_col_1 = 0
cnt_col_2 = 0

f = open('<an xml file with one string per line except the last line which has two strings>', 'r', encoding='utf-8')
while 1:
    str = f.readline()
    wb.save('example.xls')
    if str == "":
        print("file finished")
        break
    str = str.split('<', 1)[1]
    str = str.rsplit('<', 1)
    ws.write(cnt_row, 1, str)
    cnt_row = cnt_row + 1
    print('debug:last')
    print(str)
    print(type(str))

wb.save('example.xls')
f.close()
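Building on that: str.rsplit(')', 1) always returns a list, and xlwt's write accepts a string (or number, date, etc.), not a list. A minimal sketch of the fix, with a sample line standing in for fo.readline() (the sample text is mine, not from the original .ptu file):

import xlwt

wb = xlwt.Workbook()
ws = wb.add_sheet('A Test Sheet')

line = "  TEST foo (Jane Doe) Author"  # stand-in for a line read from the .ptu file
author = line.split('(', 1)[1]
author = author.rsplit(')', 1)[0]  # index [0] yields a str; rsplit alone returns a list
ws.write(0, 1, author)  # xlwt can write a str, not a list
wb.save('example.xls')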
I have a directory with 5+ invalid CSV files. I have no problems reading the files and then writing them as "good" CSV files one at a time. But when I try to process a second file I get "IndexError: array index out of range".
import xlrd
import csv, sys, os
import datetime, time
import logging
import Gmail_email

program = "CleanCSV"
date = datetime.datetime(1899, 12, 30)
argv0 = ""
argv1 = 'c:/tmp/checkEmail/'  # input directory
argv2 = "f:/foo/in/bar-"  # output directory
sys.argv = [argv0, argv1, argv2]
inDir = sys.argv[1]  # input directory
outDir = sys.argv[2]  # output directory
lList = []  # holder list to hold names of files to be processed

def processFiles():
    try:  # Makes list of local files in lDir, populates lList
        if os.listdir(inDir) == []:  # checks for files in lDir
            logging.info('No Files to upload')
            exit()
        else:
            for file_name in os.listdir(inDir):
                #print file_name
                if os.path.isfile(inDir+file_name):
                    lList.append(file_name)  # populate local dir list
            if 'Thumbs.db' in lList:  # remove windows thumbs file
                lList.remove('Thumbs.db')
            logging.info('Files to be checked')
            logging.info('%s', lList)
            #print lList, 'lList'
    except Exception, e:
        Gmail_email.email(e, program)
        logging.warning('Error with local files')
        logging.warning('%s', e)
        exit()
    for each in lList:  # calls on cleanup method for each file in lList
        filePath = inDir+each
        print filePath, "filepath"
        testFile(filePath)

def testFile(filePath):
    try:
        with open(filePath, "rb") as csvfile:
            spamreader = csv.reader(csvfile, delimiter=' ', quotechar='|')
            for row in spamreader:
                #print "good file, most likely"
                pass
    except Exception, e:
        logging.warning('Error with local files')
        logging.warning('%s', e)
        #print "cleaning bad file", filePath
        cleanBadFile(filePath)

def cleanBadFile(filePath):
    timestr = time.strftime("%Y%m%d-%H%M%S")
    #print "bad file trying to clean"
    f = open(outDir+timestr+".csv", 'ab')
    try:  # can i read the file
        workbook = xlrd.open_workbook(filePath)
        # will error here if bad, xlrd cannot open it
        print workbook.sheet_names()
        #print workbook
    except Exception, e:
        #print e, " error"
        pass
    worksheet = workbook.sheet_by_name('Sheet')
    num_rows = worksheet.nrows - 1
    num_cells = worksheet.ncols - 1
    #print worksheet.ncols, 'num cells'
    curr_row = -1
    while curr_row < num_rows:  # goes over every row
        num_cells = worksheet.ncols - 1
        curr_row += 1
        row = worksheet.row(curr_row)
        print row, "row"
        curr_cell = -1
        print worksheet.row_len(curr_row), "row len"
        print curr_row, curr_cell, "curr row, curr cell"
        cell_type = worksheet.cell_type(curr_row, curr_cell)
        cell_value = worksheet.cell_value(curr_row, curr_cell)
        print ' ', cell_type, ':', cell_value
        values = []
        if cell_type == 0:  # tests if first value in row is data
            # assuming that good rows will have a value in the first cell of each row
            # if no data, row is not copied to new file
            print "bad line"
            pass
        else:
            while curr_cell < num_cells:
                curr_cell += 1
                # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank
                print curr_row, "; ", curr_cell, " row and cell"
                cell_type = worksheet.cell_type(curr_row, curr_cell)
                cell_value = worksheet.cell_value(curr_row, curr_cell)
                #print cell_type, ":", cell_value
                if cell_type == xlrd.XL_CELL_DATE:
                    cell_value = datetime.timedelta(int(cell_value))
                    cell_value = str(date + cell_value)[:10]
                    #print cell_value, "cell value, cell date"
                values.append(cell_value)
                #print values, "values"
            csv.writer(f, delimiter=',',
                       quotechar=',', quoting=csv.QUOTE_MINIMAL).writerow(values)
    f.close()
    print f.closed
    print "ah"
    curr_cell = 0
    curr_row = 0

#print "checking file:", readFile
processFiles()
#print "exit"
exit
The error message:
Traceback (most recent call last):
  File "F:\cleanCSV.py", line 132, in <module>
    processFiles()
  File "F:\cleanCSV.py", line 51, in processFiles
    testFile(filePath)
  File "F:\cleanCSV.py", line 64, in testFile
    cleanBadFile(filePath)
  File "F:\cleanCSV.py", line 106, in cleanBadFile
    cell_type = worksheet.cell_type(curr_row, curr_cell)
  File "C:\Python27\lib\site-packages\xlrd\sheet.py", line 413, in cell_type
    return self._cell_types[rowx][colx]
IndexError: array index out of range
I feel like I need to "reset" a counting variable, but I think I have them all. I don't know what to do.
Two lines before the line causing the exception, curr_cell is set to -1, which can't be a valid cell index. A comment some lines further down suggests you expect that to be the first cell in the row, so the index should be 0 instead of -1.
I moved my increment (curr_cell += 1) down three lines:
while curr_cell < num_cells:
    # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank
    #print curr_row, "; ", curr_cell, " row and cell"
    cell_type = worksheet.cell_type(curr_row, curr_cell)
    cell_value = worksheet.cell_value(curr_row, curr_cell)
    print cell_type, ":", cell_value
    curr_cell += 1
    if cell_type == xlrd.XL_CELL_DATE:
        cell_value = datetime.timedelta(int(cell_value))
        cell_value = str(date + cell_value)[:10]
        #print cell_value, "cell value, cell date"
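For what it's worth, a sketch of the same traversal using range, which sidesteps the manual -1 counters entirely (the input file name here is an assumption):

import xlrd

workbook = xlrd.open_workbook('bad_file.xls')  # assumed input path
worksheet = workbook.sheet_by_index(0)

for curr_row in range(worksheet.nrows):       # 0 .. nrows-1, no counter bookkeeping
    for curr_cell in range(worksheet.ncols):  # 0 .. ncols-1
        cell_type = worksheet.cell_type(curr_row, curr_cell)
        cell_value = worksheet.cell_value(curr_row, curr_cell)
        print cell_type, ':', cell_value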