Python non-ASCII characters

I have a python file that creates and populates a table in MS SQL. The only sticking point is that the code breaks if there are any non-ASCII characters or single apostrophes (and there are quite a few of each). Although I can run the replace function to rid the strings of apostrophes, I would prefer to keep them intact. I have also tried converting the data to UTF-8, but no luck there either.
Below are the error messages I get:
"'ascii' codec can't encode character u'\2013' in position..." (for non-ascii characters)
and for the single quotes
<class 'pyodbc.ProgrammingError'>: ('42000', "[42000] [Microsoft][ODBC SQL Server Driver][SQL Server] Incorrect syntax near 'S, 230 X 90M.; Eligibilty....
When I try to encode string in utf-8, I instead get the following error message:
<type 'exceptions.UnicodeDecodeError'>: 'ascii' codec can't decode byte 0xe2 in position 219: ordinal not in range(128)
The python code is included below. I believe the point in the code where this break occurs is after the following line: InsertValue = str(row.GetValue(CurrentField['Name'])).
# -*- coding: utf-8 -*-
import pyodbc
import sys
import arcpy
import arcgisscripting
gp = arcgisscripting.create(9.3)
SQL_KEYWORDS = ['PERCENT', 'SELECT', 'INSERT', 'DROP', 'TABLE']
#SourceFGDB = '###'
#SourceTable = '###'
SourceTable = sys.argv[1]
TempInputName = sys.argv[2]
SourceTable2 = sys.argv[3]
#---------------------------------------------------------------------------------------------------------------------
# Target Database Settings
#---------------------------------------------------------------------------------------------------------------------
TargetDatabaseDriver = "{SQL Server}"
TargetDatabaseServer = "###"
TargetDatabaseName = "###"
TargetDatabaseUser = "###"
TargetDatabasePassword = "###"
# Get schema from FGDB table.
# This should be an ordered list of dictionary elements [{'FGDB_Name', 'FGDB_Alias', 'FGDB_Type', FGDB_Width, FGDB_Precision, FGDB_Scale}, {}]
if not gp.Exists(SourceTable):
    print(' - The source does not exist.')
    sys.exit(102)
#### Should see if it is actually a table type. Could be a Feature Data Set or something...
print(' - Processing Items From : ' + SourceTable)
FieldList = []
Field_List = gp.ListFields(SourceTable)
print(' - Getting number of rows.')
result = gp.GetCount_management(SourceTable)
Number_of_Features = gp.GetCount_management(SourceTable)
print(' - Number of Rows: ' + str(Number_of_Features))
print(' - Getting fields.')
Field_List1 = gp.ListFields(SourceTable, 'Layer')
Field_List2 = gp.ListFields(SourceTable, 'Comments')
Field_List3 = gp.ListFields(SourceTable, 'Category')
Field_List4 = gp.ListFields(SourceTable, 'State')
Field_List5 = gp.ListFields(SourceTable, 'Label')
Field_List6 = gp.ListFields(SourceTable, 'DateUpdate')
Field_List7 = gp.ListFields(SourceTable, 'OBJECTID')
for Current_Field in Field_List1 + Field_List2 + Field_List3 + Field_List4 + Field_List5 + Field_List6 + Field_List7:
    print(' - Field Found: ' + Current_Field.Name)
    if Current_Field.AliasName in SQL_KEYWORDS:
        Target_Name = Current_Field.Name + '_'
    else:
        Target_Name = Current_Field.Name
    print(' - Alias : ' + Current_Field.AliasName)
    print(' - Type : ' + Current_Field.Type)
    print(' - Length : ' + str(Current_Field.Length))
    print(' - Scale : ' + str(Current_Field.Scale))
    print(' - Precision: ' + str(Current_Field.Precision))
    FieldList.append({'Name': Current_Field.Name, 'AliasName': Current_Field.AliasName, 'Type': Current_Field.Type, 'Length': Current_Field.Length, 'Scale': Current_Field.Scale, 'Precision': Current_Field.Precision, 'Unique': 'UNIQUE', 'Target_Name': Target_Name})
# Create table in SQL Server based on FGDB table schema.
cnxn = pyodbc.connect(r'DRIVER={SQL Server};SERVER=###;DATABASE=###;UID=sql_webenvas;PWD=###')
cursor = cnxn.cursor()
#### DROP the table first?
try:
    DropTableSQL = 'DROP TABLE dbo.' + TempInputName + '_Test;'
    print(DropTableSQL)
    cursor.execute(DropTableSQL)
    cnxn.commit()
except:
    print('WARNING: Can not drop table - may not exist: ' + TempInputName + '_Test')
CreateTableSQL = ('CREATE TABLE ' + TempInputName + '_Test '
                  ' (Layer varchar(500), Comments varchar(5000), State int, Label varchar(500), DateUpdate DATETIME, Category varchar(50), OBJECTID int)')
cursor.execute(CreateTableSQL)
cnxn.commit()
# Cursor through each row in the FGDB table, get values, and insert into the SQL Server Table.
# We got Number_of_Features earlier, just use that.
Number_Processed = 0
print(' - Processing ' + str(Number_of_Features) + ' features.')
rows = gp.SearchCursor(SourceTable)
row = rows.Next()
while row:
    if Number_Processed % 10000 == 0:
        print(' - Processed ' + str(Number_Processed) + ' of ' + str(Number_of_Features))
    InsertSQLFields = 'INSERT INTO ' + TempInputName + '_Test ('
    InsertSQLValues = 'VALUES ('
    for CurrentField in FieldList:
        InsertSQLFields = InsertSQLFields + CurrentField['Target_Name'] + ', '
        InsertValue = str(row.GetValue(CurrentField['Name']))
        if InsertValue in ['None']:
            InsertValue = 'NULL'
        # Use an escape quote for the SQL.
        InsertValue = InsertValue.replace("'","' '")
        if CurrentField['Type'].upper() in ['STRING', 'CHAR', 'TEXT']:
            if InsertValue == 'NULL':
                InsertSQLValues = InsertSQLValues + "NULL, "
            else:
                InsertSQLValues = InsertSQLValues + "'" + InsertValue + "', "
        elif CurrentField['Type'].upper() in ['GEOMETRY']:
            ## We're not handling geometry transfers at this time.
            if InsertValue == 'NULL':
                InsertSQLValues = InsertSQLValues + '0' + ', '
            else:
                InsertSQLValues = InsertSQLValues + '1' + ', '
        else:
            InsertSQLValues = InsertSQLValues + InsertValue + ', '
    InsertSQLFields = InsertSQLFields[:-2] + ')'
    InsertSQLValues = InsertSQLValues[:-2] + ')'
    InsertSQL = InsertSQLFields + ' ' + InsertSQLValues
    ## print InsertSQL
    cursor.execute(InsertSQL)
    cnxn.commit()
    Number_Processed = Number_Processed + 1
    row = rows.Next()
print(' - Processed all ' + str(Number_Processed))
del row
del rows

James, I believe the real issue is that you are not using Unicode across the board. Try to do the following:
Make sure that your input file that you are using to populate the DB is in UTF-8 and that you are reading it with the UTF-8 encoder.
Make sure your DB is actually storing the data as Unicode
When you retrieve data from the file or from the DB or want to manipulate strings (with the + operator for instance) you need to make sure that all parts are proper Unicode. You can NOT use the str() method. You need to use unicode() as Dave pointed out. If you define strings in your code use u'my string' instead of 'my string' (otherwise it is not considered unicode).
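For example, a minimal sketch of that last point (MyTable is a placeholder table name, not something from the original script):
# Keep everything unicode until the database boundary
comments = unicode(row.GetValue('Comments'))  # unicode(), not str()
sql = u"INSERT INTO MyTable (Comments) VALUES ('" + comments.replace(u"'", u"''") + u"')"  # '' escapes a quote in SQL
cursor.execute(sql.encode('utf-8'))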
Also, please provide us the full stack trace and the exception name.

I'm going to use my psychic debugging skills and say you are trying to str()ify something and getting an error with the ascii codec. What you really should do is to use the utf-8 codec instead like this:
insert_value_uni = unicode(row.GetValue(CurrentField['Name']))
InsertValue = insert_value_uni.encode('utf-8')

Or you can take the view that only ASCII is allowed and use the awesomely named Unicode Hammer
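A side note on the apostrophes: if you hand the values to pyodbc as query parameters instead of splicing them into the SQL text, the driver takes care of quoting and encoding for you. A minimal sketch (the table and column names are placeholders):
# pyodbc substitutes the ? safely, apostrophes and all
cursor.execute("INSERT INTO MyTable (Comments) VALUES (?)", insert_value_uni)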

In general you want to convert to unicode on data input, and convert to the desired encoding on output.
Your problem will be easier to find if you do this. It means changing all string literals to unicode, e.g. 'INSERT INTO ' to u'INSERT INTO '. (Notice the "u" before the string.)
Then when you send the string to be executed convert to the desired encoding, "utf8".
cursor.execute(InsertSQL.encode("utf8")) # Where InsertSQL is unicode
Also, you should add the encoding string to the top of your source code.
This means adding the encoding cookie to one of the first two lines of the file:
#!/usr/bin/python
# -*- coding: <encoding name> -*-
If you're pulling data from a file to build your string, you can use codecs.open to auto-convert from a specific encoding to unicode on load.
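Something along these lines (a sketch; the file name and encoding are assumptions):
import codecs
# Each line read from f arrives as a unicode object, decoded from UTF-8
f = codecs.open('source_data.txt', 'r', 'utf-8')
for line in f:
    handle(line)  # 'handle' is a placeholder for your own processing
f.close()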

When I converted my str() to unicode, that solved the problem. A simple answer, and I appreciate everyone's help on this.

Related

list index out of range when extending list

This function takes an email body as input and returns the values after 'Application name', 'Source' and 'Message' respectively, and it works fine:
def parse_subject(line):
    info = {}
    segments = line.split(' ')
    info['time'] = segments[0] + ' ' + segments[1]
    for i in range(2, len(segments)):
        key = ''
        if segments[i] == 'Application name:':
            key = 'appname'
        elif segments[i] == 'Source:':
            key = 'source'
        elif segments[i] == 'Message:':
            key = 'message'
        if key != '':
            i += 1
            info[key] = segments[i]
    return info
For another email body format I need to extend the number of segments, because I need to search more lines in the message body. So I changed info['time'], and as soon as I extend segments by more than 2 I get out-of-range errors:
info['time'] = segments[0]+' '+segments[1]+' '+segments[2]+' '+segments[3]+' '+segments[4]+' '+segments[5]......up to segment[17]
Maybe I'll need to extend it even further, and the above function fails with list index out of range.
I changed the code, but got the same error. I also tried changing the number to match the number of segments, but the same:
for i in range(<number of segments>, len(segments)):
Example of segments: the length will vary, because the string after Message has a different value each time; sometimes it's a URL string.
Question
When I define the length of the segments, let's say up to segments[17], what do I need to change in the function so it doesn't throw an out-of-index error?
def parse_subject(line):
    info = {}
    segments = line.split(' ')
    info['time'] = (segments[0] + ' ' + segments[1] + ' ' + segments[2] + ' ' + segments[3] + ' ' + segments[4] + ' ' + segments[5] + ' ' + segments[6] + ' ' + segments[7] + ' ' + segments[8] + ' ' + segments[9] + ' ' + segments[10] + ' ' + segments[11] + ' ' + segments[12] + ' ' + segments[13] + ' ' + segments[14] + ' ' + segments[15] + ' ' + segments[16] + ' ' + segments[17])
    for i in range(16, len(segments)):
        key = ''
        if segments[i] == 'name:':
            key = 'appname'
        elif segments[i] == 'Source:':
            key = 'source'
        elif segments[i] == 'Message:':
            key = 'message'
        if key != '':
            i += 1
            info[key] = segments[i]
    return info
if mail["Subject"].find("PA1") > 0 or mail["Subject"].find("PA2") > 0:
body = get_autosys_body(mail)
# print(body)
for line in body.splitlines():
if 'Application Name' in line:
job_info = parse_subject(line)
break
print(job_info)
I need to pass the line variable (content below)
name:Contoso.Service
Source: host15
Timestamp: 2019-01-22T00:00:43.901Z
Message:null
to the parse_subject(line) function, and from the above output get:
Contoso.Service as the value of job_info['appname']
host15 as the value of job_info['source']
null as the value of job_info['message']
You need to debug your code; the error is telling you exactly what is wrong.
def old_parse_subject(line):
    info = {}
    segments = line.split(' ')
    if len(segments) < 18:
        raise ValueError("segments[17] won't work if segments is not that long")
You could have done a print(len(segments)) or just print (segments) right before where you know the error is.
For reading an email header, if you know it has multiple lines, you get those with split('\n') and then for each line if you know it is "name: value" you get that with split(':', 1).
The second argument to split says only split on 1 colon, because any additional colons are allowed to be part of the data. For example, timestamps have colons.
def parse_subject(headers):
    info = {}
    # split the full header into separate lines
    for line in headers.split('\n'):
        # split on colon, but only once
        key, value = line.split(':', 1)
        # store info
        info[key] = value
    return info
data = """name:Contoso.Service
Source: host15
Timestamp: 2019-01-22T00:00:43.901Z
Message:null"""
print (parse_subject(data))
{'name': 'Contoso.Service', 'Source': ' host15', 'Timestamp': ' 2019-01-22T00:00:43.901Z', 'Message': 'null'}
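One small wrinkle: each value keeps whatever whitespace followed the colon (note ' host15' above). If that matters, strip it when storing:
info[key] = value.strip()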

Python-Use binary copy table from with psycopg2

I'm trying to adapt this 'Use binary COPY table FROM with psycopg2'
example from @Mike T to my data, but I'm having some problems.
import psycopg2
import numpy as np
from struct import pack
from io import BytesIO
from datetime import datetime
conn = psycopg2.connect(host = 'x', database = 'x', user = 'x')
curs = conn.cursor()
DROP TABLE IF EXISTS test_test;
CREATE TABLE test_test(
    id_from_database INT PRIMARY KEY,
    version VARCHAR,
    information TEXT
);
data = [(3,1,'hello hello!!'), (2,'123','test test???!'),(3,9, 'bye bye :)')]
dtype = [('id_from_database', 'object'),('version', 'object'),('information', 'object')]
data = np.array(data,dtype=dtype)
def prepare_text(dat):
    cpy = BytesIO()
    for row in dat:
        cpy.write('\t'.join([repr(x) for x in row]) + '\n')
    return(cpy)
def prepare_binary(dat):
    pgcopy_dtype = [('num_fields','>i2')]
    for field, dtype in dat.dtype.descr:
        pgcopy_dtype += [(field + '_length', '>i4'),
                         (field, dtype.replace('<', '>'))]
    pgcopy = np.empty(dat.shape, pgcopy_dtype)
    pgcopy['num_fields'] = len(dat.dtype)
    for i in range(len(dat.dtype)):
        field = dat.dtype.names[i]
        pgcopy[field + '_length'] = dat.dtype[i].alignment
        pgcopy[field] = dat[field]
    cpy = BytesIO()
    cpy.write(pack('!11sii', b'PGCOPY\n\377\r\n\0', 0, 0))
    cpy.write(pgcopy.tostring())  # all rows
    cpy.write(pack('!h', -1))  # file trailer
    #print("cpy")
    #print(cpy)
    return(cpy)
###
def time_pgcopy(dat, table, binary):
    print('Processing copy object for ' + table)
    tstart = datetime.now()
    cpy = prepare_binary(dat)
    tendw = datetime.now()
    print('Copy object prepared in ' + str(tendw - tstart) + '; ' +
          str(cpy.tell()) + ' bytes; transfering to database')
    cpy.seek(0)
    curs.copy_expert('COPY ' + table + ' FROM STDIN WITH BINARY', cpy)
    conn.commit()
    tend = datetime.now()
    print('Database copy time: ' + str(tend - tendw))
    print(' Total time: ' + str(tend - tstart))
    return

print(time_pgcopy(data, 'test_test', binary=True))
I'm getting this error:
curs.copy_expert('COPY ' + table + ' FROM STDIN WITH BINARY', cpy)
psycopg2.DataError: incorrect binary data format
CONTEXT: COPY test_test, line 1, column id_from_database
What am I doing wrong?
Thank you :)
(I cannot comment on the original question because I don't have enough reputation.)
cpgcopy might be relevant here.
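If binary COPY keeps fighting you, plain text COPY is a much simpler fallback. A minimal sketch reusing the conn, curs and test_test table from the question (the sample rows here are my own):
from io import BytesIO

rows = [(1, '1', 'hello hello!!'), (2, '123', 'test test???!')]
buf = BytesIO()
for id_, version, information in rows:
    # tab-separated text instead of the binary format
    buf.write(('%s\t%s\t%s\n' % (id_, version, information)).encode('utf-8'))
buf.seek(0)
curs.copy_from(buf, 'test_test', sep='\t',
               columns=('id_from_database', 'version', 'information'))
conn.commit()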

Concatenating strings containing many quotations results in slashes in output

I am trying to build a string that needs to contain specific double and single quotation characters for executing a SQL expression.
I need my output to be formatted like this:
" "Full_Stree" = 'ALLENDALE RD' "
where the value of ALLENDALE RD will be a variable defined through a For Loop. In the following code sample, the variable tOS is what I am trying to pass into the query variable.
tOS = "ALLENDALE RD"
query = '" "Full_Stree" = ' + "'" + tOS + "' " + '"'
and when I print the value of the query variable I get this output:
'" "Full_Stree" = \'ALLENDALE RD\' "'
The slashes are causing my query to fail. I also tried using the % string-formatting operator to pass the value of the tOS variable, but I get the same results:
where = '" "Full_Stree" = \'%s\' "' % (tOS)
print where
'" "Full_Stree" = \'ALLENDALE RD\' "'
How can I get my string concatenated into the correct format, leaving the slashes out of the expression?
What you are seeing is the repr of your string.
>>> s = '" "Full_Stree" = \'ALLENDALE RD\' "'
>>> s # without print console displays the repr
'" "Full_Stree" = \'ALLENDALE RD\' "'
>>> print s # with print the string itself is displayed
" "Full_Stree" = 'ALLENDALE RD' "
Your real problem is the extra quotes at the beginning and end of your where-clause.
This
query = '" "Full_Stree" = ' + "'" + tOS + "' " + '"'
should be
query = '"Full_Stree" = ' + "'" + tOS + "'"
It is more clearly written as
query = """"Full_Stree" = '%s'""" % tOS
The ArcGis docs recommend something more like this
dataset = '/path/to/featureclass/shapefile/or/table'
field = arcpy.AddFieldDelimiters(dataset, 'Full_Stree')
whereclause = "%s = '%s'" % (field, tOS)
arcpy.AddFieldDelimiters makes sure that the field name includes the proper quoting style for the dataset you are using (some use double-quotes and some use square brackets).
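For a shapefile, for instance, the delimited name comes back double-quoted, something like this (illustrative; the delimiters depend on the data source):
>>> arcpy.AddFieldDelimiters('/path/to/shapefile', 'Full_Stree')
'"Full_Stree"'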
Somehow the way I already tried worked out:
where = '" "Full_Stree" = \'%s\' "' % (tOS)
print where
'" "Full_Stree" = \'ALLENDALE RD\' "'
Can't you just use triple quotes?
a=""" "Full_Street" = 'ALLENDALE RD' """
print a
"Full_Street" = 'ALLENDALE RD'

Invalid Python syntax using file.write

Trying to learn some geospatial python. More or less following the class notes here.
My Code
#!/usr/bin/python
# import modules
import ogr, sys, os
# set working dir
os.chdir('/home/jacques/misc/pythongis/data')
# create the text file we're writing to
file = open('data_export.txt', 'w')
# import the required driver for .shp
driver = ogr.GetDriverByName('ESRI Shapefile')
# open the datasource
data = driver.Open('road_surveys.shp', 1)
if data is None:
    print 'Error, could not locate file'
    sys.exit(1)
# grab the datalayer
layer = data.GetLayer()
# loop through the features
feature = layer.GetNextFeature()
while feature:
    # acquire attributes
    id = feature.GetFieldAsString('Site_Id')
    date = feature.GetFieldAsString('Date')
    # get coordinates
    geometry = feature.GetGeometryRef()
    x = str(geometry.GetX())
    y = str(geometry.GetY()
    # write to the file
    file.Write(id + ' ' + x + ' ' + y + ' ' + cover + '\n')
    # remove the current feature, and get a new one
    feature.Destroy()
    feature = layer.GetNextFeature()
# close the data source
datasource.Destroy()
file.close()
Running that gives me the following:
File "shape_summary.py", line 38
file.write(id + ' ' + x + ' ' + y + ' ' + cover + '\n')
^
SyntaxError: invalid syntax
Running Python 2.7.1
Any help would be fantastic!
Previous line is missing a close parenthesis:
y = str(geometry.GetY())
Also, just a style comment: it's a good idea to avoid using the variable name file in python because it actually has a meaning. Try opening a new python session and running help(file)
1) write shouldn't be upper-case in your code (Python is case sensitive)
2) Make sure id is a string; if it isn't, use str(id) in your term, and the same for "cover", "x" and "y"
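Putting both answers together, the relevant lines would look something like this (a sketch; out_file stands in for the shadowed name file, and cover is left out because it is never assigned in the posted code):
y = str(geometry.GetY())  # closing parenthesis added
out_file.write(id + ' ' + x + ' ' + y + '\n')  # lower-case write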

Python reading csv problem : extra whitespace

When I tried to parse a csv which was exported by an MS SQL 2005 Express Edition query, the string Python gives me is totally unexpected. For example, if the line in the csv file is "aaa,bbb,ccc,dddd", then when Python parses it as a string it becomes something like "a a a , b b b , c c c , d d d d". What is happening?
I tried to remove the spaces in the code, but it doesn't work.
import os
import random
f1 = open('a.txt', 'r')
f2 = open('dec_sql.txt', 'w')
text = 'abc'
while(text != ''):
    text = f1.readline()
    if(text == ''):
        break
    splited = text.split(',')
    for i in range(0, 32):
        splited[i] = splited[i].replace(' ', '')
    sql = 'insert into dbo.INBOUND_RATED_DEC2010 values ('
    sql += '\'' + splited[0] + '\', '
    sql += '\'' + splited[1] + '\', '
    sql += '\'' + splited[2] + '\', '
    sql += '\'' + splited[3] + '\', '
    sql += '\'' + splited[4] + '\', '
    sql += '\'' + splited[5] + '\', '
    sql += '\'' + splited[6] + '\', '
    sql += '\'' + splited[7] + '\', '
    sql += '\'' + splited[8] + '\', '
    sql += '\'' + splited[9] + '\', '
    sql += '\'' + splited[10] + '\', '
    sql += '\'' + splited[11] + '\', '
    sql += '\'' + splited[12] + '\', '
    sql += '\'' + splited[13] + '\', '
    sql += '\'' + splited[14] + '\', '
    sql += '\'' + splited[15] + '\', '
    sql += '\'' + splited[16] + '\', '
    sql += '\'' + splited[17] + '\', '
    sql += '\'' + splited[18] + '\', '
    sql += '\'' + splited[19] + '\', '
    sql += '\'' + splited[20] + '\', '
    sql += '\'' + splited[21] + '\', '
    sql += '\'' + splited[22] + '\', '
    sql += '\'' + splited[23] + '\', '
    sql += '\'' + splited[24] + '\', '
    sql += '\'' + splited[25] + '\', '
    sql += '\'' + splited[26] + '\', '
    sql += '\'' + splited[27] + '\', '
    sql += '\'' + splited[28] + '\', '
    sql += '\'' + splited[29] + '\', '
    sql += '\'' + splited[30] + '\', '
    sql += '\'' + splited[31] + '\', '
    sql += '\'' + splited[32] + '\' '
    sql += ')'
    print sql
    f2.write(sql+'\n')
f2.close()
f1.close()
Sounds to me like the output of the MS SQL 2005 query is a unicode file. The python csv module cannot handle unicode files, but there is some sample code in the documentation for the csv module describing how to work around the problem.
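That sample code is roughly the following (a sketch adapted from the Python 2 csv docs):
import csv

def utf_8_encoder(unicode_csv_data):
    # csv.reader can't take unicode input, so re-encode each line as UTF-8
    for line in unicode_csv_data:
        yield line.encode('utf-8')

def unicode_csv_reader(unicode_csv_data, **kwargs):
    reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
    for row in reader:
        # decode UTF-8 back to unicode, cell by cell
        yield [unicode(cell, 'utf-8') for cell in row]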
Alternately, some text editors allow you to save a file with a different encoding. For example, I opened the results of a MS SQL 2005 query in Notepad++ and it told me the file was UCS-2 encoded and I was able to convert it to UTF-8 from the Encoding menu.
Try to open the file in notepad and use the replace all function to replace ' ' with ''
Your file is most likely encoded with a 2-byte character encoding - most likely UTF-16 (but it could be some other encoding).
To get the csv module to read it properly, you'd open it with a codec so that it is decoded as it's read; doing that, you have Unicode objects (not string objects) inside your Python program.
So, instead of opening the file with
my_file = open ("data.dat", "rt")
Use:
import codecs
my_file = codecs.open("data.dat", "rt", "utf-16")
And then feed this to the CSV module, with:
import csv
reader = csv.reader(my_file)
first_line = True
for line in reader:
    if first_line:  # skips header line
        first_line = False
        continue
    # assemble sql query and issue it
Another thing is that building your "query" out of 32 lines of repetitive code is not a nice way to program. Even in languages that lack rich string-processing facilities there are better ways to do it, but in Python you can simply write:
sql = 'insert into dbo.INBOUND_RATED_DEC2010 values (%s);' % ", ".join("'%s'" % value for value in splited)
instead of those 33 lines assembling your query. (The join method pastes together all elements of the sequence passed to it, using the string ", " as the separator; that sequence is made of one single-quoted string for each value in your splited array, and the result is substituted for the %s by the % operator.)
It may help to use Python's built-in CSV reader. It looks like an issue with unicode, a problem that frustrated me a lot.
import re
import csv
import tkFileDialog

ENCODING_REGEX_REPLACEMENT_LIST = [(re.compile('\xe2\x80\x99'), "'"),
                                   (re.compile('\xe2\x80\x94'), "--"),
                                   (re.compile('\xe2\x80\x9c'), '"'),
                                   (re.compile('\xe2\x80\x9d'), '"'),
                                   (re.compile('\xe2\x80\xa6'), '...')]

def correct_encoding(csv_row):
    for key in csv_row.keys():
        # if there is a value for the current key
        if csv_row[key]:
            try:
                csv_row[key] = unicode(csv_row[key], errors='strict')
            except ValueError:
                # we have a bad encoding, try iterating through all the known
                # bad encodings in ENCODING_REGEX_REPLACEMENT_LIST, replace
                # everything, and then try again
                for (regex, replacement) in ENCODING_REGEX_REPLACEMENT_LIST:
                    csv_row[key] = regex.sub(replacement, csv_row[key])
                print(csv_row)
                csv_row[key] = unicode(csv_row[key])
        # if there is NOT a value for the current key
        else:
            csv_row[key] = unicode('')
    return csv_row

filename = tkFileDialog.askopenfilename()
csv_reader = csv.DictReader(open(filename, "rb"), dialect='excel')  # assuming similar dialect
for csv_row in csv_reader:
    csv_row = correct_encoding(csv_row)
    # your application logic here
