Access search from XML file - Python

My code searches an XML file (the function findTerminal) for a specific attribute name and passes it to another function (called DBAccess) that looks for that attribute in an Access database.
The attribute name is not stored concatenated in the database, so I had to build the concatenation myself to make the comparison.
But nothing happens when I execute the code?
import csv
import pyodbc
from xml.dom import minidom

# *************************************
def DBAccess(Term):
    MDB = 'c:/test/mydb.mdb'
    DRV = '{Microsoft Access Driver (*.mdb)}'
    PWD = ''
    conn = pyodbc.connect('DRIVER=%s;DBQ=%s;PWD=%s' % (DRV, MDB, PWD))
    curs = conn.cursor()
    print 'connexion opened'
    SQL = 'SELECT * FROM gdo_arc;'  # insert your query here
    curs.execute(SQL)
    curs.execute("select nat_arc from gdo_segment")
    rows = curs.fetchall()
    for row in rows:
        T = 'T' + row.tronson + '_' + row.noued1 + '-' + row.noued2
        if T == Term:
            print (' Terminal found')
    curs.close()
    conn.close()

# *************************************
def findTerminal():
    xmldoc = minidom.parse('c:\\test\mydoc.xml')
    # printing the number of blocs in my xml file
    itemlist = xmldoc.getElementsByTagName('ACLineSegment')
    for item in itemlist:
        found = False
        for child in item.childNodes:
            if child.nodeName == 'Terminal':
                found = True
        if not found:
            Term = item.getAttribute('Name')
            DBAccess(Term)

# ***********************************
findTerminal()
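For reference, here is a minimal sketch of the same comparison done in one pass: the concatenated names are built into a set once, and each Term is then tested with a membership check. The table and column names (gdo_segment, tronson, noued1, noued2) are carried over from the code above and may need adjusting; note that the query has to select those three columns for row.tronson etc. to be available.

import pyodbc

def load_terminals(mdb_path='c:/test/mydb.mdb'):
    # Build the set of concatenated terminal names once.
    conn = pyodbc.connect(
        'DRIVER={Microsoft Access Driver (*.mdb)};DBQ=%s;PWD=' % mdb_path)
    curs = conn.cursor()
    # the three columns used to build the name must be selected explicitly
    curs.execute("SELECT tronson, noued1, noued2 FROM gdo_segment")
    terminals = set('T%s_%s-%s' % (r.tronson, r.noued1, r.noued2)
                    for r in curs.fetchall())
    curs.close()
    conn.close()
    return terminals

# usage inside findTerminal(): build the set once, then test membership
#     terminals = load_terminals()
#     ...
#     if Term in terminals:
#         print ' Terminal found'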

Related

How to improve the performance of a Python script extracting large (3-4 GB) Oracle tables

I'm connecting to an Oracle database with a Python script and extracting around 10 tables. One table holds about 3 GB of data and took around 4 hours to extract and upload to S3 with the code below. How can I improve the performance of this script?
Would a different file format than CSV, such as Parquet, improve performance?
Any suggestions or solutions would be highly appreciated.
Below is the code I tried:
def extract_handler():
    # Parameters defined in cloudwatch event
    env = os.environ['Environment'] if 'Environment' in os.environ else 'sit'
    # FTP parameters
    host = f"/{env}/connet_HOSTNAME"
    username = f"/{env}/connect_USERNAME"
    password = f"/{env}/connect_PASSWORD"
    host = get_parameters(host)
    username = get_parameters(username)
    password = get_parameters(password)
    today = date.today()
    current_date = today.strftime("%Y%m%d")
    con = None
    cur = None
    tables = ["table1", "table2", "table3", ........., "table10"]
    bucket = "bucket_name"
    for table in tables:
        try:
            con = cx_Oracle.connect(username, password, host, encoding="UTF-8")
            cur = con.cursor()
            logging.info('Successfully established the connection to Oracle db')
            table_name = table.split(".")[1]
            logging.info("######## Table name:" + table + " ###### ")
            logging.info("****** PROCESSING:" + table_name + " *********")
            cur.execute("SELECT count(*) FROM {}".format(table))
            count = cur.fetchone()[0]
            logging.info("Count:", count)
            if count > 0:
                cur1 = con.cursor()
                # Define the desired timestamp format
                timestamp_format = '%Y/%m/%d %H:%M:%S'
                # Execute a query to read a table
                cur1.execute("select * from {} where TRUNC(DWH_CREATED_ON)=TRUNC(SYSDATE)-1".format(table))
                batch_size = 10000
                rows = cur1.fetchmany(batch_size)
                csv_file = f"/tmp/{table_name}.csv"
                with open(csv_file, "w", newline="") as f:
                    # Add file_date column as the first column
                    writer = csv.DictWriter(f, fieldnames=['file_date'] + [col[0] for col in cur1.description],
                                            delimiter='\t')
                    writer.writeheader()
                    logging.info("Header added to the table:" + table + "######")
                while rows:
                    for row in rows:
                        row_dict = {'file_date': current_date}
                        for i, col in enumerate(cur1.description):
                            if col[1] == cx_Oracle.DATETIME:
                                if row[i] is not None:
                                    row_dict[col[0]] = row[i].strftime(timestamp_format)
                                else:
                                    row_dict[col[0]] = ""
                            else:
                                row_dict[col[0]] = row[i]
                        with open(csv_file, "a", newline="") as f:
                            # Add file_date column as the first column
                            writer = csv.DictWriter(f, fieldnames=['file_date'] + [col[0] for col in cur1.description],
                                                    delimiter='\t')
                            writer.writerow(row_dict)
                    # Fetch the next batch of rows
                    rows = cur1.fetchmany(batch_size)
                logging.info("Records written to the temp file for the table :" + table + "######")
                s3_path = "NorthernRegion" + '/' + table_name + '/' + current_date + '/' + table_name + '.csv'
                s3_client = boto3.client('s3', region_name='region-central-1')
                s3_client.upload_file('/tmp/' + table_name + '.csv', bucket, s3_path)
                logging.info(table + "File uploaded to S3 ######")
            else:
                logging.info('Table not having data')
                return 'Data is not refreshed yet, Hence quitting..'
            if cur1:
                cur1.close()
        except Exception as err:
            # Handle or log other exceptions such as bucket doesn't exist
            logging.error(err)
        finally:
            if cur:
                cur.close()
            if con:
                con.close()
    return "Successfully processed"

Matching regex in Python from an Excel file

I'm using regex to match rows of the following Excel file and I'm struggling with how to separate each row into
the timestamp [0:00:48],
the ID 20052A,
and the content content (more content).
This is the Excel row (one of many, so the ID can vary from row to row, as can the timestamp and the content):
[0:00:48] 20052A: content (more content)
I get the error
AttributeError: 'NoneType' object has no attribute 'group'
when matching my ID, where I have
(r"^(.+:)(.+)|(r(\w+)?\s*\[(.*)\]\s*(\w+))", c)
Keep in mind that from time to time the ID looks something like this:
[0:00:33] 30091aA: (content)
My whole script is below (the database connection details are redacted):
import os
import re
import pymysql
pymysql.install_as_MySQLdb()
import pandas as pd
import sqlalchemy

def insert_or_update(engine, pd_table, table_name):
    inserts = 0
    updates = 0
    for i in range(len(pd_table)):
        vals_with_quotes = ["'" + str(x) + "'" for x in pd_table.loc[i, :].values]
        # print(vals_with_quotes)
        update_pairs = [str(c) + " = '" + str(v) + "'" for c, v in zip(pd_table.columns, pd_table.loc[i, :])]
        query = f"INSERT INTO {table_name} ({', '.join(list(pd_table.columns.values))}) " \
                f"VALUES ({', '.join(vals_with_quotes)}) " \
                f"ON DUPLICATE KEY UPDATE {', '.join(update_pairs)}"
        print(query)
        result = engine.execute(query)
        if result.lastrowid == 0:
            updates += 1
        else:
            inserts += 1
    print(f"Inserted {inserts} rows and updated {updates} rows.")

schema = '---'
alchemy_connect = "---"
engine = sqlalchemy.create_engine(alchemy_connect)  # connect to server
engine.execute(f"USE {schema}")  # select new db
# engine.execute("SET NAMES UTF8MB4;")
query = "SELECT * FROM .... where ...=..."
pm = pd.read_sql(query, engine)
rootpath = "path/"
for root, dirs, files in os.walk(rootpath):
    for file in files:
        print(root, dirs, files, file)
        d = pd.read_excel(root + file, header=None)
        d.drop(columns=[0], inplace=True)
        d.rename(columns={1: "content"}, inplace=True)
        participants = []
        for ix, row in d.iterrows():
            c = row["content"]
            match = re.search(r"^(.+:)(.+)|(r(\w+)?\s*\[(.*)\]\s*(\w+))", c)
            prefix = match.group(1)
            only_content = match.group(2)
            try:
                timestamp = re.search(r"\[(\d{1,2}:\d{1,2}:\d{1,2})\]", prefix).group(1)
            except:
                timestamp = "-99"
            # print(timestamp)
            if re.search(r"\s(Versuchsleiter|ersuchsleiter|Versuchsleit|Versuch):", prefix):
                id_code = "Versuchsleiter"
            else:
                starting_digits = re.search(r"^(\d+)", prefix)
                id_code = re.search(r"(\d{2,4}.{1,3}):", prefix).group(1)
                if hasattr(starting_digits, 'group'):
                    id_code = starting_digits.group(1) + id_code
            # get pid
            participant = pm.loc[pm["id_code"] == id_code, "pid"]
            try:
                pid = participant.values[0]
            except:
                pid = "Versuchsleiter"
            # print(ix, pid, id_code, only_content, timestamp)
            if pid and pid not in participants and pid != "Versuchsleiter":
                participants.append(pid)
            d.loc[ix, "pid"] = pid
            d.loc[ix, "timestamp"] = timestamp
            d.loc[ix, "content"] = only_content.strip()
            d.loc[ix, "is_participant"] = 0 if pid == "Versuchsleiter" else 1
        d = d[["pid", "is_participant", "content", "timestamp"]]
        d.loc[(d['pid'] == "Versuchsleiter"), "pid"] = participants[0]
        d.loc[(d['pid'] == None), "pid"] = participants[0]
        insert_or_update(engine, d, "table of sql")
I need "Versuchsleiter" since some of the ID's are "Versuchsleiter"
Thank you!
You should take advantage of capturing groups.
All the initial regex matching (after c = row["content"] and before # get pid) can be done with:
match = re.search(r"^\[(\d{1,2}:\d{1,2}:\d{1,2})]\s+(\w+):\s*(.*)", c)
if match:
    timestamp = match.group(1)
    id_code = match.group(2)
    only_content = match.group(3)
    if re.search(r"(?:Versuch(?:sleit(?:er)?)?|ersuchsleiter)", id_code):
        id_code = "Versuchsleiter"
Your timestamp will be 0:00:33, only_content will hold (content) and id_code will contain 30091aA.
See the regex demo
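As a quick sanity check against the second sample row from the question, the suggested pattern yields the three pieces directly:

import re

sample = "[0:00:33] 30091aA: (content)"  # sample row from the question
m = re.search(r"^\[(\d{1,2}:\d{1,2}:\d{1,2})]\s+(\w+):\s*(.*)", sample)
if m:
    print(m.group(1))  # 0:00:33
    print(m.group(2))  # 30091aA
    print(m.group(3))  # (content)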
Thank you for your help, but this gives me the following error:
Traceback (most recent call last):
  File "C:/Users/.../PycharmProjects/.../.../....py", line 80, in <module>
    insert_or_update(engine, d, "sql table")
TypeError: not enough arguments for format string

Python SQLite3 - cursor.execute - no error

This is a piece of code which needs to perform the following functionality:
Dump all table names in a database
From each table, search for a column with either Latitude or Longitude in it
Store these co-ords as a JSON file
The code was tested and working on a single database. However, once it was put into another piece of code which calls it with different databases, it no longer enters line 49. There is no error either, so I am struggling to see what the issue is, as I have not changed anything.
Code snippet (line 48 is the bottom line):
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
print(cursor)
for tablerow in cursor.fetchall():
I am running this in the /tmp/ dir due to an earlier error with sqlite not working outside of temp.
Any questions, please ask.
Thanks!!
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sqlite3
import os
import sys

filename = sys.argv[1]

def validateFile(filename):
    filename, fileExt = os.path.splitext(filename)
    print("[Jconsole] Python: Filename being tested - " + filename)
    if fileExt == '.db':
        databases(filename)
    elif fileExt == '.json':
        jsons(fileExt)
    elif fileExt == '':
        blank()
    else:
        print('Unsupported format')
        print(fileExt)

def validate(number):
    try:
        number = float(number)
        if -90 <= number <= 180:
            return True
        else:
            return False
    except ValueError:
        pass

def databases(filename):
    dbName = sys.argv[2]
    print(dbName)
    idCounter = 0
    mainList = []
    lat = 0
    lon = 0
    with sqlite3.connect(filename) as conn:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
        print(cursor)
        for tablerow in cursor.fetchall():
            print("YAY1")
            table = tablerow[0]
            cursor.execute('SELECT * FROM {t}'.format(t=table))
            for row in cursor:
                print(row)
                print("YAY")
                tempList = []
                for field in row.keys():
                    tempList.append(str(field))
                    tempList.append(str(row[field]))
                for i in tempList:
                    if i in ('latitude', 'Latitude'):
                        index = tempList.index(i)
                        if validate(tempList[index + 1]):
                            idCounter += 1
                            tempList.append(idCounter)
                            (current_item, next_item) = \
                                (tempList[index], tempList[index + 1])
                            lat = next_item
                    if i in ('longitude', 'Longitude'):
                        index = tempList.index(i)
                        if validate(tempList[index + 1]):
                            (current_item, next_item) = \
                                (tempList[index], tempList[index + 1])
                            lon = next_item
                result = '{ "id": ' + str(idCounter) \
                    + ', "content": "' + dbName + '", "title": "' \
                    + str(lat) + '", "className": "' + str(lon) \
                    + '", "type": "box"},'
                mainList.append(result)
    file = open('appData.json', 'a')
    for item in mainList:
        file.write('%s\n' % item)
    file.close()

# {
#     "id": 1,
#     "content": "<a class='thumbnail' href='./img/thumbs/thumb_IMG_20161102_151122.jpg'>IMG_20161102_151122.jpg</><span><img src='./img/thumbs/thumb_IMG_20161102_151122.jpg' border='0' /></span></a>",
#     "title": "50.7700721944444",
#     "className": "-0.8727045",
#     "start": "2016-11-02 15:11:22",
#     "type": "box"
# },

def jsons(filename):
    print('JSON')

def blank():
    print('blank')

validateFile(filename)
Fixed.
The issue was up here:
filename, fileExt = os.path.splitext(filename)
The filename variable was being overwritten with the extension stripped off, so when SQLite went to open the database it couldn't find the file (sqlite3.connect silently creates a new, empty database for a missing path, which is why no error appeared and no tables came back). It is fixed now by changing the filename variable to filename1.
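For reference, a minimal sketch of that fix: the split result goes into a new variable (filename1, as in the answer) so the full path, extension included, is what gets passed on to sqlite3.

def validateFile(filename):
    # split only to inspect the extension; keep the original path intact
    filename1, fileExt = os.path.splitext(filename)
    print("[Jconsole] Python: Filename being tested - " + filename1)
    if fileExt == '.db':
        databases(filename)  # full path, so sqlite3 can actually find the file
    elif fileExt == '.json':
        jsons(fileExt)
    elif fileExt == '':
        blank()
    else:
        print('Unsupported format')
        print(fileExt)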

Import a CSV to a Google Fusion Table with Python

From http://fuzzytolerance.info/blog/2012/01/13/2012-01-14-updating-google-fusion-table-from-a-csv-file-using-python/ I have edited the code to import the necessary modules, however I get the following error: "AttributeError: 'module' object has no attribute 'urlencode'". I run the code, I am prompted to enter my password, I enter my own Google account password, and then the code gives me the error message. Perhaps I need to define a password somewhere?
I wonder if anyone can please troubleshoot my code, advise me on how to avoid this error, or even suggest an easier way to import a CSV into a Google Fusion Table that I own.
Here is my code:
import csv
from decimal import *
import getpass
from fusiontables.authorization.clientlogin import ClientLogin
from fusiontables import ftclient

nameAgeNick = 'C:\\Users\\User\\Desktop\\NameAgeNickname.txt'

# check to see if something is an integer
def isInt(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# check to see if something is a float
def isFloat(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

# open the CSV file
ifile = open(nameAgeNick, "rb")
reader = csv.reader(ifile)
# GFT table ID
tableID = "tableid"
# your username
username = "username"
# prompt for your password - you can hardcode it but this is more secure
password = getpass.getpass("Enter your password:")
# Get token and connect to GFT
token = ClientLogin().authorize(username, password)
ft_client = ftclient.ClientLoginFTClient(token)

# Loop through the CSV data and upload
# Assumptions for my data: if it's a float less than 1, it's a percentage
# Floats are being rounded to 1 significant digit
# Non-numbers are wrapped in a single quote for string-type in the update statement
# The first row is the column names and matches exactly the column names in Fusion Tables
# The first column is the unique ID I'll use to select the record for updating in Fusion Tables
rownum = 0
setList = list()
nid = 0
for row in reader:
    # Save header row.
    if rownum == 0:
        header = row
    else:
        colnum = 0
        setList[:] = []
        for col in row:
            thedata = col
            # This bit rounds numbers and turns numbers < 1 into percentages
            if isFloat(thedata):
                if isInt(thedata) is False:
                    if float(thedata) < 1:
                        thedata = float(thedata) * 100
                    thedata = round(float(thedata), 1)
            else:
                thedata = "'" + thedata + "'"
            # make sql where clause for row
            setList.append(header[colnum] + "=" + str(thedata))
            nid = row[0]
            colnum += 1
        # get rowid and update the record
        rowid = ft_client.query("select ROWID from " + tableID + " where ID = " + nid).split("\n")[1]
        print(rowid)
        print(ft_client.query("update " + tableID + " set " + ",".join(map(str, setList)) + " where rowid = '" + rowid + "'"))
    rownum += 1
ifile.close()
And this is the module where the error occurs:
#!/usr/bin/python
#
# Copyright (C) 2010 Google Inc.

""" ClientLogin.
"""

__author__ = 'kbrisbin@google.com (Kathryn Brisbin)'

import urllib, urllib2

class ClientLogin():
    def authorize(self, username, password):
        auth_uri = 'https://www.google.com/accounts/ClientLogin'
        authreq_data = urllib.urlencode({  # <-- this is the line that raises the error
            'Email': username,
            'Passwd': password,
            'service': 'fusiontables',
            'accountType': 'HOSTED_OR_GOOGLE'})
        auth_req = urllib2.Request(auth_uri, data=authreq_data)
        auth_resp = urllib2.urlopen(auth_req)
        auth_resp_body = auth_resp.read()
        auth_resp_dict = dict(
            x.split('=') for x in auth_resp_body.split('\n') if x)
        return auth_resp_dict['Auth']
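This AttributeError typically appears when the code is run under Python 3, where urlencode lives in urllib.parse and urllib2 was replaced by urllib.request. If that is indeed the interpreter in use here, a hedged sketch of an equivalent authorize() written against the Python 3 standard-library names would be:

from urllib.parse import urlencode
from urllib.request import Request, urlopen

def authorize_py3(username, password):
    # Same ClientLogin request as above, using the Python 3 module names.
    auth_uri = 'https://www.google.com/accounts/ClientLogin'
    authreq_data = urlencode({
        'Email': username,
        'Passwd': password,
        'service': 'fusiontables',
        'accountType': 'HOSTED_OR_GOOGLE'}).encode('utf-8')  # urlopen expects bytes
    auth_req = Request(auth_uri, data=authreq_data)
    auth_resp_body = urlopen(auth_req).read().decode('utf-8')
    auth_resp_dict = dict(
        x.split('=') for x in auth_resp_body.split('\n') if x)
    return auth_resp_dict['Auth']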

Python XML parsing using BeautifulSoup4 and writing the output to a MySQL db - Unicode error

I'm trying to parse an XML file using BeautifulSoup4.
IDE: LiClipse
Python version: 2.7
XML encoding: utf-8
Sample XML file: http://pastebin.com/RhjvyKDN
Below is the code I used to parse the XML files and write the extracted information to a local MySQL database.
from bs4 import BeautifulSoup
import pymysql
import os, os.path

# strips apostrophes from the text and then just adds them at the beginning and end for the query
def apostro(text):
    text = text.replace("'", "")
    text = text.replace(",", "")
    text = "'" + text + "'"
    return text

# sets up the MYSQL connection
conn = pymysql.connect(host='127.0.0.1', user='xxxx', passwd='xxxx', db='mysql', port=3306)
cur = conn.cursor()

# drop all of the previous values from the database
cur.execute("DELETE FROM db WHERE title is not null")
conn.commit()

# loop through all of the files
for root, _, files in os.walk("C:/usc/xml"):
    for f in files:
        # j is a counter for how many sections we have processed
        j = 0
        # fullpath is the location of the file we're parsing
        fullpath = os.path.join(root, f)
        print(fullpath)
        # open file using BeautifulSoup
        soup = BeautifulSoup(open("" + fullpath + ""), 'xml')
        sec = soup.find_all("section", {"style": "-uslm-lc:I80"})
        t = soup.main.title
        t_num = t.num['value']
        # if not clauses are needed in case there is a blank, otherwise an error is thrown
        if not t.heading.text:
            t_head = ''
        else:
            t_head = t.heading.text.encode('ascii', 'ignore').encode("UTF-8")
        for element in sec:
            if not element.num['value']:
                section = ''
            else:
                section = element.num['value'].encode('ascii', 'ignore').encode("UTF-8")
            if not element.heading:
                s_head = ''
            else:
                s_head = element.heading.text.encode('ascii', 'ignore').encode("UTF-8")
            if not element.text:
                s_text = ''
            else:
                s_text = element.text.encode('ascii', 'ignore').encode("UTF-8")
            # inserttest is the sql command that 'cur' executes. counter is printed every time a section is written to let me know the program is still alive
            inserttest = "insert into deadlaws.usc_new (title, t_head, section, s_head, s_text) values (" + t_num + "," + apostro(t_head) + "," + apostro(section) + "," + apostro(s_head) + "," + apostro(s_text) + ")"
            j = j + 1
            cur.execute(inserttest)
            conn.commit()
            print(fullpath + " " + str(j))
conn.commit()
cur.close()
conn.close()
Everything went well until I noticed that the program ignores the hyphens '-' in the section numbers, which makes the entire exercise wrong.
I know I have used 'ignore' in the encode statement, but a hyphen '-' is a legitimate ASCII character, right? Shouldn't it be written to the database instead of being ignored?
I did a lot of reading on SO and elsewhere. I've tried including from_encoding="utf-8" in the soup statement, 'xmlcharrefreplace' in the encode() statement and other methods, which resulted in the output below: it writes â€“ (some special Unicode character) instead of a hyphen '-' to the database.
Sample output:
The data is huge and I'm afraid there could be other characters like '-' that are being ignored by the program. It's OK if it ignores special characters in the t_head, s_head and s_text fields, as they are text, but not in the section column.
Any help in resolving this issue would be greatly appreciated.
Don't encode, the MySQL library is perfectly capable of inserting Unicode text into the database directly. Use SQL parameters, not string interpolation, and specify the character set to use when connecting to the database:
conn = pymysql.connect(host='127.0.0.1', user='xxxx', passwd='xxxx',
                       db='mysql', port=3306,
                       charset='utf8')
Don't encode:
t_head = t.heading.text or ''

for element in sec:
    if not element.num['value']:
        section = ''
    else:
        section = element.num.get('value', '')
    s_head = element.heading.text or ''
    s_text = element.text or ''

    inserttest = ("insert into deadlaws.usc_new (title, t_head, section, s_head, s_text) "
                  "values (%s, %s, %s, %s, %s)")
    cur.execute(inserttest, (t_num, t_head, section, s_head, s_text))
