I am trying to make a program that takes changes in a dbf file and then uploads them. I have got it to read the dbf file and upload everything to a MySQL database, but that full upload takes about 50 minutes. I have tried to get it to upload only the fields that have been changed. The problem is that it seems I need to close and re-open the dbf file, and if someone makes a change while it is doing that, the program doesn't notice there has been a change.
Is there a better/right way of doing this?
import time
import dbf
import MySQLdb
import os
source_path = r"\\path\to\file"
file_name = "\\test.Dbf"
print "Found Source DBF"
source = dbf.Table(source_path + file_name)
source.open()
print "Opened DBF"
updated = list(source)
print "Copied Source"
db = MySQLdb.connect(host = "myHost.com", port=3306, user = "user", passwd = "pass", db = "database")
cur = db.cursor()
print "Connected to database"
try:
    cur.execute("DROP TABLE IF EXISTS dbftomysql")
except:
    db.rollback()
print "Dropped old table"
sql = """CREATE TABLE table(
    col1 VARCHAR(200) NOT NULL,
    col2 VARCHAR(200),
    col3 VARCHAR(200),
    col4 NUMERIC(15,2),
    col5 VARCHAR(200) )"""
cur.execute(sql)
print "Created new table"
for i, s in zip(source, updated):
    query = """INSERT table SET col1 = %s, col2 = %s, col3 = %s, col4 = %s, col5 = %s"""
    values = (i["col1"], i["col2"], i["col3"], i["col4"], i["col5"])
    cur.execute(query, values)
    db.commit()
    print i["col1"], i["col2"], i["col3"], i["col4"], i["col5"]
print "First Upload Completed"
while True:
    for i, s in zip(source, updated):
        if i["col1"] != s["col1"]:
            print i["col1"] + " col1 Updated"
            query = """UPDATE table SET col1 = %s WHERE col1 = %s"""
            values = (i["col1"], s["col1"])
            try:
                cur.execute(query, values)
                db.commit()
            except:
                db.rollback()
                print "No connection to database"
        if i["col2"] != s["col2"]:
            print i["col2"] + " col2 Updated for " + i["col1"]
            query = """UPDATE table SET col2 = %s WHERE col1 = %s OR col1 = %s"""
            values = (i["col2"], i["col1"], s["col1"])
            try:
                cur.execute(query, values)
                db.commit()
            except:
                db.rollback()
                print "No connection to database"
        # etc.
    updated = list(source)
    source.close()
    source.open()
    time.sleep(0.2)
The dbf library will only fetch a record from the dbf file if it doesn't already exist in memory; when you do
updated = list(source)
you are effectively freezing all the rows, because updated is a list of records (not a list of lists nor a list of tuples). This means that when you later compare source and updated you are comparing the same data.
In order to make updated a separate entity from source, try
updated = [tuple(row) for row in source]
which will give you a list of tuples, or
updated = [scatter(row, dict) for row in source]
which will give you a list of dicts, which is what you need for your field comparison code further down.
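For illustration, here is a minimal sketch of how the polling loop might look with a dict snapshot. It assumes scatter is available as dbf.scatter, the column names from the question, and a MySQL table called dbftomysql (the question mixes that name with "table"); it has not been tested against your file:

# take a detached snapshot of the current dbf contents as plain dicts
updated = [dbf.scatter(row, dict) for row in source]

while True:
    for i, s in zip(source, updated):
        # i is the live dbf record, s is the frozen dict snapshot
        if i["col1"] != s["col1"]:
            cur.execute("UPDATE dbftomysql SET col1 = %s WHERE col1 = %s",
                        (i["col1"], s["col1"]))
            db.commit()
        # etc. for the other columns
    # refresh the snapshot, then re-read the dbf on the next pass
    updated = [dbf.scatter(row, dict) for row in source]
    source.close()
    source.open()
    time.sleep(0.2)

Because the snapshot is now plain tuples or dicts, re-opening the table no longer invalidates the comparison data.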
Related
I am trying to find a faster method to insert data into my table. The table should end up with over 100 million rows; I have been running my code for nearly 24 hours, and the table currently has only 9 million rows entered and is still in progress.
My code currently reads 300 csv files at a time and stores the data in a list, which is filtered for duplicate rows; then I use a for loop to place each entry in the list as a tuple and update the table one tuple at a time. This method just takes too long. Is there a way for me to bulk insert all the rows? I have tried looking online, but the methods I am reading about do not seem to help in my situation.
Many thanks,
David
import glob
import os
import csv
import mysql.connector
# MySQL logon
mydb = mysql.connector.connect(
    host="localhost",
    user="David",
    passwd="Sword",
    database="twitch"
)
mycursor = mydb.cursor()
# list for stream data file names
streamData = []
# This function obtains the file name list from a folder; the files are opened
# in other functions
def getFileNames():
    global streamData
    global topGames
    # the folders to be scanned
    #os.chdir("D://UG_Project_Data")
    os.chdir("E://UG_Project_Data")
    # obtains stream data file names
    for file in glob.glob("*streamD*"):
        streamData.append(file)
    return
# List to store stream data from csv files
sData = []
# Function to read all streamData csv files and store data in a list
def streamsToList():
    global streamData
    global sData
    # Same as gamesToList
    index = len(streamData)
    num = 0
    theFile = streamData[0]
    for x in range(index):
        if (num == 301):
            filterStreams(sData)
            num = 0
            sData.clear()
        try:
            theFile = streamData[x]
            timestamp = theFile[0:15]
            dateTime = timestamp[4:8]+"-"+timestamp[2:4]+"-"+timestamp[0:2]+"T"+timestamp[9:11]+":"+timestamp[11:13]+":"+timestamp[13:15]+"Z"
            with open(theFile, encoding="utf-8-sig") as f:
                reader = csv.reader(f)
                next(reader)  # skip header
                for row in reader:
                    if (row != []):
                        col1 = row[0]
                        col2 = row[1]
                        col3 = row[2]
                        col4 = row[3]
                        col5 = row[4]
                        col6 = row[5]
                        col7 = row[6]
                        col8 = row[7]
                        col9 = row[8]
                        col10 = row[9]
                        col11 = row[10]
                        col12 = row[11]
                        col13 = dateTime
                        temp = col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13
                        sData.append(temp)
        except:
            print("Problem file:")
            print(theFile)
            print(num)
        num += 1
    return
def filterStreams(self):
    sData = self
    dataSet = set(tuple(x) for x in sData)
    sData = [list(x) for x in dataSet]
    return createStreamDB(sData)
# Function to create a table of stream data
def createStreamDB(self):
    global mydb
    global mycursor
    sData = self
    tupleList = ()
    for x in sData:
        tupleList = tuple(x)
        sql = "INSERT INTO streams (id, user_id, user_name, game_id, community_ids, type, title, viewer_count, started_at, language, thumbnail_url, tag_ids, time_stamp) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
        val = tupleList
        try:
            mycursor.execute(sql, val)
            mydb.commit()
        except:
            test = 1
    return
if __name__ == '__main__':
    getFileNames()
    streamsToList()
    filterStreams(sData)
If some of your rows succeed but some fail, do you want your database to be left in a corrupt state? If not, try to commit outside the loop, like this:
for x in sData:
    tupleList = tuple(x)
    sql = "INSERT INTO streams (id, user_id, user_name, game_id, community_ids, type, title, viewer_count, started_at, language, thumbnail_url, tag_ids, time_stamp) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    val = tupleList
    try:
        mycursor.execute(sql, val)
    except:
        # do something
        pass
try:
    mydb.commit()
except:
    test = 1
And if you don't, try to load your csv file into MySQL directly:
LOAD DATA INFILE "/home/your_data.csv"
INTO TABLE CSVImport
COLUMNS TERMINATED BY ','
OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 LINES;
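If you would rather issue that statement from Python than from the mysql client, something along these lines should work with mysql.connector, assuming local_infile is enabled on the server and reusing the file path and table name above (a sketch, not a tested implementation):

import mysql.connector

# allow_local_infile lets the connector send a client-side file to the server
mydb = mysql.connector.connect(
    host="localhost",
    user="David",
    passwd="Sword",
    database="twitch",
    allow_local_infile=True
)
mycursor = mydb.cursor()
mycursor.execute("""
    LOAD DATA LOCAL INFILE '/home/your_data.csv'
    INTO TABLE CSVImport
    COLUMNS TERMINATED BY ','
    OPTIONALLY ENCLOSED BY '"'
    LINES TERMINATED BY '\\n'
    IGNORE 1 LINES
""")
mydb.commit()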
Also, to make it clearer, I've outlined three ways to insert this data, in case you insist on using Python because you have some processing to do on the data.
Bad way
In [18]: def inside_loop():
    ...:     start = time.time()
    ...:     for i in range(10000):
    ...:         mycursor = mydb.cursor()
    ...:         sql = "insert into t1(name, age)values(%s, %s)"
    ...:         try:
    ...:             mycursor.execute(sql, ("frank", 27))
    ...:             mydb.commit()
    ...:         except:
    ...:             print("Failure..")
    ...:     print("cost :{}".format(time.time() - start))
    ...:
Time cost:
In [19]: inside_loop()
cost :5.92155909538269
Okay way
In [9]: def outside_loop():
   ...:     start = time.time()
   ...:     for i in range(10000):
   ...:         mycursor = mydb.cursor()
   ...:         sql = "insert into t1(name, age)values(%s, %s)"
   ...:         try:
   ...:             mycursor.execute(sql, ["frank", 27])
   ...:         except:
   ...:             print("do something ..")
   ...:
   ...:     try:
   ...:         mydb.commit()
   ...:     except:
   ...:         print("Failure..")
   ...:     print("cost :{}".format(time.time() - start))
Time cost:
In [10]: outside_loop()
cost :0.9959311485290527
There may still be better ways, or even a best one (e.g. use pandas to process your data, and try redesigning your table ...).
You might like my presentation Load Data Fast! in which I compared different methods of inserting bulk data, and did benchmarks to see which was the fastest method.
Inserting one row at a time, committing a transaction for each row, is about the worst way you can do it.
Using LOAD DATA INFILE is fastest by a wide margin. Although there are some configuration changes you need to make on a default MySQL instance to allow it to work. Read the MySQL documentation about options secure_file_priv and local_infile.
Even without using LOAD DATA INFILE, you can do much better. You can insert multiple rows per INSERT, and you can execute multiple INSERT statements per transaction.
I wouldn't try to INSERT the whole 100 million rows in a single transaction, though. My habit is to commit about once every 10,000 rows.
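As a rough sketch of that middle ground applied to the streams table from the question (multi-row INSERTs via executemany, one commit per batch of roughly 10,000 rows), assuming mydb, mycursor and the de-duplicated list of tuples sData from the question's code:

sql = ("INSERT INTO streams (id, user_id, user_name, game_id, community_ids, "
       "type, title, viewer_count, started_at, language, thumbnail_url, "
       "tag_ids, time_stamp) "
       "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

batch_size = 10000
for start in range(0, len(sData), batch_size):
    batch = sData[start:start + batch_size]
    # mysql.connector rewrites simple INSERTs into a single multi-row statement
    mycursor.executemany(sql, batch)
    mydb.commit()  # one transaction per batch instead of one per row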
I am having lots of trouble trying to figure out how I can update several rows in a SQLite database.
Effectively I am taking a location from a database I gathered and running it through Google Maps to get the latitude and longitude. In general it's working, but the loop fails!
It runs once, gets the first line that meets the criteria and finishes, and I can't figure out why it doesn't keep going!! Can anyone help? The script is below:
# coding=utf-8
import urllib
import sqlite3
import json
conn = sqlite3.connect('ArchDailyProjects.sqlite')
cur = conn.cursor()
#Google Prep
ServiceUrl="https://maps.googleapis.com/maps/api/geocode/json?"
FimDoURL="&key=????????????????????????????????" #I have the key right, this part works fine
#cur.execute('SELECT * FROM Lugares' )
#print type(cur)
#print cur
#row=cur.fetchone()
for row in cur.execute('SELECT * FROM LugareS'):
    print 'Entramos no While'
    Loc_id = str(row[0])
    Loc_Name = str(row[1])
    Loc_Lat = row[2]
    print Loc_Name
    if Loc_Lat is None:
        print Loc_Name
        print Loc_Lat
        print "Buscando " + Loc_Name + " no Google Maps"
        try:
            Url = ServiceUrl + urllib.urlencode({"sensor": "false", "address": Loc_Name}) + FimDoURL
            Uh = urllib.urlopen(Url)
            Dados = Uh.read()
            try: js = json.loads(str(Dados))
            except: js = None
        except: continue
        if "status" not in js or js["status"] != "OK":
            print "===== Beeehhhh!!! Não conseguimos encontrar essa cidade===="
            print Dados
            continue
        else:
            Loc_FormatedAdress = js["results"][0]["formatted_address"]
            Loc_Lat = js["results"][0]["geometry"]["location"]["lat"]
            Loc_Lon = js["results"][0]["geometry"]["location"]["lng"]
            print Dados
            print 'Endereço Google: ', Loc_FormatedAdress
            print 'Latitude: ', Loc_Lat
            print 'Longitude: ', Loc_Lon
            cur.execute('''UPDATE Lugares SET Latitude= ?, Longitude=?, GoogleLoc=? WHERE id= ?
                ''', (Loc_Lat, Loc_Lon, Loc_FormatedAdress, Loc_id))
            #row=cur.fetchone()
    else: #row=cur.fetchone()
        continue
conn.commit()
Thank you guys!
If the file is large, you may not want to load the entire database into memory with "fetchall" but read only one row at a time, and update entries on the go. You can do this by creating two cursors.
import sqlite3 as sq3
conn = sq3.connect(db_name)
cur = conn.cursor()
cur2 = conn.cursor()
for row in cur.execute('SELECT * FROM Table'):
    cur2.execute('''UPDATE Table SET variable = ? WHERE id = ?''', (variable, id))
conn.commit()  # persist the updates once the loop is done
works fine.
for row in cur.execute('SELECT * FROM LugareS' ):
    ...
    cur.execute('''UPDATE Lugares SET Latitude= ?, Longitude=?, GoogleLoc=? WHERE id= ?
You are executing a different query on the same cursor object; the UPDATE does not have any result rows.
Simply read all the data before looping over it:
cur.execute('SELECT id, Name FROM Lugares WHERE Latitude IS NULL')
empty_rows = cur.fetchall()
for row in empty_rows:
    ...
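Applied to the question's loop, a minimal sketch looks like this; geocode() is a hypothetical helper standing in for the urllib/json block from the question, and only the columns that are actually needed are selected:

cur.execute('SELECT id, Name FROM Lugares WHERE Latitude IS NULL')
empty_rows = cur.fetchall()

for Loc_id, Loc_Name in empty_rows:
    # geocode() is a hypothetical wrapper around the Google Maps request above
    Loc_Lat, Loc_Lon, Loc_FormatedAdress = geocode(Loc_Name)
    cur.execute('UPDATE Lugares SET Latitude = ?, Longitude = ?, GoogleLoc = ? WHERE id = ?',
                (Loc_Lat, Loc_Lon, Loc_FormatedAdress, Loc_id))

conn.commit()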
I have a Python script that I created to update a MySQL database. The insert works perfectly, but when I try to update, nothing happens and the data doesn't change.
The console displays this error from the try/except:
Unable to print data
Can anyone help me fix this error?
MySQL database
Database: student
Table structure for table stu:
    Column   Type          Null  Default
    ID       int(8)        No
    Name     varchar(255)  No
    subject  varchar(255)  No
Dumping data for table stu:
    11  jhon   python
    12  jina   hjsdhjsd
    13  jaSDJ  JHAISDJ
Python script
#!/usr/bin/python
# UPDATE AND delete some values from the database ###
import MySQLdb
db = MySQLdb.Connect("localhost", "****", "******", "student")
cursor = db.cursor()
sql = "UPDATE STU SET NAME = MAROUN, SUBJECT = C++ WHERE ID = 13 "
try:
    cursor.execute(sql)
    # r = cursor.fetchall()
    # for row in r:
    #     ID = row[0]
    #     NAME = row[1]
    #     SUBJECT = row[2]
    #     print "ID = %d, LAST_NAME = %s, SUBJECT = %s " % (ID, NAME, SUBJECT)
    print "update ok "
except Exception as e:
    print e
db.close()
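For what it's worth, the UPDATE string itself is the likely culprit: MAROUN and C++ are unquoted, so MySQL parses them as column names, and there is no commit, so nothing persists even when a statement succeeds. A minimal sketch of the usual correction, passing the values as query parameters against the same table (an assumption, not a posted answer):

sql = "UPDATE STU SET NAME = %s, SUBJECT = %s WHERE ID = %s"
try:
    cursor.execute(sql, ("MAROUN", "C++", 13))
    db.commit()  # without a commit the change is discarded when the connection closes
    print "update ok"
except Exception as e:
    db.rollback()
    print e
db.close()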
I want to export specific columns from one database to another using Python, but it's not working:
# Display all Non-Duplicate data
import sqlite3
import csv
conn = sqlite3.connect('data.db')
# STEP 2 : create a small data file with only three fields account_id, product_id and unit_quantity
cursor = conn.execute("SELECT field1,field12,field14 FROM database")
for row in cursor:
    print row[0:11]
print "Operation done successfully";
conn.close()
Create a second connection and insert directly:
conn = sqlite3.connect('data.db')
cursor = conn.execute("SELECT field1,field12,field14 FROM database")
export = sqlite3.connect('exported.db')
# get the result as a list and insert row by row, letting sqlite3 quote the values
for values in cursor.fetchall():
    export.execute('INSERT INTO tablename(field1,field12,field14) VALUES (?, ?, ?)', values)
export.commit()
export.close()
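A slightly shorter variant of the same idea, assuming the target table in exported.db already exists, lets executemany do the looping:

conn = sqlite3.connect('data.db')
rows = conn.execute("SELECT field1,field12,field14 FROM database").fetchall()

export = sqlite3.connect('exported.db')
# one parameterized statement covers all rows
export.executemany('INSERT INTO tablename(field1,field12,field14) VALUES (?, ?, ?)', rows)
export.commit()
export.close()
conn.close()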
I HAVE ADDED MY OWN ANSWER BELOW THAT WORKS, BUT I AM OPEN TO IMPROVEMENTS
After seeing a project at DataNitro, I took on getting a connection to MySQL (they use SQLite) and was able to import a small test table into Excel from MySQL.
Inserting new or updated data from the Excel sheet was the next task, and so far I can get one row to work like so...
import MySQLdb
db = MySQLdb.connect("xxx","xxx","xxx","xxx")
c = db.cursor()
c.execute("""INSERT INTO users (id, username, password, userid, fname, lname)
VALUES (%s, %s, %s, %s, %s, %s);""",
(Cell(5,1).value,Cell(5,2).value,Cell(5,3).value,Cell(5,4).value,Cell(5,5).value,Cell(5,6).value,))
db.commit()
db.close()
...but attempts at multiple rows fail. I suspect an issue while traversing the rows in Excel. Here is what I have so far...
import MySQLdb
db = MySQLdb.connect(host="xxx.com", user="xxx", passwd="xxx", db="xxx")
c = db.cursor()
c.execute("select * from users")
usersss = c.fetchall()
updates = []
row = 2  # starting row
while True:
    data = tuple(CellRange((row,1),(row,6)).value)
    if data[0]:
        if data not in usersss:  # new record
            updates.append(data)
        row += 1
    else:  # end of table
        break
c.executemany("""INSERT INTO users (id, username, password, userid, fname, lname) VALUES (%s, %s, %s, %s, %s, %s)""", updates)
db.commit()
db.close()
...as of now, I don't get any errors, but my new line is not added (id 3). This is what my table looks like in Excel...
The database holds the same structure, minus id 3. There has to be a simpler way to traverse the rows and pull the unique content for INSERT, but after 6 hours trying different things (and 2 new Python books) I am going to ask for help.
If I run either...
print '[%s]' % ', '.join(map(str, updates))
or
print updates
my result is
[]
So this is likely not passing any data to MySQL in the first place.
LATEST UPDATE AND WORKING SCRIPT
Not exactly what I want, but this has worked for me...
c = db.cursor()
row = 2
while Cell(row,1).value != None:
    c.execute("""INSERT IGNORE INTO users (id, username, password, userid, fname, lname)
                 VALUES (%s, %s, %s, %s, %s, %s);""",
              (CellRange((row,1),(row,6)).value))
    row = row + 1
Here is your problem:
while True:
    if data[0]:
        ...
    else:
        break
Your first id is 0, so in the first iteration of the loop data[0] is falsy and your loop exits without ever adding any data. What you probably meant is:
while True:
    if data[0] is not None:
        ...
    else:
        break
I ended up finding a solution that gives me an INSERT for new rows and allows for an UPDATE of those that have changed. It is not exactly a Python selection based on a single query, but it will do.
import MySQLdb
db = MySQLdb.connect("xxx","xxx","xxx","xxx")
c = db.cursor()
row = 2
while Cell(row,1).value is not None:
    c.execute("INSERT INTO users (id, username, password, \
               userid, fname, lname) \
               VALUES (%s, %s, %s, %s, %s, %s) \
               ON DUPLICATE KEY UPDATE \
               id=VALUES(id), username=VALUES(username), password=VALUES(password), \
               userid=VALUES(userid), fname=VALUES(fname), lname=VALUES(lname);",
              (CellRange((row,1),(row,6)).value))
    row = row + 1
db.commit()
db.close()
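One design note: ON DUPLICATE KEY UPDATE only fires if the users table has a PRIMARY KEY or UNIQUE index (presumably on id). If that holds, the row scan and the upserts could also be combined with executemany, roughly like this (a sketch reusing the Cell/CellRange calls from above, not tested against DataNitro):

rows = []
r = 2
while Cell(r, 1).value is not None:
    rows.append(tuple(CellRange((r, 1), (r, 6)).value))
    r += 1

# one round trip per batch instead of one execute per spreadsheet row
c.executemany("INSERT INTO users (id, username, password, userid, fname, lname) "
              "VALUES (%s, %s, %s, %s, %s, %s) "
              "ON DUPLICATE KEY UPDATE username=VALUES(username), password=VALUES(password), "
              "userid=VALUES(userid), fname=VALUES(fname), lname=VALUES(lname)",
              rows)
db.commit()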