Python disk writing speed different on two computers

I am trying to run a Python script that takes data from an input .csv file and prints it to an output .csv file, using an ID and column headers to cross-reference.
They are very large files, so I've added multiprocessing, and the first step is to split the output file into parts so they can be processed in parallel.
I tried executing this on Computer A and the disk speed while splitting the output file is 0 MB/s (or a very slow trickle), and the program never finishes. Meanwhile, on Computer B the program runs correctly and splits the output file into parts at a rate of ~40 MB/s.
These are the exact same files and the exact same code, so why is Computer A not responding properly? Is it an operating system error? Am I missing a driver on Computer A? Something else?
The input and output files are located on the WD 4TB external HDD.
Computer A specs: Windows 10 Pro 64-bit, Intel i9 7920X 12-core, 8 x 8GB Geil Evo Potenza RAM, Samsung 850 Evo 500GB local SSD, WD 4TB external HDD, ASRock X299 Killer motherboard
Computer B specs: Windows 10 Pro 64-bit, Intel i7 6700K 4-core, 2 x 16GB Geil Evo Forza RAM, PNY CS1311 240GB SSD, WD 4TB external HDD, MSI B250M Gaming Pro motherboard
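One quick way to take Python out of the equation is to benchmark raw write throughput to the external drive on both machines. A minimal sketch (the E:\ path is an assumption that matches the script below; adjust to wherever the WD drive is mounted):

import os, time

test_path = "E:\\write_test.bin"
block = b"\0" * (8 * 1024 * 1024)      # 8 MB per write
total = 512 * 1024 * 1024              # write 512 MB in total

start = time.time()
with open(test_path, "wb") as f:
    written = 0
    while written < total:
        f.write(block)
        written += len(block)
    f.flush()
    os.fsync(f.fileno())               # force the data out of the OS cache
elapsed = time.time() - start
print("%.1f MB/s" % (total / elapsed / 1024 / 1024))
os.remove(test_path)

If this benchmark is also near 0 MB/s on Computer A, the problem is with the drive, cable, port, or driver rather than the script.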
# std lib imports
import csv
import multiprocessing
import os
import os.path
import shutil
import sqlite3
import sys
import tempfile
import timeit

# third party imports

# our imports

buffer_size = 8192000  # 8.192 MB, 10x the default (io.DEFAULT_BUFFER_SIZE)
# working_dir = tempfile.gettempdir()
working_dir = "E:\\temp_files"
def return_csv_header(filename):
    """
    Returns the first column of the csv file filename
    as a list.
    """
    with open(filename, "r", newline="", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile)
        return next(reader)


def return_input_mapping(input_file, identifier):
    """
    Returns a {identifier:record} dictionary where
    identifier is the value of the identifier column
    for each row in the input file.
    record is a dictionary of {column:value}
    representing a row in a csv file.
    """
    to_ret = dict()
    with open(input_file, "r", newline="", encoding="utf-8") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            key = row[identifier]
            to_ret[key] = row
    return to_ret


def return_csv_contents(input_file):
    """
    Returns a list of lists representing the rows
    in a csv file.
    """
    to_ret = list()
    with open(input_file, "r", newline="", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            to_ret.append(row)
    return to_ret
def create_db_and_table(csv_contents, identifier):
    """
    Creates a sqlite3 database and table.
    Creates the identifier table column along with more
    table columns named from col0 to colN. We do this because
    the csv column headers can be all sorts of weird stuff. And
    we only really care about the order of the columns, and the
    identifier so we can set it as the primary key.
    No rows are added.
    Returns the database path and table name as a tuple.
    """
    # assert that the identifier is in the csv_contents header
    header = csv_contents[0]
    assert(identifier in header)

    db_path = os.path.join(working_dir, "csv_input.sqlite")
    tablename = "data"

    # delete the database if it already exists
    if os.path.exists(db_path):
        os.remove(db_path)

    # create the database, table, and columns
    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()

        # prepare the table's columns - it will look like this
        # (col1 text, col2 text, col3 text primary key, ...)
        name = "col"
        counter = 0
        column_names = "("
        for column in header:
            if column == identifier:
                column_names += "%s text primary key," % identifier
            else:
                column_names += "%s text," % (name + str(counter))
            counter += 1

        # remove the last comma and space
        if column_names.endswith(","):
            column_names = column_names[0:-1]
        column_names += ")"

        # prepare the sql statement
        sql = "CREATE TABLE %s %s" % (tablename, column_names)

        # some performance tweaks for the database
        cursor.execute("PRAGMA synchronous=OFF")
        cursor.execute("PRAGMA cache_size=20000")   # 20,000*1024 = 20.48MB
        cursor.execute("PRAGMA journal_mode=off")   # dont keep journal of operations
        cursor.execute("PRAGMA temp_store=memory")  # store temp files in memory

        # execute the statement
        cursor.execute(sql)
        conn.commit()

    return db_path, tablename
def delete_db(db_path):
    """
    Deletes the sqlite3 database file at the given db_path.
    """
    assert(os.path.exists(db_path) is True)
    os.remove(db_path)
    assert(os.path.exists(db_path) is False)


def load_db_content(db_path, table, contents):
    """
    Loads the database table with the given contents.
    Skips the first element in contents as that is the
    header aka the database column names.
    """
    header = contents[0]
    num_of_cols = len(header)
    assert(num_of_cols != 0)

    contents = contents[1:]  # remove the header from the contents

    # connect to the database
    with sqlite3.connect(db_path) as conn:
        # only commit once versus after every statement
        cursor = conn.cursor()
        cursor.execute("BEGIN IMMEDIATE")

        # insert into the database in chunks if needed
        limit = 999
        remaining = num_of_cols
        beginning = 0
        while remaining > limit:  # sqlite column limit
            # prepare the sql statement
            # this makes the string (?, ?, ?, ...)
            columns = "(" + "?," * limit
            columns = columns[:-1]  # remove last comma
            columns += ")"

            # prepare the columns to insert
            to_insert = []
            for i in range(len(contents)):
                to_insert.append(contents[i][beginning:beginning+limit])

            sql = "INSERT INTO %s VALUES %s" % (table, columns)
            cursor.executemany(sql, to_insert)
            remaining -= 999
            beginning += 999

        columns = "(" + "?," * remaining
        columns = columns[:-1]  # remove last comma
        columns += ")"

        to_insert = []
        for i in range(len(contents)):
            to_insert.append(contents[i][beginning:beginning+remaining])

        sql = "INSERT INTO %s VALUES %s" % (table, columns)
        cursor.executemany(sql, to_insert)

        # commit the changes
        conn.commit()
def get_na_dict(columns, identifier):
    """
    Returns a dict with the given columns as keys, and
    "n/a" as the values.
    Skip over the identifier because we want to keep
    that piece of data as it is.
    """
    to_ret = dict()
    for column in columns:
        if column == identifier:
            continue
        else:
            to_ret[column] = "n/a"
    return to_ret


def run_vlookup(input_file, output_file, identifier, db_path, table):
    """
    Completes the output file with data from the input file
    that matches the record identifier and the header columns.
    See the description at the top of this file for an example.
    Returns the path of the new output file.
    """
    # header of input file
    input_header = return_csv_header(input_file)

    # fill in the output file with data from the input file
    output_file_name = os.path.basename(output_file)
    temp_file = os.path.join(working_dir, output_file_name + ".tmp")
    with open(output_file, "r", newline="", buffering=buffer_size, encoding="utf-8") as inputcsv:
        with open(temp_file, "w", newline="", buffering=buffer_size, encoding="utf-8") as tempcsv:
            reader = csv.DictReader(inputcsv)
            # set restval to "" which will write empty values for columns
            # in the output file that are not in the input file
            # set extrasaction to "ignore" which will skip over columns
            # from the input file that are not in the output file
            writer = csv.DictWriter(tempcsv,
                                    fieldnames=reader.fieldnames,
                                    restval="",
                                    extrasaction="ignore")
            writer.writeheader()

            # open database connection
            with sqlite3.connect(db_path) as conn:
                cursor = conn.cursor()
                for row in reader:
                    key = row[identifier]  # get the value for the lookup

                    # fetch the key from the database
                    sql = "SELECT * FROM %s where %s = '%s'" % (table, identifier, key)
                    cursor.execute(sql)
                    result = cursor.fetchone()

                    # update the output file with "n/a" if the item
                    # does not exist in the database
                    if result is None:
                        lookup_values = get_na_dict(input_header, identifier)
                    # otherwise update it with the values from the database
                    else:
                        lookup_values = dict(zip(input_header, result))

                    row.update(lookup_values)  # merge the two dictionaries
                    writer.writerow(row)

    return temp_file
def split_file(output_file, cpu_count):
    """
    Splits the output_file into cpu_count number of
    temporary files.
    Returns the names of the temporary files as a list.
    """
    # generate the temp file names
    output_file_name = os.path.basename(output_file)
    tempdir = working_dir
    temp_files = []
    for i in range(cpu_count):
        temp_file = os.path.join(tempdir, "%s_%s" % (output_file_name, i))
        temp_files.append(temp_file)

    # create the files
    with open(output_file, "rb", buffering=0) as outfile:
        # calculate size of the file
        size = outfile.seek(0, 2)
        outfile.seek(0, 0)

        # read the header in (at the same time moving the file pointer forward)
        header = return_header_bytes(outfile)

        # calculate the size of the smaller files (excluding the header)
        chunk_size = (size - len(header)) / cpu_count
        for file in temp_files:
            create_temp_file(file, chunk_size, outfile, header)

    return temp_files


def create_temp_file(file, chunk_size, outfile, header):
    """
    Creates file with the given header plus chunk_size data
    from the given outfile.
    Header is a byte string.
    If chunk_size is bigger than 100MB we read/write it in
    chunks of 100MB.
    After reading chunk_size amount of data, if the last byte
    does not end with a newline, we keep reading until
    it does. That way we dont write a file with truncated data.
    If we reach the end of outfile then we stop reading and we
    finish our last write.
    """
    # start the file with the header
    data = header
    memory_safe_chunk = 100000000  # bytes, so 100MB

    # read in chunk_size data from outfile
    if isinstance(chunk_size, float):
        chunk_size = int(chunk_size)

    # write the data to the temp file
    with open(file, "wb", buffering=0) as fp:
        # safe to read/write chunk in one go
        if chunk_size < memory_safe_chunk:
            data += outfile.read(chunk_size)
        # need to read/write chunk in chunks, go figure
        else:
            remaining = chunk_size
            while remaining > memory_safe_chunk:
                data += outfile.read(memory_safe_chunk)
                fp.write(data)
                data = b""
                remaining -= memory_safe_chunk
            data += outfile.read(remaining)

        # keep reading 1 byte until we reach a newline
        # or the end of the file
        while not data.endswith(b"\n"):
            char = outfile.read(1)
            # reached EOF
            if char == b"":
                break
            else:
                data += char
        fp.write(data)
    del data  # free up memory


def return_header_bytes(file_pointer):
    """
    Returns a string starting from the file_pointer until
    the first newline character.
    """
    to_ret = file_pointer.read(1)
    while not to_ret.endswith(b"\n"):
        to_ret += file_pointer.read(1)
    return to_ret
def merge_files(files):
    """
    Returns a file that has the contents of files merged
    together in one.
    Keeps only the header from the first file, and discards
    the rest as they are duplicates.
    """
    chunk_size = 100000000  # bytes, so 100MB
    master_file = os.path.join(working_dir, "temp.csv")
    with open(files[0], "rb") as fpointer:
        header = return_header_bytes(fpointer)

    # open master file for writing
    with open(master_file, "wb", buffering=0) as master_fp:
        master_fp.write(header)

        # loop through each file copying over the contents minus
        # the header
        for file in files:
            # read the temp file in chunks
            # and write it to the master file
            with open(file, "rb", buffering=0) as temp_fp:
                temp_fp.seek(len(header))
                data = temp_fp.read(chunk_size)
                while data != b"":
                    master_fp.write(data)
                    data = temp_fp.read(chunk_size)
                del data  # free up memory
    return master_file
def launch_processes(input_file, output_file, identifier):
    """
    Splits the output file into N temporary files.
    Launches a process to run the vlookup on each temp file.
    Merges the temp files back into one.
    Moves the final temp file to the output_file location.
    Deletes the N temp files.
    """
    # create temporary files equal to the amount of cpu cores
    cpu_count = multiprocessing.cpu_count()
    files = split_file(output_file, cpu_count)
    temp_files = []

    # load the input file into memory; this is a memory-hungry operation,
    # see note at top of file
    contents = return_csv_contents(input_file)

    # create sqlite3 database to store input mapping
    db_path, table = create_db_and_table(contents, identifier)
    load_db_content(db_path, table, contents)
    del contents  # free up memory

    # run vlookup with N processes equal to CPU count
    with multiprocessing.Pool(processes=cpu_count) as pool:
        results = []
        # launch asynchronous processing of each file
        for file in files:
            res = pool.apply_async(run_vlookup, (input_file, file, identifier, db_path, table))
            results.append(res)

        # wait for the processes to finish
        for result in results:
            res = result.get()
            temp_files.append(res)

        # collect the processes
        pool.close()
        pool.join()

    # delete input mapping db
    delete_db(db_path)

    # delete the small files
    for i in range(len(files)):
        os.remove(files[i])

    # merge temp files
    temp_file = merge_files(temp_files)

    # delete temp files
    for i in range(len(temp_files)):
        os.remove(temp_files[i])

    # replace original output file with merged temp file
    shutil.move(temp_file, output_file)


if __name__ == "__main__":
    print(timeit.default_timer())
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    identifier = sys.argv[3]
    launch_processes(input_file, output_file, identifier)
    print(timeit.default_timer())

Related

Is there Python code to write directly into a SQLite command line? [duplicate]

I have a CSV file and I want to bulk-import this file into my sqlite3 database using Python. The command is ".import .....", but it seems that it cannot work like this. Can anyone give me an example of how to do it in sqlite3? I am using Windows, just in case.
Thanks
import csv, sqlite3

con = sqlite3.connect(":memory:")  # change to 'sqlite:///your_filename.db'
cur = con.cursor()
cur.execute("CREATE TABLE t (col1, col2);")  # use your column names here

with open('data.csv','r') as fin:  # `with` statement available in 2.5+
    # csv.DictReader uses first line in file for column headings by default
    dr = csv.DictReader(fin)  # comma is default delimiter
    to_db = [(i['col1'], i['col2']) for i in dr]

cur.executemany("INSERT INTO t (col1, col2) VALUES (?, ?);", to_db)
con.commit()
con.close()
Creating an sqlite connection to a file on disk is left as an exercise for the reader ... but there is now a two-liner made possible by the pandas library
df = pandas.read_csv(csvfile)
df.to_sql(table_name, conn, if_exists='append', index=False)
You're right that .import is the way to go, but that's a command from the SQLite3 command line program. A lot of the top answers to this question involve native python loops, but if your files are large (mine are 10^6 to 10^7 records), you want to avoid reading everything into pandas or using a native python list comprehension/loop (though I did not time them for comparison).
For large files, I believe the best option is to use subprocess.run() to execute sqlite's import command. In the example below, I assume the table already exists, but the csv file has headers in the first row. See .import docs for more info.
import subprocess
from pathlib import Path

db_name = Path('my.db').resolve()
csv_file = Path('file.csv').resolve()
result = subprocess.run(['sqlite3',
                         str(db_name),
                         '-cmd',
                         '.mode csv',
                         '.import --skip 1 ' + str(csv_file).replace('\\','\\\\')
                             + ' <table_name>'],
                        capture_output=True)
Edit note: sqlite3's .import command has improved so that it can treat the first row as header names or even skip the first x rows (this requires version >= 3.32, as noted in this answer). If you have an older version of sqlite3, you may need to first create the table, then strip off the first row of the csv before importing. The --skip 1 argument will give an error prior to 3.32.
Explanation
From the command line, the command you're looking for is sqlite3 my.db -cmd ".mode csv" ".import file.csv table". subprocess.run() runs a command-line process. The argument to subprocess.run() is a sequence of strings which are interpreted as a command followed by all of its arguments.
sqlite3 my.db opens the database
the -cmd flag after the database allows you to pass multiple follow-on commands to the sqlite program. In the shell, each command has to be in quotes, but here they just need to be their own element of the sequence
'.mode csv' does what you'd expect
'.import --skip 1 ' + str(csv_file).replace('\\','\\\\') + ' <table_name>' is the import command.
Unfortunately, since subprocess passes all follow-ons to -cmd as quoted strings, you need to double up your backslashes if you have a Windows directory path.
Stripping Headers
Not really the main point of the question, but here's what I used. Again, I didn't want to read the whole file into memory at any point:
with open(csv, "r") as source:
source.readline()
with open(str(csv)+"_nohead", "w") as target:
shutil.copyfileobj(source, target)
My 2 cents (more generic):
import csv, sqlite3
import logging

def _get_col_datatypes(fin):
    dr = csv.DictReader(fin)  # comma is default delimiter
    fieldTypes = {}
    for entry in dr:
        feildslLeft = [f for f in dr.fieldnames if f not in fieldTypes.keys()]
        if not feildslLeft: break  # We're done
        for field in feildslLeft:
            data = entry[field]

            # Need data to decide
            if len(data) == 0:
                continue

            if data.isdigit():
                fieldTypes[field] = "INTEGER"
            else:
                fieldTypes[field] = "TEXT"
        # TODO: Currently there's no support for DATE in sqllite

    if len(feildslLeft) > 0:
        raise Exception("Failed to find all the columns data types - Maybe some are empty?")

    return fieldTypes


def escapingGenerator(f):
    for line in f:
        yield line.encode("ascii", "xmlcharrefreplace").decode("ascii")


def csvToDb(csvFile, outputToFile=False):
    # TODO: implement output to file

    with open(csvFile, mode='r', encoding="ISO-8859-1") as fin:
        dt = _get_col_datatypes(fin)

        fin.seek(0)

        reader = csv.DictReader(fin)

        # Keep the order of the columns name just as in the CSV
        fields = reader.fieldnames
        cols = []

        # Set field and type
        for f in fields:
            cols.append("%s %s" % (f, dt[f]))

        # Generate create table statement:
        stmt = "CREATE TABLE ads (%s)" % ",".join(cols)

        con = sqlite3.connect(":memory:")
        cur = con.cursor()
        cur.execute(stmt)

        fin.seek(0)

        reader = csv.reader(escapingGenerator(fin))

        # Generate insert statement:
        stmt = "INSERT INTO ads VALUES(%s);" % ','.join('?' * len(cols))

        cur.executemany(stmt, reader)
        con.commit()

    return con
The .import command is a feature of the sqlite3 command-line tool. To do it in Python, you should simply load the data using whatever facilities Python has, such as the csv module, and insert the data as per usual.
This way, you also have control over what types are inserted, rather than relying on sqlite3's seemingly undocumented behaviour.
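For example, a minimal sketch of that approach (the file, table, and column names here are placeholders, not from the question):

import csv
import sqlite3

con = sqlite3.connect("my.db")
cur = con.cursor()
cur.execute("CREATE TABLE IF NOT EXISTS t (col1 TEXT, col2 INTEGER)")  # choose the types explicitly

with open("data.csv", newline="") as f:
    reader = csv.DictReader(f)  # first row is treated as the header
    rows = ((r["col1"], int(r["col2"])) for r in reader)  # convert types yourself
    cur.executemany("INSERT INTO t (col1, col2) VALUES (?, ?)", rows)

con.commit()
con.close()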
Many thanks for bernie's answer! Had to tweak it a bit - here's what worked for me:
import csv, sqlite3
conn = sqlite3.connect("pcfc.sl3")
curs = conn.cursor()
curs.execute("CREATE TABLE PCFC (id INTEGER PRIMARY KEY, type INTEGER, term TEXT, definition TEXT);")
reader = csv.reader(open('PC.txt', 'r'), delimiter='|')
for row in reader:
    to_db = [unicode(row[0], "utf8"), unicode(row[1], "utf8"), unicode(row[2], "utf8")]
    curs.execute("INSERT INTO PCFC (type, term, definition) VALUES (?, ?, ?);", to_db)
conn.commit()
My text file (PC.txt) looks like this:
1 | Term 1 | Definition 1
2 | Term 2 | Definition 2
3 | Term 3 | Definition 3
#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys, csv, sqlite3

def main():
    con = sqlite3.connect(sys.argv[1])  # database file input
    cur = con.cursor()
    cur.executescript("""
        DROP TABLE IF EXISTS t;
        CREATE TABLE t (COL1 TEXT, COL2 TEXT);
        """)  # checks to see if table exists and makes a fresh table.

    with open(sys.argv[2], "rb") as f:  # CSV file input
        reader = csv.reader(f, delimiter=',')  # no header information with delimiter
        for row in reader:
            to_db = [unicode(row[0], "utf8"), unicode(row[1], "utf8")]  # Appends data from CSV file representing and handling of text
            cur.execute("INSERT INTO t (COL1, COL2) VALUES(?, ?);", to_db)  # insert into the table created above

    con.commit()
    con.close()  # closes connection to database

if __name__ == '__main__':
    main()
"""
cd Final_Codes
python csv_to_db.py
CSV to SQL DB
"""
import csv
import sqlite3
import os
import fnmatch
UP_FOLDER = os.path.dirname(os.getcwd())
DATABASE_FOLDER = os.path.join(UP_FOLDER, "Databases")
DBNAME = "allCompanies_database.db"
def getBaseNameNoExt(givenPath):
"""Returns the basename of the file without the extension"""
filename = os.path.splitext(os.path.basename(givenPath))[0]
return filename
def find(pattern, path):
"""Utility to find files wrt a regex search"""
result = []
for root, dirs, files in os.walk(path):
for name in files:
if fnmatch.fnmatch(name, pattern):
result.append(os.path.join(root, name))
return result
if __name__ == "__main__":
Database_Path = os.path.join(DATABASE_FOLDER, DBNAME)
# change to 'sqlite:///your_filename.db'
csv_files = find('*.csv', DATABASE_FOLDER)
con = sqlite3.connect(Database_Path)
cur = con.cursor()
for each in csv_files:
with open(each, 'r') as fin: # `with` statement available in 2.5+
# csv.DictReader uses first line in file for column headings by default
dr = csv.DictReader(fin) # comma is default delimiter
TABLE_NAME = getBaseNameNoExt(each)
Cols = dr.fieldnames
numCols = len(Cols)
"""
for i in dr:
print(i.values())
"""
to_db = [tuple(i.values()) for i in dr]
print(TABLE_NAME)
# use your column names here
ColString = ','.join(Cols)
QuestionMarks = ["?"] * numCols
ToAdd = ','.join(QuestionMarks)
cur.execute(f"CREATE TABLE {TABLE_NAME} ({ColString});")
cur.executemany(
f"INSERT INTO {TABLE_NAME} ({ColString}) VALUES ({ToAdd});", to_db)
con.commit()
con.close()
print("Execution Complete!")
This should come in handy when you have a lot of csv files in a folder which you wish to convert to a single .db file in one go!
Notice that you don't have to know the filenames, table names, or field names (column names) beforehand!
If the CSV file must be imported as part of a python program, then for simplicity and efficiency, you could use os.system along the lines suggested by the following:
import os
cmd = """sqlite3 database.db <<< ".import input.csv mytable" """
rc = os.system(cmd)
print(rc)
The point is that by specifying the filename of the database, the data will automatically be saved, assuming there are no errors reading it.
Here are solutions that'll work if your CSV file is really big. Use to_sql as suggested by another answer, but set chunksize so it doesn't try to process the whole file at once.
import sqlite3
import pandas as pd
conn = sqlite3.connect('my_data.db')
c = conn.cursor()
users = pd.read_csv('users.csv')
users.to_sql('users', conn, if_exists='append', index = False, chunksize = 10000)
You can also use Dask, as described here to write a lot of Pandas DataFrames in parallel:
# ddf is a Dask DataFrame and db_url is a SQLAlchemy-style database URL
dto_sql = dask.delayed(pd.DataFrame.to_sql)
out = [dto_sql(d, 'table_name', db_url, if_exists='append', index=True)
       for d in ddf.to_delayed()]
dask.compute(*out)
See here for more details.
Based on Guy L's solution (love it), but this version can also handle escaped fields.
import csv, sqlite3

def _get_col_datatypes(fin):
    dr = csv.DictReader(fin)  # comma is default delimiter
    fieldTypes = {}
    for entry in dr:
        feildslLeft = [f for f in dr.fieldnames if f not in fieldTypes.keys()]
        if not feildslLeft: break  # We're done
        for field in feildslLeft:
            data = entry[field]

            # Need data to decide
            if len(data) == 0:
                continue

            if data.isdigit():
                fieldTypes[field] = "INTEGER"
            else:
                fieldTypes[field] = "TEXT"
        # TODO: Currently there's no support for DATE in sqllite

    if len(feildslLeft) > 0:
        raise Exception("Failed to find all the columns data types - Maybe some are empty?")

    return fieldTypes


def escapingGenerator(f):
    for line in f:
        yield line.encode("ascii", "xmlcharrefreplace").decode("ascii")


def csvToDb(csvFile, dbFile, tablename, outputToFile=False):
    # TODO: implement output to file

    with open(csvFile, mode='r', encoding="ISO-8859-1") as fin:
        dt = _get_col_datatypes(fin)

        fin.seek(0)

        reader = csv.DictReader(fin)

        # Keep the order of the columns name just as in the CSV
        fields = reader.fieldnames
        cols = []

        # Set field and type
        for f in fields:
            cols.append("\"%s\" %s" % (f, dt[f]))

        # Generate create table statement:
        stmt = "create table if not exists \"" + tablename + "\" (%s)" % ",".join(cols)
        print(stmt)

        con = sqlite3.connect(dbFile)
        cur = con.cursor()
        cur.execute(stmt)

        fin.seek(0)

        reader = csv.reader(escapingGenerator(fin))

        # Generate insert statement:
        stmt = "INSERT INTO \"" + tablename + "\" VALUES(%s);" % ','.join('?' * len(cols))

        cur.executemany(stmt, reader)
        con.commit()
        con.close()
You can do this using blaze & odo efficiently
import blaze as bz
csv_path = 'data.csv'
bz.odo(csv_path, 'sqlite:///data.db::data')
Odo will store the csv file to data.db (a sqlite database) under the schema data.
Or you can use odo directly, without blaze; either way is fine. Read this documentation.
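A direct-odo sketch, assuming the same file and target as above and the odo package's top-level odo() function:

from odo import odo

odo('data.csv', 'sqlite:///data.db::data')  # csv -> table "data" inside data.db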
The following can also add the field names based on the CSV header:
import sqlite3

def csv_sql(file_dir, table_name, database_name):
    con = sqlite3.connect(database_name)
    cur = con.cursor()
    # Drop the current table by:
    # cur.execute("DROP TABLE IF EXISTS %s;" % table_name)

    with open(file_dir, 'r') as fl:
        hd = fl.readline()[:-1].split(',')
        ro = fl.readlines()
        db = [tuple(ro[i][:-1].split(',')) for i in range(len(ro))]

    header = ','.join(hd)
    cur.execute("CREATE TABLE IF NOT EXISTS %s (%s);" % (table_name, header))
    cur.executemany("INSERT INTO %s (%s) VALUES (%s);" % (table_name, header, ('?,'*len(hd))[:-1]), db)
    con.commit()
    con.close()

# Example:
csv_sql('./surveys.csv', 'survey', 'eco.db')
In the interest of simplicity, you could use the sqlite3 command-line tool from the Makefile of your project.
%.sql3: %.csv
	rm -f $@
	sqlite3 $@ -echo -cmd ".mode csv" ".import $< $*"
%.dump: %.sql3
	sqlite3 $< "select * from $*"
make test.sql3 then creates the sqlite database from an existing test.csv file, with a single table "test". you can then make test.dump to verify the contents.
With this you can do joins on CSVs as well:
import sqlite3
import os
import pandas as pd
from typing import List

class CSVDriver:
    def __init__(self, table_dir_path: str):
        self.table_dir_path = table_dir_path  # where tables (ie. csv files) are located
        self._con = None

    @property
    def con(self) -> sqlite3.Connection:
        """Make a singleton connection to an in-memory SQLite database"""
        if not self._con:
            self._con = sqlite3.connect(":memory:")
        return self._con

    def _exists(self, table: str) -> bool:
        query = """
        SELECT name
        FROM sqlite_master
        WHERE type ='table'
        AND name NOT LIKE 'sqlite_%';
        """
        tables = self.con.execute(query).fetchall()
        return table in [t[0] for t in tables]  # fetchall() returns 1-tuples

    def _load_table_to_mem(self, table: str, sep: str = None) -> None:
        """
        Load a CSV into an in-memory SQLite database
        sep is set to None in order to force pandas to auto-detect the delimiter
        """
        if self._exists(table):
            return
        file_name = table + ".csv"
        path = os.path.join(self.table_dir_path, file_name)
        if not os.path.exists(path):
            raise ValueError(f"CSV table {table} does not exist in {self.table_dir_path}")
        df = pd.read_csv(path, sep=sep, engine="python")  # set engine to python to skip pandas' warning
        df.to_sql(table, self.con, if_exists='replace', index=False, chunksize=10000)

    def query(self, query: str) -> List[tuple]:
        """
        Run an SQL query on CSV file(s).
        Tables are loaded from table_dir_path
        """
        tables = extract_tables(query)
        for table in tables:
            self._load_table_to_mem(table)
        cursor = self.con.cursor()
        cursor.execute(query)
        records = cursor.fetchall()
        return records
extract_tables():
import sqlparse
from sqlparse.sql import IdentifierList, Identifier, Function
from sqlparse.tokens import Keyword, DML
from collections import namedtuple
import itertools

class Reference(namedtuple('Reference', ['schema', 'name', 'alias', 'is_function'])):
    __slots__ = ()

    def has_alias(self):
        return self.alias is not None

    @property
    def is_query_alias(self):
        return self.name is None and self.alias is not None

    @property
    def is_table_alias(self):
        return self.name is not None and self.alias is not None and not self.is_function

    @property
    def full_name(self):
        if self.schema is None:
            return self.name
        else:
            return self.schema + '.' + self.name


def _is_subselect(parsed):
    if not parsed.is_group:
        return False
    for item in parsed.tokens:
        if item.ttype is DML and item.value.upper() in ('SELECT', 'INSERT',
                                                        'UPDATE', 'CREATE', 'DELETE'):
            return True
    return False


def _identifier_is_function(identifier):
    return any(isinstance(t, Function) for t in identifier.tokens)


def _extract_from_part(parsed):
    tbl_prefix_seen = False
    for item in parsed.tokens:
        if item.is_group:
            for x in _extract_from_part(item):
                yield x
        if tbl_prefix_seen:
            if _is_subselect(item):
                for x in _extract_from_part(item):
                    yield x
            # An incomplete nested select won't be recognized correctly as a
            # sub-select. eg: 'SELECT * FROM (SELECT id FROM user'. This causes
            # the second FROM to trigger this elif condition resulting in a
            # StopIteration. So we need to ignore the keyword if the keyword
            # FROM.
            # Also 'SELECT * FROM abc JOIN def' will trigger this elif
            # condition. So we need to ignore the keyword JOIN and its variants
            # INNER JOIN, FULL OUTER JOIN, etc.
            elif item.ttype is Keyword and (
                    not item.value.upper() == 'FROM') and (
                    not item.value.upper().endswith('JOIN')):
                tbl_prefix_seen = False
            else:
                yield item
        elif item.ttype is Keyword or item.ttype is Keyword.DML:
            item_val = item.value.upper()
            if (item_val in ('COPY', 'FROM', 'INTO', 'UPDATE', 'TABLE') or
                    item_val.endswith('JOIN')):
                tbl_prefix_seen = True
        # 'SELECT a, FROM abc' will detect FROM as part of the column list.
        # So this check here is necessary.
        elif isinstance(item, IdentifierList):
            for identifier in item.get_identifiers():
                if (identifier.ttype is Keyword and
                        identifier.value.upper() == 'FROM'):
                    tbl_prefix_seen = True
                    break


def _extract_table_identifiers(token_stream):
    for item in token_stream:
        if isinstance(item, IdentifierList):
            for ident in item.get_identifiers():
                try:
                    alias = ident.get_alias()
                    schema_name = ident.get_parent_name()
                    real_name = ident.get_real_name()
                except AttributeError:
                    continue
                if real_name:
                    yield Reference(schema_name, real_name,
                                    alias, _identifier_is_function(ident))
        elif isinstance(item, Identifier):
            yield Reference(item.get_parent_name(), item.get_real_name(),
                            item.get_alias(), _identifier_is_function(item))
        elif isinstance(item, Function):
            yield Reference(item.get_parent_name(), item.get_real_name(),
                            item.get_alias(), _identifier_is_function(item))


def extract_tables(sql):
    # let's handle multiple statements in one sql string
    extracted_tables = []
    statements = list(sqlparse.parse(sql))
    for statement in statements:
        stream = _extract_from_part(statement)
        extracted_tables.append([ref.name for ref in _extract_table_identifiers(stream)])
    return list(itertools.chain(*extracted_tables))
Example (assuming account.csv and tojoin.csv exist in /path/to/files):
db_path = r"/path/to/files"
driver = CSVDriver(db_path)
query = """
SELECT tojoin.col_to_join
FROM account
LEFT JOIN tojoin
ON account.a = tojoin.a
"""
driver.query(query)
I've found that it can be necessary to break up the transfer of data from the csv to the database in chunks so as not to run out of memory. This can be done like this:
import csv
import sqlite3
from operator import itemgetter

# Establish connection
conn = sqlite3.connect("mydb.db")

# Create the table
conn.execute(
    """
    CREATE TABLE persons(
        person_id INTEGER,
        last_name TEXT,
        first_name TEXT,
        address TEXT
    )
    """
)

# These are the columns from the csv that we want
cols = ["person_id", "last_name", "first_name", "address"]

# If the csv file is huge, we instead add the data in chunks
chunksize = 10000

# Parse csv file and populate db in chunks
with conn, open("persons.csv") as f:
    reader = csv.DictReader(f)
    chunk = []
    for i, row in enumerate(reader):  # enumerate so we know when a chunk is full
        if i % chunksize == 0 and i > 0:
            conn.executemany(
                """
                INSERT INTO persons
                VALUES(?, ?, ?, ?)
                """, chunk
            )
            chunk = []
        items = itemgetter(*cols)(row)
        chunk.append(items)
    # insert whatever is left in the final, partial chunk
    if chunk:
        conn.executemany("INSERT INTO persons VALUES(?, ?, ?, ?)", chunk)
Here is my version; it works by asking you to select the '.csv' file you want to convert:
from multiprocessing import current_process
import pandas as pd
import sqlite3
import os
from tkinter import Tk
from tkinter.filedialog import askopenfilename
from pathlib import Path


def csv_to_db(csv_filedir):
    if not Path(csv_filedir).is_file():  # if needed ask for user input of CVS file
        current_path = os.getcwd()
        Tk().withdraw()
        csv_filedir = askopenfilename(initialdir=current_path)

    try:
        data = pd.read_csv(csv_filedir)  # load CSV file
    except:
        print("Something went wrong when opening to the file")
        print(csv_filedir)

    csv_df = pd.DataFrame(data)
    csv_df = csv_df.fillna('NULL')  # make NaN = to 'NULL' for SQL format

    [path, filename] = os.path.split(csv_filedir)  # define path and filename
    [filename, _] = os.path.splitext(filename)
    database_filedir = os.path.join(path, filename + '.db')

    conn = sqlite3.connect(database_filedir)  # connect to SQL server
    [fields_sql, header_sql_string] = create_sql_fields(csv_df)

    # CREATE EMPTY DATABASE
    create_sql = ''.join(['CREATE TABLE IF NOT EXISTS ' + filename + ' (' + fields_sql + ')'])
    cursor = conn.cursor()
    cursor.execute(create_sql)

    # INSERT EACH ROW IN THE SQL DATABASE
    for irow in csv_df.itertuples():
        insert_values_string = ''.join(['INSERT INTO ', filename, header_sql_string, ' VALUES ('])
        insert_sql = f"{insert_values_string} {irow[1]}, '{irow[2]}','{irow[3]}', {irow[4]}, '{irow[5]}' )"
        print(insert_sql)
        cursor.execute(insert_sql)

    # COMMIT CHANGES TO DATABASE AND CLOSE CONNECTION
    conn.commit()
    conn.close()

    print('\n' + csv_filedir + ' \n converted to \n' + database_filedir)

    return database_filedir


def create_sql_fields(df):  # gather the headers of the CSV and create two strings
    fields_sql = []    # str1 = var1 TYPE, var2 TYPE ...
    header_names = []  # str2 = var1, var2, var3, var4
    for col in range(0, len(df.columns)):
        fields_sql.append(df.columns[col])
        fields_sql.append(str(df.dtypes[col]))

        header_names.append(df.columns[col])
        if col != len(df.columns)-1:
            fields_sql.append(',')
            header_names.append(',')

    fields_sql = ' '.join(fields_sql)
    fields_sql = fields_sql.replace('int64', 'integer')
    fields_sql = fields_sql.replace('float64', 'integer')
    fields_sql = fields_sql.replace('object', 'text')

    header_sql_string = '(' + ''.join(header_names) + ')'

    return fields_sql, header_sql_string


csv_to_db('')

Python3 How to split a large text file into smaller files based on line content

I have a file with the data
# FULL_ID BJD MAG UNCERT FLAG
and nearly 12,000 rows. This table contains data for 32 objects, each identified by a unique FULL_ID. So for instance it may say
# FULL_ID BJD MAG UNCERT FLAG
2_543 3215.52 19.78 0.02937 OO
2_543 3215.84 19.42 0.02231 OO
3_522 3215.52 15.43 0.01122 OO
3_522 3222.22 16.12 0.01223 OO
What I want is to run this file BigData.dat through the code, and end up with multiple files e.g. 2_543.dat, 3_522.dat etc, each containing:
# BJD MAG UNCERT FLAG
for all rows of BigData.dat that belonged to that FULL_ID.
Currently I'm doing this:
with open(path, 'r') as BigFile:
    line = BigFile.readline()
    for line in BigFile:
        fields = line.split(None)
        id = fields[0]
        output = open(id+".dat", 'a')
        writeline = str(fields[1])+' '+str(fields[2])+' '+str(fields[3])+' '+str(fields[4])+'\n'
        output.write(writeline)
        output.close()
which does produce the correct outputs but they don't have the header line:
# BJD MAG UNCERT FLAG
How can I ensure this line is at the top of each file?
Opening a file is an expensive operation, and repeatedly doing so for each input line is not efficient. I would instead keep a mapping of seen FULL_ID values to a file object. If a FULL_ID is not present, then the file has to be opened in "w" mode and the header should be immediately added. This way:
the header is correctly written to the output files
if the script is run more than once, the old values in the output files are correctly erased
Code could be:
with open(path) as bigFile:
    outfiles = {}  # mapping FULL_ID -> output file
    header = ' '.join(['#'] + next(bigFile).split()[2:])  # compute output header
    for line in bigFile:
        row = line.split()
        try:
            output = outfiles[row[0]]
        except KeyError:
            output = open(f'{row[0]}.dat', 'w')
            print(header, file=output)
            outfiles[row[0]] = output
        print(' '.join(row[1:]), file=output)

for output in outfiles.values():  # close all files before exiting
    output.close()
The limit is that you have to keep all the files open until the end of the input file. It should work for 32 objects, but would break for larger numbers. The efficient way would be to change the simple dict into a more sophisticated cache, capable of closing an older file when capacity is exhausted and reopening it (in append mode) if needed.
Here is a possible cache implementation:
class FileCache:
    """Caches a number of open files referenced by string Ids.
    (by default the id is the name)"""

    def __init__(self, size, namemapping=None, header=None):
        """Creates a new cache of size size.
        namemapping is a function that gives the filename from an ID
        header is an optional header that will be written at creation
        time
        """
        self.size = size
        self.namemapping = namemapping if namemapping is not None \
            else lambda x: x
        self.header = header
        self.map = {}                        # dict id -> slot number
        self.slots = [(None, None)] * size   # list of pairs (id, file object)
        self.curslot = 0                     # next slot to be used

    def getFile(self, id):
        """Gets an open file from the cache.
        Directly gets it if it is already present, eventually reopen
        it in append mode. Adds it to the cache if absent and open it
        in truncate mode."""
        try:
            slot = self.map[id]
            if slot != -1:
                return self.slots[slot][1]   # found and active
            mode = 'a'                       # need re-opening
        except:
            mode = 'w'                       # new id: create file
        slot = self.curslot
        self.curslot = (slot + 1) % self.size
        if self.slots[slot][0] is not None:  # eventually close previous
            self.slots[slot][1].close()
            self.map[self.slots[slot][0]] = -1
        fd = open(self.namemapping(id), mode)
        # if file is new, write the optional header
        if (mode == 'w') and self.header is not None:
            print(self.header, file=fd)
        self.slots[slot] = (id, fd)
        self.map[id] = slot
        return fd

    def close(self):
        """Closes any cached file."""
        for i in self.slots:
            i[1].close()
            self.map[i[0]] = -1
        self.slots = [(None, None)] * self.size
Above code would become:
with open(path) as bigFile:
    header = ' '.join(['#'] + next(bigFile).split()[2:])  # compute output header
    outfiles = FileCache(10, lambda x: x+'.dat', header)  # cache FULL_ID -> file
    for line in bigFile:
        row = line.split()
        output = outfiles.getFile(row[0])
        print(' '.join(row[1:]), file=output)
    outfiles.close()  # close all files before exiting
You are overwriting the header line in the for loop, keep it in a separate variable. Additionally you could remember if the header was already written to a file:
path = 'big.dat'
header_written = []

with open(path, 'r') as BigFile:
    header = BigFile.readline()  # keep header separately!
    for line in BigFile:
        fields = line.split(None)
        _id = fields[0]
        output = open(_id+".dat", 'a')
        if _id not in header_written:  # check and save the ID to keep track if header was written
            output.write(header)
            header_written.append(_id)
        writeline = str(fields[1])+' '+str(fields[2])+' '+str(fields[3])+' '+str(fields[4])+'\n'
        output.write(writeline)
        output.close()
File:
# FULL_ID BJD MAG UNCERT FLAG
3215.52 19.78 0.02937 OO
3215.84 19.42 0.02231 OO

best way to check if files have been processed

E: My initial Title was very misleading.
I have a SQL Server with a database, and I have around 10,000 Excel files in a directory. The files contain values I need to copy into the DB, with new Excel files being added on a daily basis. Additionally, each file contains a field "finished" with a boolean value that expresses whether the file is ready to be copied to the DB. However, the filename is not connected to its content. Only the content of the file contains primary keys and field names corresponding to the DB's keys and field names.
Checking whether a file's content is already in the DB by comparing the primary key over and over is not feasible, since opening the files is far too slow. I could, however, check initially which files are already in the DB and write the result to a file (say copied.txt), so it simply holds the filenames of all already-copied files. The real service could then load this file's content into a dictionary (dict1) with the filename as the key and no value (I think hash tables are the fastest for comparison operations), then store the filenames of all existing Excel files in the directory in a second dictionary (dict2), compare both dictionaries, and create a list of all files that are in dict2 but not in dict1. I would then iterate through the list (it should usually only contain around 10-20 files), check whether the files are flagged as "ready to be copied", and copy the values to the database. Finally, I would add each file's name to dict1 and store it back to the copied.txt file.
My idea is to run this Python script as a service that loops as long as there are files to work with. When it can't find files to copy from, it should wait for x seconds (maybe 45), then do it all over.
This is my best concept so far. Is there a faster/more efficient way to do it?
It just came back to my mind that sets only contain unique elements and are thus the best data type for a comparison like this. It is a data type that I hardly knew, but now I can see how useful it can be.
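In practice the comparison boils down to a set difference; a tiny sketch of just that idea (file and directory names are illustrative, the full program follows below):

import glob

with open("copied.txt") as f:
    done = {line.strip() for line in f}  # filenames already copied to the DB
on_disk = set(glob.glob("c:\\myXlFiles\\**\\*.xlsm", recursive=True))
to_process = on_disk - done  # only the files not seen before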
The part of the code that is related to my original question is in Part 1-3:
The program:
1. loads file names from a file into a set
2. loads file names from the filesystem/a certain dir + subdirs into a set
3. creates a list of the difference of the two sets
4. iterates through all remaining files:
checks if they have been flagged as "finalized",
then for each row:
creates a new record in the database
and adds values to the given record (one by one)
5. adds the processed file's name to the file of filenames.
It does so every 5 minutes. This is completely fine for my purpose.
I am very new to coding so sorry for my dilettantish approach. At least it works so far.
# modules
import pandas as pd
import pyodbc as db
import xlwings as xw
import glob
import os
from datetime import datetime, date
from pathlib import Path
import time
import sys

# constants
tick_time_seconds = 300
line = ("################################################################################### \n")
pathTodo = "c:\\myXlFiles\\**\\*"
pathDone = ("c:\\Done\\")
pathError = ("c:\\Error\\")
sqlServer = "MyMachine\\MySQLServer"
sqlDriver = "{SQL Server}"
sqlDatabase = "master"
sqlUID = "SA"
sqlPWD = "PWD"


# functions
def get_list_of_files_by_extension(path: str, extension: str) -> list:
    """Receives string path and extension;
    gets list of files with corresponding extension in path;
    returns list of files with full path."""
    fileList = glob.glob(path+extension, recursive=True)
    if not fileList:
        print("no found files")
    else:
        print("found files")
    return fileList


def write_error_to_log(description: str, errorString: str, optDetails=""):
    """Receives strings description, errorString and opt(ional)Details;
    writes the error with date and time in a logfile named after the current date;
    returns nothing."""
    logFileName = str(date.today())+".txt"
    optDetails = optDetails+"\n"
    dateTimeNow = datetime.now()
    newError = "{0}\n{1}\n{2}{3}\n".format(line, str(dateTimeNow), optDetails, errorString)
    print(newError)
    with open(Path(pathError, logFileName), "a") as logFile:
        logFile.write(newError)


def sql_connector():
    """sql_connector: Receives nothing;
    creates a connection to the sql server (connection details should be constants);
    returns a connection."""
    return db.connect("DRIVER="+sqlDriver+"; \
                       SERVER="+sqlServer+"; \
                       DATABASE="+sqlDatabase+"; \
                       UID="+sqlUID+"; \
                       PWD="+sqlPWD+";")


def sql_update_builder(dbField: str, dbValue: str, dbKey: str) -> str:
    """sql_update_builder: takes strings dbField, dbValue and dbKey;
    creates a sql syntax command with the purpose to update the value of the
    corresponding field with the corresponding key;
    returns a string with a sql command."""
    return "\
        UPDATE [tbl_Main] \
        SET ["+dbField+"]='"+dbValue+"' \
        WHERE ((([tbl_Main].MyKey)="+dbKey+"));"


def sql_insert_builder(dbKey: str) -> str:
    """sql_insert_builder: takes string dbKey;
    creates a sql syntax command with the purpose to create a new record;
    returns a string with a sql command."""
    return "\
        INSERT INTO [tbl_Main] ([MyKey])\
        VALUES ("+dbKey+")"


def append_filename_to_fileNameFile(xlFilename):
    """Receives anything xlFilename;
    converts it to string and writes the filename (full path) to a file;
    returns nothing."""
    with open(Path(pathDone, "filesDone.txt"), "a") as logFile:
        logFile.write(str(xlFilename)+"\n")
###################################################################################
###################################################################################
# main loop
while __name__ == "__main__":

    ###################################################################################
    """ 1. load filesDone.txt into set"""
    listDone = []
    print(line+"reading filesDone.txt in "+pathDone)
    try:
        with open(Path(pathDone, "filesDone.txt"), "r") as filesDoneFile:
            if filesDoneFile:
                print("file contains entries")
                for filePath in filesDoneFile:
                    filePath = filePath.replace("\n", "")
                    listDone.append(Path(filePath))
    except Exception as err:
        errorDescription = "failed to read filesDone.txt from {0}".format(pathDone)
        write_error_to_log(description=errorDescription, errorString=str(err))
        continue
    else:
        setDone = set(listDone)

    ###################################################################################
    """ 2. load filenames of all .xlsm files into set"""
    print(line+"trying to get list of files in filesystem...")
    try:
        listFileSystem = get_list_of_files_by_extension(path=pathTodo, extension=".xlsm")
    except Exception as err:
        errorDescription = "failed to read file system "
        write_error_to_log(description=errorDescription, errorString=str(err))
        continue
    else:
        listFiles = []
        for filename in listFileSystem:
            listFiles.append(Path(filename))
        setFiles = set(listFiles)

    ###################################################################################
    """ 3. create list of difference of setMatchingFiles and setDone"""
    print(line+"trying to compare done files and files in filesystem...")
    setDifference = setFiles.difference(setDone)

    ###################################################################################
    """ 4. iterate thru list of files """
    for filename in setDifference:
        """ 4.1 try: look if file is marked as "finalized=True";
            if the xlfile does not have sheet 7 (old ones)
            just add the xlfilename to the xlfilenameFile"""
        try:
            print("{0}trying to read finalized state ... of {1}".format(line, filename))
            filenameClean = str(filename).replace("\n", "")
            xlFile = pd.ExcelFile(filenameClean)
        except Exception as err:
            errorDescription = "failed to read finalized-state from {0} to dataframe".format(filename)
            write_error_to_log(description=errorDescription, errorString=str(err))
            continue
        else:
            if "finalized" in xlFile.sheet_names:
                dataframe = xlFile.parse("finalized")
                print("finalized state ="+str(dataframe.iloc[0]["finalized"]))
                if dataframe.iloc[0]["finalized"] == False:
                    continue
            else:
                append_filename_to_fileNameFile(filename)  # add the xlfilename to the xlfilenameFile
                continue

        ###################################################################################
        """ 4.2 try: read values to dataframe"""
        try:
            dataframe = pd.read_excel(Path(filename), sheet_name=4)
        except Exception as err:
            errorDescription = "Failed to read values from {0} to dataframe".format(filename)
            write_error_to_log(description=errorDescription, errorString=str(err))
            continue

        ###################################################################################
        """ 4.2 try: open connection to database"""
        print("{0}Trying to open connection to database {1} on {2}".format(line, sqlDatabase, sqlServer))
        try:
            sql_connection = sql_connector()  # create connection to server
            stuff = sql_connection.cursor()
        except Exception as err:
            write_error_to_log(description="Failed to open connection:", errorString=str(err))
            continue

        ###################################################################################
        """ 4.3 try: write to database"""
        headers = list(dataframe)            # copy header from dataframe to list; easier to iterate
        values = dataframe.values.tolist()   # copy values from dataframe to list of lists [[row1][row2]...]; easier to iterate
        for row in range(len(values)):       # iterate over lines
            dbKey = str(values[row][0])      # first col is key
            sqlCommandString = sql_insert_builder(dbKey=dbKey)
            """ 4.3.1 first trying to create (aka insert) new record in db ..."""
            try:
                print("{0}Trying insert new record with the id {1}".format(line, dbKey))
                stuff.execute(sqlCommandString)
                sql_connection.commit()
                print(sqlCommandString)
            except Exception as err:
                sql_log_string = " ".join(sqlCommandString.split())  # get rid of whitespace in sql command
                write_error_to_log(description="Failed to create new record in DB:", errorString=str(err), optDetails=sql_log_string)
            else:  # if record was created add the values one by one:
                print("{0}Trying to add values to record with the ID {1}".format(line, dbKey))
                """ 4.3.2 ... then trying to add the values one by one"""
                for col in range(1, len(headers)):  # skip col 0 (the key)
                    dbField = str(headers[col])     # field in db is header in the excel sheet
                    dbValue = str(values[row][col])  # get the corresponding value
                    dbValue = (dbValue.replace("\"", "")).replace("\'", "")  # getting rid of ' and " to prevent trouble with the sql command
                    sqlCommandString = sql_update_builder(dbField, dbValue, dbKey)  # calling function to create a sql update command string
                    try:  # try to commit the sql command
                        stuff.execute(sqlCommandString)
                        sql_connection.commit()
                        print(sqlCommandString)
                    except Exception as err:
                        sql_log_string = " ".join(sqlCommandString.split())  # get rid of whitespace in sql command
                        write_error_to_log(description="Failed to add values in DB:", errorString=str(err), optDetails=sql_log_string)
        append_filename_to_fileNameFile(filename)
        print(line)

    # wait for a certain amount of time
    for i in range(tick_time_seconds, 0, -1):
        sys.stdout.write("\r" + str(i))
        sys.stdout.flush()
        time.sleep(1)
    sys.stdout.flush()
    print(line)
    # break  # this is for debugging

How to insert csv files in mysql db through multiprocessing?

import pymysql
import time
import csv,os
from multiprocessing import Pool
start = time.time()
db = pymysql.connect(host="localhost",user="root",passwd="root",db="dummydb")
mycursor = db.cursor()
Here I am taking the input csv file:
csv_file = raw_input("Please enter absolute path for the input File (Just file name, to be precise), this script can process csv files only\n")
save_path = './'
input_csv = csv.reader(open(csv_file,'rU'), delimiter=",")
headers = input_csv.next()
for row in input_csv:
    validity_start = "2018-03-03 00:00:00"
    firstName = row[0]
    SMC = row[3]
    lastName = ""
    countrycode = "IN"
    validity_end = row[6]
    mobile = row[2]
    state = row[4]
    city = row[5]
    mycursor.execute("INSERT INTO dummydb.thirdparty_subscriber_o(first_name,last_name,country_code,validity_start_date,validity_end_date,mobile) VALUES ('"+str(firstName)+"','"+str(lastName)+"','"+str(countrycode)+"','"+str(validity_start)+"','"+str(validity_end)+"','"+str(mobile)+"')")
    db.commit()
    print "Inserted"

end = time.time()
print end-start
I want to implement multiprocessing Pool module to read and insert the file faster than usual.
Let's assume your csv contains N records. Read the csv and split the records into M chunks, each of size S.
Now you have M lists, each of size S. Create a process pool and pass the lists to the pool. The 'func' is the function that does the INSERT into the DB.
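A minimal sketch of that approach, reusing the table and column mapping from the question (the chunk size, worker count, and input file name are assumptions); each worker opens its own connection, since a connection cannot be shared across processes:

import csv
import pymysql
from multiprocessing import Pool

CHUNK_SIZE = 5000  # S: rows per chunk, tune to taste

def insert_chunk(rows):
    # Worker: open a private connection and bulk-insert one chunk
    db = pymysql.connect(host="localhost", user="root", passwd="root", db="dummydb")
    try:
        cur = db.cursor()
        cur.executemany(
            "INSERT INTO dummydb.thirdparty_subscriber_o "
            "(first_name, last_name, country_code, validity_start_date, validity_end_date, mobile) "
            "VALUES (%s, %s, %s, %s, %s, %s)", rows)
        db.commit()
    finally:
        db.close()
    return len(rows)

if __name__ == "__main__":
    with open("input.csv") as f:  # hypothetical input file
        reader = csv.reader(f)
        next(reader)  # skip the header row
        records = [(row[0], "", "IN", "2018-03-03 00:00:00", row[6], row[2])
                   for row in reader]
    # M chunks of size S
    chunks = [records[i:i + CHUNK_SIZE] for i in range(0, len(records), CHUNK_SIZE)]
    pool = Pool(processes=4)
    inserted = sum(pool.map(insert_chunk, chunks))
    pool.close()
    pool.join()
    print(inserted, "rows inserted")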

Python is reading rows from a CSV file out of order on Windows 7 corporate laptop - what could be the issue?

I have a Python script that is creating a CSV file named 'reader.csv' with queried data from Salesforce for staging purposes. It then stages the data in 'reader.csv', reads it back row by row, and writes each row to our output file. For some context, it needs to use a staging file to manipulate the array indexes in each row based off of certain values to place them elsewhere in the output file (grabbing the Mortgage Status Date values and placing that same date in accordance with the column headers).
On a Windows 7 corporate machine, the script is reading the array from each row in the 'reader.csv' file out of order... which means it's writing to the output file out of order. It has been tested on several Macs, a newer Windows 10 machine, and even a Raspberry Pi running an old Linux distro... and it all works perfectly. Yet, when we run the script on a Windows 7 machine, it reads the data from the staging file out of order and then writes it out of order in the output file. Here is my code that extracts the data and then writes it to the corresponding CSV files (alongside the transform method I'm using to manipulate the dictionary):
def sf_extract(sf_object, query):
    """ Queries Salesforce objects and returns the data in an
    OrderedDict """
    # Define the job and object, then query the data
    job = sf.create_query_job(sf_object, contentType='CSV')
    batch = sf.query(job, query)
    sf.close_job(job)
    # Waits to make sure the query has been fetched
    while not sf.is_batch_done(batch):
        time.sleep(10)
    # Decode the results
    for result in sf.get_all_results_for_query_batch(batch):
        data = unicodecsv.DictReader(result, encoding='utf-8')
        return data


def csv_writer(sf_object, data):
    """ Creates a 'Reader' file, transforms the data, then writes
    the transformed data to an output file """
    file_name = "contact_product_data " + str(year_month_day) + ".csv"
    # Opens a CSV file
    with open("reader.csv", 'a+', encoding='utf-8', newline='') as in_file:
        # Grabs each item (record dictionary) from the data and loops through it
        for item in data:
            # Logic to get rid of records that are not owned by Loan Officer's
            if item["Owner.Id"] in users_list:
                item = data_transform(sf_object, item)
                writer = csv.DictWriter(in_file, item.keys())
                writer.writerow(item)
    with open("reader.csv", 'r+', encoding='utf-8', newline='') as in_file, open(file_name, 'w+', encoding='utf-8', newline='') as out_file:
        reader = csv.reader(in_file, delimiter=',')
        writer = csv.writer(out_file)
        writer.writerow(config.csv_header)
        for row in reader:
            print(row)
            try:
                status = row[28]
                date = row[29]
                if str(status) in config.milestone_keys:
                    row[config.milestone_index[status]] = str(date)
            except:
                writer.writerow(row)
            else:
                writer.writerow(row)


def data_transform(sf_object, item):
    """ Take the results of a query job and transform the data (append
    a "Contact Type" header and value, depending on the Salesforce
    object). """
    item = OrderedDict(item)
    if sf_object == "Bank_Account__c":
        item["Mortgage_Status_Date__c"] = (item["Mortgage_Status_Date__c"])[:-14]
        item.update({'Contact Type':'Client'})
        item.move_to_end('Contact Type', last=False)
        item.update({'Lead Source':''})
        item.move_to_end('Lead Source', last=False)
        item.update({'App Received':''})
        item.update({'Submitted to Underwriting':''})
        item.update({'Initial/Conditional Approval':''})
        item.update({'Clear to Close':''})
        item.update({'Funded':''})
        item.update({'Denied':''})
        item.update({'Withdrawn':''})
        del item["Owner.Id"]
        if item["Opportunity__r.LeadID__c"]:
            item["Primary_Customer__r.Id"] = item["Opportunity__r.LeadID__c"]
        del item["Opportunity__r.LeadID__c"]
    if sf_object == "Lead":
        item.update({'Contact Type':'Lead'})
        item.move_to_end('Contact Type', last=False)
        item.update({'Lead Source':item["LeadSource"]})
        item.move_to_end('Lead Source', last=False)
        del item["LeadSource"]
        del item["Owner.Id"]
    if sf_object == "Opportunity":
        item.update({'Contact Type':'Lead'})
        item.move_to_end('Contact Type', last=False)
        item.update({'Lead Source':item["LeadSource"]})
        item.move_to_end('Lead Source', last=False)
        del item["LeadSource"]
        del item["Owner.Id"]
        if item["LeadID__c"]:
            item["Account.Id"] = item["LeadID__c"]
        del item["LeadID__c"]
    if sf_object == "Contact":
        item.update({'Contact Type':'Referral Source'})
        item.move_to_end('Contact Type', last=False)
        item.update({'Lead Source':''})
        item.move_to_end('Lead Source', last=False)
        del item["Owner.Id"]
    return item
For some reason, it's just not working on this Windows machine and I cannot figure out why. Is it the csv.reader() that's taking in the array for each row out of order? I need help, and quick.
